From f5a87a4ab02bc15dbd07c366d916c7350a9cad29 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 17 Jan 2023 20:00:29 -0800 Subject: [PATCH 01/76] WIP --- .../src/BucketizedBufferAllocator.cpp | 36 +- .../src/BucketizedBufferAllocator.h | 22 +- .../src/DmlBufferRegion.cc | 120 +++++++ .../src/DmlBufferRegion.h | 82 +++++ .../src/DmlCommandRecorder.cpp | 28 +- .../src/DmlCommittedResourceWrapper.h | 4 +- .../src/DmlHeapAllocator.cpp | 317 ++++++++++++++++++ .../src/DmlHeapAllocator.h | 134 ++++++++ .../src/DmlReservedResourceWrapper.h | 21 ++ .../src/DmlResourceWrapper.h | 4 +- .../src/DmlTaggedPointer.cpp | 33 ++ .../src/DmlTaggedPointer.h | 34 ++ .../src/ExecutionProvider.cpp | 98 ++++-- .../DmlExecutionProvider/src/ReadbackHeap.cpp | 10 +- .../DmlExecutionProvider/src/ReadbackHeap.h | 4 +- 15 files changed, 878 insertions(+), 69 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 588c4ac391023..08bffae6e8a5b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -37,10 +37,6 @@ namespace Dml 
BucketizedBufferAllocator::BucketizedBufferAllocator( ID3D12Device* device, std::shared_ptr context, - const D3D12_HEAP_PROPERTIES& heapProps, - D3D12_HEAP_FLAGS heapFlags, - D3D12_RESOURCE_FLAGS resourceFlags, - D3D12_RESOURCE_STATES initialState, std::unique_ptr&& subAllocator ) : onnxruntime::IAllocator( @@ -51,10 +47,6 @@ namespace Dml ) ), m_device(device), - m_heapProperties(heapProps), - m_heapFlags(heapFlags), - m_resourceFlags(resourceFlags), - m_initialState(initialState), m_context(context), m_subAllocator(std::move(subAllocator)) { @@ -133,7 +125,7 @@ namespace Dml resourceId = ++m_currentResourceId; } - assert(resourceWrapper->GetD3D12Resource()->GetDesc().Width == bucketSize); + assert(resourceWrapper->GetResourceInUavState()->GetDesc().Width == bucketSize); assert(resourceWrapper != nullptr); ComPtr allocInfo = wil::MakeOrThrow( @@ -174,7 +166,7 @@ namespace Dml // Free the resource to the pool if its size matches a bucket size gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize()); - if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResource()->GetDesc().Width) + if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResourceInUavState()->GetDesc().Width) { assert(gsl::narrow_cast(m_pool.size()) > bucketIndex); @@ -188,9 +180,29 @@ namespace Dml { // Free the underlying allocation once queued work has completed. 
#ifdef _GAMING_XBOX - m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResource()).Get()); + m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInUavState()).Get()); + + if (allocInfo->GetResourceInCopySrcState() != nullptr) + { + m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInCopySrcState()).Get()); + } + + if (allocInfo->GetResourceInCopyDstState() != nullptr) + { + m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInCopyDstState()).Get()); + } #else - m_context->QueueReference(allocInfo->GetResource()); + m_context->QueueReference(allocInfo->GetResourceInUavState()); + + if (allocInfo->GetResourceInCopySrcState() != nullptr) + { + m_context->QueueReference(allocInfo->GetResourceInCopySrcState()); + } + + if (allocInfo->GetResourceInCopyDstState() != nullptr) + { + m_context->QueueReference(allocInfo->GetResourceInCopyDstState()); + } #endif allocInfo->DetachResourceWrapper(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 7e3471e276c0d..3d95bd029aad8 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -46,9 +46,19 @@ namespace Dml return m_owner; } - ID3D12Resource* GetResource() const + ID3D12Resource* GetResourceInUavState() const { - return m_resourceWrapper->GetD3D12Resource(); + return m_resourceWrapper->GetResourceInUavState(); + } + + ID3D12Resource* GetResourceInCopySrcState() const + { + return m_resourceWrapper->GetResourceInCopySrcState(); + } + + ID3D12Resource* GetResourceInCopyDstState() const + { + return m_resourceWrapper->GetResourceInCopyDstState(); } ComPtr DetachResourceWrapper() const @@ -95,10 +105,6 @@ namespace Dml BucketizedBufferAllocator( ID3D12Device* device, std::shared_ptr 
context, - const D3D12_HEAP_PROPERTIES& heapProps, - D3D12_HEAP_FLAGS heapFlags, - D3D12_RESOURCE_FLAGS resourceFlags, - D3D12_RESOURCE_STATES initialState, std::unique_ptr&& subAllocator); // Returns the information associated with an opaque allocation handle returned by IAllocator::Alloc. @@ -141,10 +147,6 @@ namespace Dml void FreeResource(void* p, uint64_t resourceId); ComPtr m_device; - D3D12_HEAP_PROPERTIES m_heapProperties; - D3D12_HEAP_FLAGS m_heapFlags; - D3D12_RESOURCE_FLAGS m_resourceFlags; - D3D12_RESOURCE_STATES m_initialState; std::vector m_pool; size_t m_currentAllocationId = 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc new file mode 100644 index 0000000000000..8d6fbd0551083 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc @@ -0,0 +1,120 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "precomp.h" +#include "DmlBufferRegion.h" + +namespace Dml +{ + +D3D12BufferRegion::D3D12BufferRegion( + uint64_t offset, + uint64_t size_in_bytes, + ID3D12Resource* resource_uav_state, + ID3D12Resource* resource_copy_src_state, + ID3D12Resource* resource_copy_dst_state) + : resource_uav_state_(resource_uav_state), + resource_copy_src_state_(resource_copy_src_state), + resource_copy_dst_state_(resource_copy_dst_state), + offset_(offset), + size_in_bytes_(size_in_bytes) +{ + // Get a raw pointer to the first non-null resource passed in. At least one + // resource must be provided. + first_valid_resource_ = resource_uav_state_; + if (!first_valid_resource_) + { + first_valid_resource_ = resource_copy_src_state_; + } + if (!first_valid_resource_) + { + first_valid_resource_ = resource_copy_dst_state_; + } + ORT_THROW_HR_IF(E_UNEXPECTED, first_valid_resource_ == nullptr); + + // Regions cannot be empty. 
+ ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes_ == 0); + + // Regions cannot extend beyond the size of the resource. + uint64_t buffer_size = first_valid_resource_->GetDesc().Width; + ORT_THROW_HR_IF(E_UNEXPECTED, offset_ >= buffer_size); + ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes_ > buffer_size - offset); + + // All three resources, if provided, must be identical aside from state. + assert( + first_valid_resource_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER); + assert( + !resource_uav_state || + (resource_uav_state->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_uav_state->GetDesc().Width == buffer_size)); + assert( + !resource_copy_src_state_ || + (resource_copy_src_state_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_copy_src_state_->GetDesc().Width == buffer_size)); + assert( + !resource_copy_dst_state_ || + (resource_copy_dst_state_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_copy_dst_state_->GetDesc().Width == buffer_size)); +} + +D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) +{ + std::swap(this->resource_uav_state_, that.resource_uav_state_); + std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); + std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); + std::swap(this->offset_, that.offset_); + std::swap(this->size_in_bytes_, that.size_in_bytes_); + std::swap(this->first_valid_resource_, that.first_valid_resource_); +} + +D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) +{ + std::swap(this->resource_uav_state_, that.resource_uav_state_); + std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); + std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); + std::swap(this->offset_, that.offset_); + std::swap(this->size_in_bytes_, that.size_in_bytes_); + std::swap(this->first_valid_resource_, that.first_valid_resource_); + return *this; +} + 
+ID3D12Resource* D3D12BufferRegion::GetResourceInUavState() const +{ + return resource_uav_state_; +} + +ID3D12Resource* D3D12BufferRegion::GetResourceInCopySrcState() const +{ + return resource_copy_src_state_; +} + +ID3D12Resource* D3D12BufferRegion::GetResourceInCopyDstState() const +{ + return resource_copy_dst_state_; +} + +uint64_t D3D12BufferRegion::Offset() const +{ + return first_valid_resource_ ? offset_ : 0; +} + +uint64_t D3D12BufferRegion::SizeInBytes() const +{ + return first_valid_resource_ ? size_in_bytes_ : 0; +} + +DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const +{ + if (!resource_uav_state_) + { + return DML_BUFFER_BINDING{}; + } + + return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_}; +} + +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h new file mode 100644 index 0000000000000..f8c1033261c56 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h @@ -0,0 +1,82 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +namespace Dml +{ + +class D3D12HeapAllocator; + +// Represents a region of a D3D12 buffer resource. A buffer region has an +// underlying ID3D12Resource* (of D3D12_RESOURCE_DIMENSION_BUFFER), an offset in +// bytes from the beginning of that buffer, and a size in bytes of the region. +class D3D12BufferRegion +{ + public: + D3D12BufferRegion() = default; + + // References a region of a buffer. The respective ID3D12Resource objects + // must be in the appropriate states. Each resource is optional, but if more + // than one are provided they must map to the same region of memory. 
+ D3D12BufferRegion( + uint64_t offset, + uint64_t size_in_bytes, + ID3D12Resource* resource_uav_state, + ID3D12Resource* resource_copy_src_state, + ID3D12Resource* resource_copy_dst_state); + + // Move-only + D3D12BufferRegion(const D3D12BufferRegion&) = delete; + D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete; + D3D12BufferRegion(D3D12BufferRegion&&); + D3D12BufferRegion& operator=(D3D12BufferRegion&&); + + ID3D12Resource* GetResourceInUavState() const; + + // NOTE: may be any state that is valid as a copy source (COPY_SRC, + // GENERIC_READ, or COMMON). + ID3D12Resource* GetResourceInCopySrcState() const; + + ID3D12Resource* GetResourceInCopyDstState() const; + + uint64_t Offset() const; + uint64_t SizeInBytes() const; + + DML_BUFFER_BINDING GetBufferBinding() const; + + explicit operator bool() const { return first_valid_resource_ != nullptr; } + + // Creates a subregion at an offset from the start of this region. If no + // size is provided the region runs to the end of the current region. + inline D3D12BufferRegion Subregion( + uint64_t offset, + uint64_t size_in_bytes = 0) const + { + // start of subregion must be within current region + ORT_THROW_HR_IF(E_UNEXPECTED, offset >= size_in_bytes_); + size_in_bytes = + size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; + // end of subregion must be within current region + ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes > size_in_bytes_ - offset); + + return D3D12BufferRegion( + offset_ + offset, + size_in_bytes, + resource_uav_state_, + resource_copy_src_state_, + resource_copy_dst_state_); + } + + private: + ID3D12Resource* resource_uav_state_ = nullptr; + ID3D12Resource* resource_copy_src_state_ = nullptr; + ID3D12Resource* resource_copy_dst_state_ = nullptr; + uint64_t offset_ = 0; + uint64_t size_in_bytes_ = 0; + + // Pointer to the first resource above that isn't null. 
+ ID3D12Resource* first_valid_resource_ = nullptr; +}; + +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index 59ceecdc884d2..bd6a5c6b7aa17 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -10,7 +10,7 @@ using namespace Dml; DmlCommandRecorder::DmlCommandRecorder( ID3D12Device* d3dDevice, - IDMLDevice* dmlDevice, + IDMLDevice* dmlDevice, std::shared_ptr commandQueue) : m_queue(std::move(commandQueue)), m_d3dDevice(d3dDevice), @@ -67,7 +67,7 @@ void DmlCommandRecorder::InitializeOperator( ORT_THROW_HR(E_OUTOFMEMORY); } - ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResource(); + ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResourceInUavState(); allocator->Free(tempResourceHandle); // Bind the temporary resource. @@ -143,7 +143,7 @@ void DmlCommandRecorder::ExecuteOperator( ORT_THROW_HR(E_OUTOFMEMORY); } - ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResource(); + ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResourceInUavState(); allocator->Free(tempResourceHandle); // Bind the temporary resource. 
@@ -183,7 +183,7 @@ void DmlCommandRecorder::CopyBufferRegion( m_currentCommandList->CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount); m_operationsRecordedInCurrentCommandList = true; } - + void DmlCommandRecorder::FillBufferWithPattern( ID3D12Resource* dstBuffer, gsl::span value /* Data type agnostic value, treated as raw bits */) @@ -250,11 +250,11 @@ void DmlCommandRecorder::ExecuteCommandList( _Outptr_ ID3D12Fence** fence, _Out_ uint64_t* completionValue ) -{ +{ ORT_THROW_IF_FAILED(m_currentCommandList->Close()); if (m_operationsRecordedInCurrentCommandList) - { + { m_pendingCommandLists.push_back(m_currentCommandList.Get()); m_pendingCommandListsCacheable.push_back(true); } @@ -290,16 +290,16 @@ void DmlCommandRecorder::ExecuteCommandList( } ComPtr DmlCommandRecorder::GetCommandList() -{ +{ // Assume operations are added by the caller after this returns - m_operationsRecordedInCurrentCommandList = true; - return m_currentCommandList; + m_operationsRecordedInCurrentCommandList = true; + return m_currentCommandList; } void DmlCommandRecorder::ResourceBarrier(gsl::span barriers) { m_currentCommandList->ResourceBarrier(gsl::narrow_cast(barriers.size()), barriers.data()); - m_operationsRecordedInCurrentCommandList = true; + m_operationsRecordedInCurrentCommandList = true; } void DmlCommandRecorder::AddUAVBarrier() @@ -307,7 +307,7 @@ void DmlCommandRecorder::AddUAVBarrier() #pragma warning(suppress: 6387) auto barrier = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); m_currentCommandList->ResourceBarrier(1, &barrier); - m_operationsRecordedInCurrentCommandList = true; + m_operationsRecordedInCurrentCommandList = true; } void DmlCommandRecorder::Open() @@ -323,7 +323,7 @@ void DmlCommandRecorder::Open() m_queue->GetType(), allocator, nullptr, - IID_GRAPHICS_PPV_ARGS(m_currentCommandList.ReleaseAndGetAddressOf()))); + IID_GRAPHICS_PPV_ARGS(m_currentCommandList.ReleaseAndGetAddressOf()))); } else { @@ -338,7 +338,7 @@ void 
DmlCommandRecorder::CloseAndExecute() ORT_THROW_IF_FAILED(m_currentCommandList->Close()); if (m_operationsRecordedInCurrentCommandList) - { + { m_pendingCommandLists.push_back(m_currentCommandList.Get()); m_pendingCommandListsCacheable.push_back(true); } @@ -386,4 +386,4 @@ void DmlCommandRecorder::SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap) ID3D12DescriptorHeap* descriptorHeaps[] = { descriptorHeap }; m_currentCommandList->SetDescriptorHeaps(ARRAYSIZE(descriptorHeaps), descriptorHeaps); } -} \ No newline at end of file +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h index cae206b569170..e86ca4b52b4f2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h @@ -9,7 +9,9 @@ namespace Dml { public: DmlCommittedResourceWrapper(ComPtr&& d3d12Resource) : m_d3d12Resource(std::move(d3d12Resource)) {} - ID3D12Resource* GetD3D12Resource() const final { return m_d3d12Resource.Get(); } + ID3D12Resource* GetResourceInUavState() const final { return m_d3d12Resource.Get(); } + ID3D12Resource* GetResourceInCopySrcState() const final { return nullptr; } + ID3D12Resource* GetResourceInCopyDstState() const final { return nullptr; } private: ComPtr m_d3d12Resource; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp new file mode 100644 index 0000000000000..f56312b8ea2cf --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp @@ -0,0 +1,317 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "precomp.h" +#include "DmlHeapAllocator.h" +#include "DmlTaggedPointer.h" +#include "DmlBufferRegion.h" +#include "DmlReservedResourceWrapper.h" + +namespace Dml +{ + +static bool GetTilingEnabled(ID3D12Device* device) +{ + D3D12_FEATURE_DATA_D3D12_OPTIONS options = {}; + if (SUCCEEDED(device->CheckFeatureSupport( + D3D12_FEATURE_D3D12_OPTIONS, + &options, + sizeof(options)))) + { + return options.TiledResourcesTier >= D3D12_TILED_RESOURCES_TIER_1; + } + + return false; +} + +static uint64_t GetMaxHeapSizeInTiles() +{ + return D3D12HeapAllocator::kDefaultMaxHeapSizeInTiles; +} + +D3D12HeapAllocator::D3D12HeapAllocator( + ID3D12Device* device, + ID3D12CommandQueue* queue, + const D3D12_HEAP_PROPERTIES& heap_props, + D3D12_HEAP_FLAGS heap_flags, + D3D12_RESOURCE_FLAGS resource_flags, + D3D12_RESOURCE_STATES initial_state) + : device_(device), + queue_(queue), + heap_properties_(heap_props), + heap_flags_(heap_flags), + resource_flags_(resource_flags), + initial_state_(initial_state), + tiling_enabled_(GetTilingEnabled(device)), + max_heap_size_in_tiles_(GetMaxHeapSizeInTiles()) +{ +} + +absl::optional D3D12HeapAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes) +{ + Allocation allocation = {}; + + // The allocation may be larger than the requested size to ensure a whole + // number of tiles. 
+ const uint64_t resource_size_in_tiles = + 1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + const uint64_t resource_size_in_bytes = + resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + auto resource_desc = + CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_); + + ID3D12Resource** resources[] = { + &allocation.resource_uav_state, + &allocation.resource_copy_src_state, + &allocation.resource_copy_dst_state}; + + D3D12_RESOURCE_STATES states[] = { + initial_state_, + D3D12_RESOURCE_STATE_COPY_SOURCE, + D3D12_RESOURCE_STATE_COPY_DEST}; + + for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++) + { + HRESULT create_resource_hr = device_->CreateReservedResource( + &resource_desc, + states[i], + nullptr, + IID_PPV_ARGS(resources[i])); + + if (create_resource_hr == E_OUTOFMEMORY) + { + return absl::nullopt; + } + ORT_THROW_IF_FAILED(create_resource_hr); + } + + // Reserve enough heaps to store all tiles in the resource. + const uint64_t heap_count = + 1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_; + allocation.heaps.resize(heap_count); + + // Create heaps and map them to the primary reserved resource. + D3D12_TILED_RESOURCE_COORDINATE resource_region_start_coordinates = {}; + uint64_t unmapped_resource_tiles = resource_size_in_tiles; + for (uint64_t i = 0; i < heap_count; i++) + { + // Create heap. The last heap of the allocation may have fewer tiles to + // avoid wasting space. 
+ uint64_t heap_size_in_tiles = std::min( + unmapped_resource_tiles, + max_heap_size_in_tiles_); + uint64_t heap_size_in_bytes = + heap_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + auto heap_desc = CD3DX12_HEAP_DESC( + heap_size_in_bytes, + heap_properties_, + 0, + heap_flags_); + + HRESULT create_heap_hr = + device_->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i])); + if (create_heap_hr == E_OUTOFMEMORY) + { + return absl::nullopt; + } + ORT_THROW_IF_FAILED(create_heap_hr); + + // Source region in the resource to map. + D3D12_TILE_REGION_SIZE resource_region_size = {}; + resource_region_size.NumTiles = static_cast(heap_size_in_tiles); + + // Target range in the current heap to map. + const D3D12_TILE_RANGE_FLAGS tile_range_flags = + D3D12_TILE_RANGE_FLAG_NONE; + const uint32_t heap_range_start_offset = 0; + const uint32_t heap_range_tile_count = static_cast(heap_size_in_tiles); + + constexpr uint32_t numResourceRegions = 1; + constexpr uint32_t numHeapRanges = 1; + + // This is a brand new allocation/resource, so the tile mappings are + // guaranteed to be set (on the GPU timeline) by the time any code can + // reference the returned resource. We only execute operations on a + // single hardware queue so there is no need to wait or signal. + // + // All resources have identical tile mappings. The repeated call to + // UpdateTileMappings on all resources instead of using CopyTileMappings + // is intentional: the latter API is not supported by all versions of + // PIX. 
+ for (auto resource : + {allocation.resource_uav_state.Get(), + allocation.resource_copy_src_state.Get(), + allocation.resource_copy_dst_state.Get()}) + { + queue_->UpdateTileMappings( + resource, + numResourceRegions, + &resource_region_start_coordinates, + &resource_region_size, + allocation.heaps[i].Get(), + numHeapRanges, + &tile_range_flags, + &heap_range_start_offset, + &heap_range_tile_count, + D3D12_TILE_MAPPING_FLAG_NONE); + } + + resource_region_start_coordinates.X += static_cast(heap_size_in_tiles); + unmapped_resource_tiles -= heap_size_in_tiles; + } + + assert(unmapped_resource_tiles == 0); + + return allocation; +} + +absl::optional D3D12HeapAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes) +{ + Allocation allocation = {}; + + // Create the allocation's sole heap. The allocation may be larger than the + // requested size to ensure a whole number of tiles. + allocation.heaps.resize(1); + D3D12_HEAP_DESC heap_desc = + CD3DX12_HEAP_DESC(size_in_bytes, heap_properties_, 0, heap_flags_); + HRESULT create_heap_hr = device_->CreateHeap( + &heap_desc, + IID_PPV_ARGS(&allocation.heaps.front())); + if (create_heap_hr == E_OUTOFMEMORY) + { + return absl::nullopt; + } + + // Create large placed resource that spans the heap. 
+ D3D12_RESOURCE_DESC resource_desc = + CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_); + + ID3D12Resource** resources[] = { + &allocation.resource_uav_state, + &allocation.resource_copy_src_state, + &allocation.resource_copy_dst_state}; + D3D12_RESOURCE_STATES states[] = { + initial_state_, + D3D12_RESOURCE_STATE_COPY_SOURCE, + D3D12_RESOURCE_STATE_COPY_DEST}; + + for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++) + { + HRESULT create_resource_hr = device_->CreatePlacedResource( + allocation.heaps.front().Get(), + 0, + &resource_desc, + states[i], + nullptr, + IID_PPV_ARGS(resources[i])); + if (create_resource_hr == E_OUTOFMEMORY) + { + return absl::nullopt; + } + ORT_THROW_IF_FAILED(create_resource_hr); + } + + return allocation; +} + +Microsoft::WRL::ComPtr D3D12HeapAllocator::Alloc(size_t size_in_bytes) +{ + if (size_in_bytes == 0) + { + return nullptr; + } + + // The D3D12 device is thread-safe so we don't need to hold the lock while + // creating an allocation. + absl::optional allocation = + tiling_enabled_ ? 
TryCreateTiledAllocation(size_in_bytes) + : TryCreateUntiledAllocation(size_in_bytes); + + ORT_THROW_HR_IF(E_UNEXPECTED, !allocation); + + auto reservedResourceWrapper = wil::MakeOrThrow(std::move(*allocation)); + Microsoft::WRL::ComPtr resourceWrapper; + reservedResourceWrapper.As(&resourceWrapper); + return resourceWrapper; +} + +void D3D12HeapAllocator::Free(void* ptr, uint64_t size_in_bytes) +{ + ORT_THROW_HR_IF(E_UNEXPECTED, ptr == nullptr); + + TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); + ORT_THROW_HR_IF(E_UNEXPECTED, tagged_ptr.offset != 0); + + // We need to access (mutable) state after this point, so we need to lock + std::unique_lock lock(mutex_); + + auto it = allocations_by_id_.find(tagged_ptr.allocation_id); + + ORT_THROW_HR_IF(E_UNEXPECTED, it == allocations_by_id_.end()); + + ReleaseAllocationID(tagged_ptr.allocation_id); + + // Frees the ID3D12Heap + allocations_by_id_.erase(it); +} + +D3D12BufferRegion D3D12HeapAllocator::CreateBufferRegion( + const void* ptr, + uint64_t size_in_bytes) +{ + ORT_THROW_HR_IF(E_UNEXPECTED, ptr == nullptr); + + TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); + + // We need to access (mutable) state after this point, so we need to lock + std::unique_lock lock(mutex_); + + // Find the allocation corresponding to this pointer + auto it = allocations_by_id_.find(tagged_ptr.allocation_id); + ORT_THROW_HR_IF(E_UNEXPECTED, it == allocations_by_id_.end()); + + Allocation* allocation = &it->second; + + return D3D12BufferRegion( + tagged_ptr.offset, + size_in_bytes, + allocation->resource_uav_state.Get(), + allocation->resource_copy_src_state.Get(), + allocation->resource_copy_dst_state.Get()); +} + +absl::optional D3D12HeapAllocator::TryReserveAllocationID() +{ + // The mutex must already be held + assert(!mutex_.try_lock()); + + if (!free_allocation_ids_.empty()) + { + // Return a free ID from the pool + uint32_t id = free_allocation_ids_.back(); + free_allocation_ids_.pop_back(); + return id; + } + + static 
constexpr uint32_t kMaxAllocationID = + (1 << TaggedPointer::kAllocationIDBits) - 1; + if (current_allocation_id_ == kMaxAllocationID) + { + // We've reached the maximum number of allocations! + return absl::nullopt; + } + + ++current_allocation_id_; + return current_allocation_id_; +} + +void D3D12HeapAllocator::ReleaseAllocationID(uint32_t id) +{ + // The mutex must already be held + assert(!mutex_.try_lock()); + + // Add it to the pool of free IDs + free_allocation_ids_.push_back(id); +} + +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h new file mode 100644 index 0000000000000..877e4b34be6ac --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h @@ -0,0 +1,134 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "absl/container/flat_hash_map.h" +#include "DmlSubAllocator.h" +#include "DmlBufferRegion.h" + +namespace Dml +{ + +struct Allocation +{ + Microsoft::WRL::ComPtr heap; + + // Heaps backing the memory for the allocation. If tiling is supported + // an allocation may comprise multiple heaps. If tiling is not supported + // an allocation will only have a single heap. + std::vector> heaps; + + // Resources created over this allocation's heaps. All three resources + // are identical aside from being fixed in a single resource state: UAV, + // COPY_SRC, and COPY_DST respectively. The purpose of duplicate + // resources is to enable overlapping resources in different states for + // copying data. Most callers will not (and should not) interact + // directly with these resources; all three are wrapped by the buffer + // regions returned from this allocator, and the appropriate resource + // will be used automatically when performing buffer copies. 
+ Microsoft::WRL::ComPtr resource_uav_state; + Microsoft::WRL::ComPtr resource_copy_src_state; + Microsoft::WRL::ComPtr resource_copy_dst_state; +}; + +// An allocator that makes logically contiguous allocations backed by D3D heaps. +// +// Heaps must fit entirely in either local or non-local memory. Larger heaps +// have a greater chance of getting demoted into non-local memory, which can be +// disastrous for performance. This problem is compounded by the fact that heaps +// may be demoted even if overall local memory usage is within the process' +// budget. Heaps are not necessarily mappable to discontiguous regions of +// physical memory, which means physical memory fragmentation *may* make it +// extremely difficult to accommodate larger heaps. +// +// On D3D hardware that supports tiled resource tier 1+ this class implements +// large allocations through tiling. Each allocation is backed by however many +// small heaps are necessary to cover the requested allocation size. Buffer +// regions retrieved through this allocator are reserved resources that span the +// full collection of heaps assigned to an individual allocation. Tile mappings +// are static. +// +// On hardware that doesn't support tiled resources each allocation is backed by +// a single heap. Buffer regions retrieved through this allocator are placed +// resources that span the full heap assigned to an individual allocation. In +// this case it is better to make more but smaller allocations (resulting in +// smaller heaps); this fallback path is only retained as a last resort for +// older hardware. +class D3D12HeapAllocator : public DmlSubAllocator +{ + public: + // Maximum size of a heap (in tiles) when allocations are tiled. Each tile + // is 64KB. A default size of 512 tiles (32MB) does a good job of handling + // local video memory fragmentation without requiring lots of heaps. 
+ static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512; + + // The largest single allocation supported by this allocator. We use 4GB + // minus a MB to avoid edge cases in hw/drivers that aren't expecting such + // large allocations. + static constexpr uint64_t kDefaultMaxAllocationSizeInBytes = + (1ull << 32) - (1ull << 20); + + D3D12HeapAllocator( + ID3D12Device* device, + ID3D12CommandQueue* queue, + const D3D12_HEAP_PROPERTIES& heap_props, + D3D12_HEAP_FLAGS heap_flags, + D3D12_RESOURCE_FLAGS resource_flags, + D3D12_RESOURCE_STATES initial_state); + + // Creates a reserved or placed resource buffer over the given memory range. + // The physical D3D12 resource may be larger than the requested size, so + // callers must ensure to use the offset/size returned in the + // D3D12BufferRegion else risk out of bounds access. Note that in practice + // the ID3D12Resource is cached, so this call typically has a lower cost + // than a call to ID3D12Device::CreatePlacedResource or + // CreateReservedResource. + D3D12BufferRegion CreateBufferRegion( + const void* ptr, + uint64_t size_in_bytes); + + Microsoft::WRL::ComPtr Alloc(size_t size_in_bytes) final; + void Free(void* ptr, uint64_t size_in_bytes); + bool TilingEnabled() const { return tiling_enabled_; }; + + private: + std::mutex mutex_; + + Microsoft::WRL::ComPtr device_; + Microsoft::WRL::ComPtr queue_; + const D3D12_HEAP_PROPERTIES heap_properties_; + const D3D12_HEAP_FLAGS heap_flags_; + const D3D12_RESOURCE_FLAGS resource_flags_; + const D3D12_RESOURCE_STATES initial_state_; + bool tiling_enabled_; + uint64_t max_heap_size_in_tiles_; + + // The largest allocation ID we've returned so far (or 0 if we've never done + // so). Note that our allocation IDs start at 1 (not 0) to ensure that it + // isn't possible for a valid allocation to have a pointer value of + // 0x00000000. + uint32_t current_allocation_id_ = 0; + + // A list of unused allocation IDs. This is for re-use of IDs once they get + // freed. 
We only bump current_allocation_id_ once there are no more free + // IDs. + std::vector free_allocation_ids_; + + absl::flat_hash_map allocations_by_id_; + + // Retrieves a free allocation ID, or nullopt if no more IDs are available. + absl::optional TryReserveAllocationID(); + + // Releases an allocation ID back to the pool of IDs. + void ReleaseAllocationID(uint32_t id); + + private: + absl::optional TryCreateTiledAllocation(uint64_t size_in_bytes); + absl::optional TryCreateUntiledAllocation( + uint64_t size_in_bytes); + + friend class D3D12BufferRegion; +}; + +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h new file mode 100644 index 0000000000000..9d52c4e8c0445 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
+ +#include "DmlResourceWrapper.h" +#include "DmlBufferRegion.h" +#include "DmlHeapAllocator.h" + +namespace Dml +{ + class DmlReservedResourceWrapper : public Microsoft::WRL::RuntimeClass, DmlResourceWrapper> + { + public: + DmlReservedResourceWrapper(Allocation&& allocation) : m_allocation(std::move(allocation)) {} + ID3D12Resource* GetResourceInUavState() const final { return m_allocation.resource_uav_state.Get(); } + ID3D12Resource* GetResourceInCopySrcState() const final { return m_allocation.resource_copy_src_state.Get(); } + ID3D12Resource* GetResourceInCopyDstState() const final { return m_allocation.resource_copy_dst_state.Get(); } + + private: + Allocation m_allocation; + }; +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h index 876487242aa37..e600cee0589d0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h @@ -11,7 +11,9 @@ namespace Dml DmlResourceWrapper : public IUnknown { public: - virtual ID3D12Resource* GetD3D12Resource() const = 0; + virtual ID3D12Resource* GetResourceInUavState() const = 0; + virtual ID3D12Resource* GetResourceInCopySrcState() const = 0; + virtual ID3D12Resource* GetResourceInCopyDstState() const = 0; virtual ~DmlResourceWrapper(){} }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp new file mode 100644 index 0000000000000..ba3f4cb85697e --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "precomp.h" +#include "DmlTaggedPointer.h" +#include + +namespace Dml +{ +/*static*/ TaggedPointer TaggedPointer::Unpack(const void* ptr) +{ + uint64_t ptr_val = reinterpret_cast(ptr); + + static constexpr uint64_t kAllocationIDMask = + (1ull << kAllocationIDBits) - 1; + static constexpr uint64_t kOffsetMask = (1ull << kOffsetBits) - 1; + + TaggedPointer tagged_ptr; + tagged_ptr.allocation_id = (ptr_val >> kOffsetBits) & kAllocationIDMask; + tagged_ptr.offset = (ptr_val & kOffsetMask); + + return tagged_ptr; +} + +/*static*/ void* TaggedPointer::Pack(uint32_t allocation_id, uint64_t offset) +{ + assert(allocation_id < (1ull << kAllocationIDBits)); + assert(offset < (1ull << kOffsetBits)); + uint64_t ptr = ((uint64_t)allocation_id << kOffsetBits) | offset; + + return reinterpret_cast(ptr); +} +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h new file mode 100644 index 0000000000000..a161007a138ea --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include + +namespace Dml +{ + +// D3D12HeapAllocator and D3D12DescriptorHeapAllocator encode the allocation ID +// into the high bits of the pointers they return, while the low bits are used as +// an offset into the allocation. Note that since the layout of bitfields is +// implementation-defined, you can't just cast a void* into a TaggedPointer: it +// must be done using masks and shifts.
+struct TaggedPointer +{ + static constexpr uint64_t kAllocationIDBits = 24; + static constexpr uint64_t kOffsetBits = 40; + + uint64_t allocation_id : kAllocationIDBits; + uint64_t offset : kOffsetBits; + + static void* Pack(uint32_t allocation_id, uint64_t offset); + static TaggedPointer Unpack(const void* ptr); +}; + +static_assert( + sizeof(TaggedPointer) == sizeof(void*), + "DML requires a 64-bit architecture"); +static_assert(TaggedPointer::kAllocationIDBits + TaggedPointer::kOffsetBits == sizeof(void*) * CHAR_BIT, + "DML requires a 64-bit architecture"); +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 3ae8e1483141c..6dc6f046727ab 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -17,8 +17,8 @@ #include "core/graph/indexed_sub_graph.h" #include "core/framework/compute_capability.h" #include "core/framework/fallback_cpu_capability.h" -#include "DmlCommittedResourceAllocator.h" #include "DmlCommittedResourceWrapper.h" +#include "DmlHeapAllocator.h" #ifdef ERROR #undef ERROR @@ -123,7 +123,7 @@ namespace Dml const auto* allocInfo = m_allocator->DecodeDataHandle(allocation.Get()); - ComPtr resource = allocInfo->GetResource(); + ComPtr resource = allocInfo->GetResourceInUavState(); resource.CopyTo(d3dResource); *pooledResource = allocation.Detach(); return S_OK; @@ -136,7 +136,7 @@ namespace Dml ORT_TRY { const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(allocation); - return allocInfo->GetResource(); + return allocInfo->GetResourceInUavState(); } ORT_CATCH_GENERIC { @@ -178,16 +178,20 @@ namespace Dml m_context = std::make_shared(m_d3d12Device.Get(), m_dmlDevice.Get(), queue); + auto heapAllocator = std::make_unique( + m_d3d12Device.Get(), + queue, + 
CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), + D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS, + D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + // Create an allocator for D3D12 buffers used to hold tensor data. The returned buffers from the allocator // should be DEFAULT heap buffers which can be used as UAVs, and which start in UAV state. m_allocator = std::make_shared( m_d3d12Device.Get(), m_context, - CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), - D3D12_HEAP_FLAG_NONE, - D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - std::make_unique(m_d3d12Device.Get())); + std::move(heapAllocator)); m_context->SetAllocator(m_allocator); @@ -338,7 +342,7 @@ namespace Dml { assert(tensor->IsDataInterface()); const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(tensor).GetDataInterface().Get()); - ID3D12Resource* resource = allocInfo->GetResource(); + ID3D12Resource* resource = allocInfo->GetResourceInUavState(); D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc(); bufferBindings.push_back({ resource, 0, resourceDesc.Width }); bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() }); @@ -429,12 +433,19 @@ namespace Dml // const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get()); - ID3D12Resource* dstData = dstAllocInfo->GetResource(); + ID3D12Resource* dstData = dstAllocInfo->GetResourceInCopyDstState() == nullptr + ? dstAllocInfo->GetResourceInUavState() + : dstAllocInfo->GetResourceInCopyDstState(); + + // When resources in dst state exist (e.g. reserved resources), we can avoid barriers. Otherwise, + // take the slower path of adding a barrier (e.g. committed resources). + const auto dstState = dstAllocInfo->GetResourceInCopyDstState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_DEST; + const void* srcData = src->GetData(); const uint64_t dstOffset = 0; - const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; // GPU resources are always kept in UAV state - m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(srcData, dataSizeInBytes)); } else if (!src->IsCpuData() && dst->IsCpuData()) @@ -446,10 +457,17 @@ namespace Dml void* dstData = dst->GetData(); const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get()); - ID3D12Resource* srcData = srcAllocInfo->GetResource(); + ID3D12Resource* srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr + ? srcAllocInfo->GetResourceInUavState() + : srcAllocInfo->GetResourceInCopySrcState(); + + // When resources in src state exist (e.g. reserved resources), we can avoid barriers. Otherwise, + // take the slower path of adding a barrier (e.g. committed resources). + const auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; const uint64_t srcOffset = 0; - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; // GPU resources are always kept in UAV state // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer m_readbackHeap->ReadbackFromGpu(AsByteSpan(dstData, dataSizeInBytes), srcData, srcOffset, srcState); @@ -462,9 +480,25 @@ namespace Dml const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get()); const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get()); - ID3D12Resource* srcData = srcAllocInfo->GetResource(); - ID3D12Resource* dstData = dstAllocInfo->GetResource(); - m_context->CopyBufferRegion(dstData, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, srcData, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, dataSizeInBytes); + ID3D12Resource* srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr + ? srcAllocInfo->GetResourceInUavState() + : srcAllocInfo->GetResourceInCopySrcState(); + + ID3D12Resource* dstData = dstAllocInfo->GetResourceInCopyDstState() == nullptr + ? dstAllocInfo->GetResourceInUavState() + : dstAllocInfo->GetResourceInCopyDstState(); + + // When resources in src and dst state exist (e.g. reserved resources), we can avoid barriers. Otherwise, + // take the slower path of adding a barrier (e.g. committed resources). + const auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr + ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; + + const auto dstState = dstAllocInfo->GetResourceInCopyDstState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_DEST; + + m_context->CopyBufferRegion(dstData, 0, dstState, srcData, 0, srcState, dataSizeInBytes); } else { @@ -488,7 +522,7 @@ namespace Dml if (mlTensor != nullptr) { const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(mlTensor.Get()); - ID3D12Resource* dstData = dstAllocInfo->GetResource(); + ID3D12Resource* dstData = dstAllocInfo->GetResourceInUavState(); m_context->FillBufferWithPattern(dstData, rawValue); } @@ -734,8 +768,16 @@ namespace Dml { // Source and destination for batched GPU -> CPU copies std::vector srcDatas; + srcDatas.reserve(src_dst_pairs.size()); + + std::vector srcStates; + srcStates.reserve(src_dst_pairs.size()); + std::vector dstDatas; + dstDatas.reserve(src_dst_pairs.size()); + std::vector dataSizesInBytes; + dataSizesInBytes.reserve(src_dst_pairs.size()); assert(!m_closed); auto provider = const_cast(this); @@ -776,14 +818,22 @@ namespace Dml dstDatas.push_back(dstWrapper.GetData()); const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(&srcWrapper).GetDataInterface().Get()); - srcDatas.push_back(srcAllocInfo->GetResource()); + auto srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr + ? srcAllocInfo->GetResourceInUavState() + : srcAllocInfo->GetResourceInCopySrcState(); + + auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; + + srcDatas.push_back(srcData); + srcStates.push_back(srcState); } const uint64_t srcOffset = 0; - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; // GPU resources are always kept in UAV state // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcState); + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcStates); return onnxruntime::common::Status::OK(); } @@ -836,10 +886,10 @@ namespace Dml else { #ifdef _GAMING_XBOX - ComPtr wrappedResource = Microsoft::WRL::Make(m_allocator->DecodeDataHandle(data)->GetResource()); + ComPtr wrappedResource = Microsoft::WRL::Make(m_allocator->DecodeDataHandle(data)->GetResourceInUavState()); *abiData = wrappedResource.Detach(); #else - ComPtr resource = m_allocator->DecodeDataHandle(data)->GetResource(); + ComPtr resource = m_allocator->DecodeDataHandle(data)->GetResourceInUavState(); *abiData = resource.Detach(); #endif } @@ -976,7 +1026,7 @@ namespace Dml ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr) { Dml::BucketizedBufferAllocator* pAllocationInfo = static_cast(allocator); - return pAllocationInfo->DecodeDataHandle(ptr)->GetResource(); + return pAllocationInfo->DecodeDataHandle(ptr)->GetResourceInUavState(); } void FlushContext(onnxruntime::IExecutionProvider* provider) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp index 31aacc3787818..590dffef488e4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp @@ -48,7 +48,7 @@ namespace Dml return newCapacity; } - void ReadbackHeap::EnsureReadbackHeap(size_t size) + void 
ReadbackHeap::EnsureReadbackHeap(size_t size) { if (!m_readbackHeap) { @@ -76,7 +76,7 @@ namespace Dml D3D12_RESOURCE_STATES srcState) { assert(!dst.empty()); - + EnsureReadbackHeap(dst.size()); // Copy from the source resource into the readback heap @@ -100,12 +100,12 @@ namespace Dml memcpy(dst.data(), readbackHeapData, dst.size()); m_readbackHeap->Unmap(0, nullptr); } - + void ReadbackHeap::ReadbackFromGpu( gsl::span dst, gsl::span dstSizes, gsl::span src, - D3D12_RESOURCE_STATES srcState) + gsl::span srcStates) { assert(dst.size() == src.size()); assert(dstSizes.size() == src.size()); @@ -133,7 +133,7 @@ namespace Dml D3D12_RESOURCE_STATE_COPY_DEST, src[i], 0, - srcState, + srcStates[i], dstSizes[i]); offset += dstSizes[i]; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h index c596d982b7931..9727dc6ac8752 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h @@ -21,13 +21,13 @@ namespace Dml ID3D12Resource* src, uint64_t srcOffset, D3D12_RESOURCE_STATES srcState); - + // Overload supporting batching void ReadbackFromGpu( gsl::span dst, gsl::span dstSizes, gsl::span src, - D3D12_RESOURCE_STATES srcState); + gsl::span srcStates); private: void EnsureReadbackHeap(size_t size); From 707c1c92f8db0257d501ab378ed71d14edf6dd00 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 17 Jan 2023 22:25:34 -0800 Subject: [PATCH 02/76] WIP --- .../dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 08bffae6e8a5b..10874b0611f7f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -125,7 +125,6 @@ namespace Dml resourceId = ++m_currentResourceId; } - assert(resourceWrapper->GetResourceInUavState()->GetDesc().Width == bucketSize); assert(resourceWrapper != nullptr); ComPtr allocInfo = wil::MakeOrThrow( From 0619fa37e9d15b13d0a53341a41faca0b8de09ee Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 17 Jan 2023 23:50:39 -0800 Subject: [PATCH 03/76] WIP --- .../DmlExecutionProvider/src/BucketizedBufferAllocator.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 10874b0611f7f..c1fa576c48574 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -167,7 +167,11 @@ namespace Dml gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize()); if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResourceInUavState()->GetDesc().Width) { - assert(gsl::narrow_cast(m_pool.size()) > bucketIndex); + if (gsl::narrow_cast(m_pool.size()) <= bucketIndex) + { + // Ensure there are sufficient buckets + m_pool.resize(bucketIndex + 1); + } // Return the resource to the bucket Bucket* bucket = &m_pool[bucketIndex]; From 6b62b7228197c4d0a89315e6f118723ee455b733 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 18 Jan 2023 22:07:32 -0800 Subject: [PATCH 04/76] WIP --- .../src/BucketizedBufferAllocator.cpp | 41 +++++-------------- .../src/DmlCommandRecorder.cpp | 15 +++++-- .../src/DmlCommittedResourceAllocator.cpp | 28 ------------- .../src/DmlCommittedResourceAllocator.h | 21 ---------- .../src/DmlHeapAllocator.cpp | 10 +++++ .../src/DmlHeapAllocator.h | 1 + .../src/DmlSubAllocator.h | 1 + 
.../src/ExecutionContext.cpp | 32 +++++++-------- .../cppwinrt/scenariotestscppwinrt.cpp | 12 ++++-- 9 files changed, 58 insertions(+), 103 deletions(-) delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index c1fa576c48574..417d2639dad31 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -84,16 +84,18 @@ namespace Dml ComPtr resourceWrapper; uint64_t resourceId = 0; - uint64_t bucketSize = 0; + + // Find the bucket for this allocation size + gsl::index bucketIndex = GetBucketIndexFromSize(size); + + // Some sub allocators have their own rounding mechanisms or alignment requirements of resources + uint64_t bucketSize = m_subAllocator->ComputeRequiredSize(GetBucketSizeFromIndex(bucketIndex)); // Use a pooled resource if the size (post rounding, if requested) matches a bucket size - if (m_defaultRoundingMode == AllocatorRoundingMode::Enabled || size == GetBucketSizeFromIndex(GetBucketIndexFromSize(size))) + if (m_defaultRoundingMode == AllocatorRoundingMode::Enabled || size == bucketSize) { Bucket* bucket = nullptr; - // Find the bucket for this allocation size - gsl::index bucketIndex = GetBucketIndexFromSize(size); - if (gsl::narrow_cast(m_pool.size()) <= bucketIndex) { // Ensure there are sufficient buckets @@ -101,7 +103,6 @@ namespace Dml } bucket = &m_pool[bucketIndex]; - bucketSize = GetBucketSizeFromIndex(bucketIndex); if (bucket->resources.empty()) { @@ -120,12 +121,13 @@ namespace Dml else { // The allocation will not be pooled. 
Construct a new one - bucketSize = (size + 3) & ~3; + bucketSize = m_subAllocator->ComputeRequiredSize(size); resourceWrapper = m_subAllocator->Alloc(bucketSize); resourceId = ++m_currentResourceId; } assert(resourceWrapper != nullptr); + assert(resourceWrapper->GetResourceInUavState()->GetDesc().Width == bucketSize); ComPtr allocInfo = wil::MakeOrThrow( this, @@ -183,31 +185,10 @@ namespace Dml { // Free the underlying allocation once queued work has completed. #ifdef _GAMING_XBOX - m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInUavState()).Get()); - - if (allocInfo->GetResourceInCopySrcState() != nullptr) - { - m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInCopySrcState()).Get()); - } - - if (allocInfo->GetResourceInCopyDstState() != nullptr) - { - m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInCopyDstState()).Get()); - } + m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->DetachResourceWrapper().Get()).Get()); #else - m_context->QueueReference(allocInfo->GetResourceInUavState()); - - if (allocInfo->GetResourceInCopySrcState() != nullptr) - { - m_context->QueueReference(allocInfo->GetResourceInCopySrcState()); - } - - if (allocInfo->GetResourceInCopyDstState() != nullptr) - { - m_context->QueueReference(allocInfo->GetResourceInCopyDstState()); - } + m_context->QueueReference(allocInfo->DetachResourceWrapper().Get()); #endif - allocInfo->DetachResourceWrapper(); } #if _DEBUG diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index bd6a5c6b7aa17..d16c0201743db 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -101,8 +101,10 @@ void DmlCommandRecorder::InitializeOperator( if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) || 
(temporaryResourceSize > 0)) { - auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); - m_currentCommandList->ResourceBarrier(1, &uav); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); } } @@ -168,8 +170,13 @@ void DmlCommandRecorder::ExecuteOperator( // Barrier all outputs. #pragma warning(push) #pragma warning(disable: 6387) - auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); - m_currentCommandList->ResourceBarrier(1, &uav); + + // Barrier all outputs. + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); + #pragma warning(pop) } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp deleted file mode 100644 index d9bfdc3473ca7..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include "precomp.h" -#include "DmlCommittedResourceAllocator.h" -#include "DmlResourceWrapper.h" -#include "DmlCommittedResourceWrapper.h" - -namespace Dml -{ - ComPtr DmlCommittedResourceAllocator::Alloc(size_t size) - { - ComPtr resource; - auto buffer = CD3DX12_RESOURCE_DESC::Buffer(size, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - ORT_THROW_IF_FAILED(m_device->CreateCommittedResource( - &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), - D3D12_HEAP_FLAG_NONE, - &buffer, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - nullptr, - IID_GRAPHICS_PPV_ARGS(resource.GetAddressOf()) - )); - - ComPtr resourceWrapper; - wil::MakeOrThrow(std::move(resource)).As(&resourceWrapper); - return resourceWrapper; - } -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h deleted file mode 100644 index 7ad48be32a6c9..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#pragma once - -#include "DmlSubAllocator.h" - -namespace Dml -{ - struct DmlResourceWrapper; - - class DmlCommittedResourceAllocator : public DmlSubAllocator - { - public: - DmlCommittedResourceAllocator(ID3D12Device* device) : m_device(device) {} - Microsoft::WRL::ComPtr Alloc(size_t size) final; - - private: - ID3D12Device* m_device = nullptr; - }; -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp index f56312b8ea2cf..2ba44de85b2a8 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp @@ -214,6 +214,16 @@ absl::optional D3D12HeapAllocator::TryCreateUntiledAllocation(uint64 return allocation; } +uint64_t D3D12HeapAllocator::ComputeRequiredSize(size_t size) +{ + const uint64_t resource_size_in_tiles = + 1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + const uint64_t resource_size_in_bytes = + resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + + return resource_size_in_bytes; +} + Microsoft::WRL::ComPtr D3D12HeapAllocator::Alloc(size_t size_in_bytes) { if (size_in_bytes == 0) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h index 877e4b34be6ac..b15eeff3575fe 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h @@ -89,6 +89,7 @@ class D3D12HeapAllocator : public DmlSubAllocator uint64_t size_in_bytes); Microsoft::WRL::ComPtr Alloc(size_t size_in_bytes) final; + uint64_t ComputeRequiredSize(size_t size) final; void Free(void* ptr, uint64_t size_in_bytes); bool TilingEnabled() const { return tiling_enabled_; }; diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h index cfdaf17710001..033fb15388066 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h @@ -11,6 +11,7 @@ namespace Dml { public: virtual Microsoft::WRL::ComPtr Alloc(size_t size) = 0; + virtual uint64_t ComputeRequiredSize(size_t size) = 0; virtual ~DmlSubAllocator(){} }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index a894d0660d6ff..1d41d26cf0062 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -15,7 +15,7 @@ namespace Dml : m_queue(std::make_shared(queue)) , m_dmlRecorder(d3d12Device, dmlDevice, m_queue) { - ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf()))); + ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf()))); } void ExecutionContext::SetAllocator(std::weak_ptr allocator) @@ -55,15 +55,15 @@ namespace Dml m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount); // Reset barrier state - if (!barriers.empty()) + for (auto& barrier : barriers) { - for (auto& barrier : barriers) - { - std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter); - } - - m_dmlRecorder.ResourceBarrier(barriers); + std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter); } + + // Since this copy may write to GPU memory, we also need to perform an + // aliasing barrier + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); + m_dmlRecorder.ResourceBarrier(barriers); } void 
ExecutionContext::FillBufferWithPattern( @@ -78,14 +78,14 @@ namespace Dml ID3D12GraphicsCommandList* commandList, _Outptr_ ID3D12Fence** fence, _Out_ uint64_t* completionValue - ) + ) { assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); m_dmlRecorder.ExecuteCommandList(commandList, fence, completionValue); } - + void ExecutionContext::InitializeOperator( IDMLCompiledOperator* op, const DML_BINDING_DESC& persistentResourceBinding, @@ -110,7 +110,7 @@ namespace Dml } void ExecutionContext::AddUAVBarrier() - { + { assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -173,9 +173,9 @@ namespace Dml m_currentRecorder = nullptr; SetCommandRecorder(&m_dmlRecorder); } - - void ExecutionContext::QueueReference(IUnknown* object) - { + + void ExecutionContext::QueueReference(IUnknown* object) + { assert(!m_closed); // If something has been recorded into a command list but not submitted yet, it means that the *next* fence // value is the one to signal completion. @@ -186,14 +186,14 @@ namespace Dml void ExecutionContext::Close() { assert(!m_closed); - + // Discard unflushed work and clear queued references. 
This prevents the circular reference: // Kernel --> ProviderImpl --> Context --> QueuedRefs --> Kernel m_queue->Close(); m_currentRecorder = nullptr; m_closed = true; } - + GpuEvent ExecutionContext::GetCurrentCompletionEvent() { assert(!m_closed); diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp index 5d3561076c6aa..cb195acd33090 100644 --- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp +++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp @@ -1114,7 +1114,11 @@ static void MsftQuantizedModels() { // load a model std::wstring filePath = FileHelpers::GetModulePath() + L"coreml_Resnet50_ImageNet-dq.onnx"; LearningModel model = LearningModel::LoadFromFilePath(filePath); - LearningModelSession session(model, LearningModelDevice(LearningModelDeviceKind::DirectX)); + + auto device = LearningModelDevice(LearningModelDeviceKind::DirectX); + device.as()->SetMetacommandsEnabled(false); + + LearningModelSession session(model, device); // create a binding set LearningModelBinding binding(session); // bind the input and the output buffers by name @@ -1525,7 +1529,7 @@ static void BindMultipleCPUBuffersAsInputs(LearningModelDeviceKind kind) { buffers.Append(wss::Buffer::CreateCopyFromMemoryBuffer(red)); buffers.Append(wss::Buffer::CreateCopyFromMemoryBuffer(green)); buffers.Append(wss::Buffer::CreateCopyFromMemoryBuffer(blue)); - + // Bind input binding.Bind(model.InputFeatures().First().Current().Name(), buffers); @@ -1627,7 +1631,7 @@ static void BindMultipleCPUBuffersAsOutputs(LearningModelDeviceKind kind) { red_buffer.try_as<::Windows::Storage::Streams::IBufferByteAccess>()->Buffer(reinterpret_cast(&red_bytes)); green_buffer.try_as<::Windows::Storage::Streams::IBufferByteAccess>()->Buffer(reinterpret_cast(&green_bytes)); blue_buffer.try_as<::Windows::Storage::Streams::IBufferByteAccess>()->Buffer(reinterpret_cast(&blue_bytes)); - + // Verify the output by comparing with the 
benchmark image SoftwareBitmap benchmark_bitmap = FileHelpers::GetSoftwareBitmapFromFile(bmImagePath); benchmark_bitmap = SoftwareBitmap::Convert(benchmark_bitmap, BitmapPixelFormat::Bgra8); @@ -1638,7 +1642,7 @@ static void BindMultipleCPUBuffersAsOutputs(LearningModelDeviceKind kind) { wf::IMemoryBufferReference benchmark_reference = benchmark_bitmap_buffer.CreateReference(); auto benchmark_byte_access = benchmark_reference.as<::Windows::Foundation::IMemoryBufferByteAccess>(); benchmark_byte_access->GetBuffer(&benchmark_data, &benchmark_size); - + // hard code, might need to be modified later. const float cMaxErrorRate = 0.06f; byte epsilon = 20; From 3f2910b6eb7abbe21d0beb21182dbfc84938ec34 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 19 Jan 2023 11:47:55 -0800 Subject: [PATCH 05/76] WIP --- .../Api.Image/VideoFrameToTensorConverter.cpp | 16 +++++----------- .../scenario/cppwinrt/scenariotestscppwinrt.cpp | 5 +---- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp index c4490db394792..1215548d212c5 100644 --- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp +++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp @@ -328,11 +328,6 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( // Validate Tensor Resource { - D3D12_HEAP_PROPERTIES outputHeapProperties; - D3D12_HEAP_FLAGS outputHeapFlags; - - WINML_THROW_IF_FAILED(pOutputResource->GetHeapProperties(&outputHeapProperties, &outputHeapFlags)); - UINT64 ullNumElementsTensor = 1; for (UINT uiIdx = 0; uiIdx < kImageTensorDimensionCountMax; uiIdx++) { WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, tensorDesc.sizes[uiIdx], &ullNumElementsTensor)); @@ -347,8 +342,7 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( if (outputDesc.Width < ullTensorSize || outputDesc.Height != 1 || outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || - 
!(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS) || - outputHeapProperties.Type != D3D12_HEAP_TYPE_DEFAULT) { + !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)) { WINML_THROW_IF_FAILED(E_INVALIDARG); } } @@ -533,7 +527,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( command_list_->ResourceBarrier(1, &barrier); command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx, upload_heap_.Get(), 0, bufferSize); - + WINML_THROW_IF_FAILED(command_list_->Close()); ID3D12CommandList* ppCommandLists[] = {command_list_.Get()}; device_cache.GetCommandQueue()->ExecuteCommandLists(_countof(ppCommandLists), ppCommandLists); @@ -570,9 +564,9 @@ void VideoFrameToTensorConverter::ConvertBuffersToBatchedGPUTensor( gpu_buffer_span); upload_heap_->Unmap(0, &CD3DX12_RANGE(0, buffer_size_in_bytes)); - + ResetCommandList(device_cache); - + auto barrier1 = CD3DX12_RESOURCE_BARRIER::Transition(output_resource, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_DEST); command_list_->ResourceBarrier(1, &barrier1); command_list_->CopyBufferRegion(output_resource, 0, upload_heap_.Get(), 0, buffer_size_in_bytes); @@ -692,4 +686,4 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor( inputBounds, reinterpret_cast(pCPUTensor))); } -} \ No newline at end of file +} diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp index cb195acd33090..18e0c28ef4765 100644 --- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp +++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp @@ -1115,10 +1115,7 @@ static void MsftQuantizedModels() { std::wstring filePath = FileHelpers::GetModulePath() + L"coreml_Resnet50_ImageNet-dq.onnx"; LearningModel model = LearningModel::LoadFromFilePath(filePath); - auto device = LearningModelDevice(LearningModelDeviceKind::DirectX); - device.as()->SetMetacommandsEnabled(false); - - LearningModelSession 
session(model, device); + LearningModelSession session(model, LearningModelDevice(LearningModelDeviceKind::DirectX)); // create a binding set LearningModelBinding binding(session); // bind the input and the output buffers by name From 25bb52d7df70300c6c33d1bd735859ace3f944a9 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 19 Jan 2023 18:24:19 -0800 Subject: [PATCH 06/76] WIP --- .../src/BucketizedBufferAllocator.cpp | 4 +- .../src/BucketizedBufferAllocator.h | 27 +++- .../src/DmlBufferRegion.cc | 120 ------------------ .../src/DmlBufferRegion.h | 82 ------------ .../src/DmlCommandRecorder.cpp | 7 +- .../src/DmlCommittedResourceWrapper.h | 12 +- .../src/DmlHeapAllocator.cpp | 81 ------------ .../src/DmlHeapAllocator.h | 39 ------ .../src/DmlReservedResourceWrapper.h | 11 +- .../src/DmlResourceWrapper.h | 9 +- .../src/DmlTaggedPointer.cpp | 33 ----- .../src/DmlTaggedPointer.h | 34 ----- .../src/ExecutionProvider.cpp | 65 +++------- 13 files changed, 67 insertions(+), 457 deletions(-) delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 417d2639dad31..18c747079f183 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -127,7 +127,7 @@ namespace Dml } assert(resourceWrapper != nullptr); - assert(resourceWrapper->GetResourceInUavState()->GetDesc().Width == bucketSize); + 
assert(resourceWrapper->GetUavResource()->GetDesc().Width == bucketSize); ComPtr allocInfo = wil::MakeOrThrow( this, @@ -167,7 +167,7 @@ namespace Dml // Free the resource to the pool if its size matches a bucket size gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize()); - if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResourceInUavState()->GetDesc().Width) + if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetUavResource()->GetDesc().Width) { if (gsl::narrow_cast(m_pool.size()) <= bucketIndex) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 3d95bd029aad8..75025a4af0f8b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -46,19 +46,34 @@ namespace Dml return m_owner; } - ID3D12Resource* GetResourceInUavState() const + ID3D12Resource* GetUavResource() const { - return m_resourceWrapper->GetResourceInUavState(); + return m_resourceWrapper->GetUavResource(); } - ID3D12Resource* GetResourceInCopySrcState() const + ID3D12Resource* GetCopySrcResource() const { - return m_resourceWrapper->GetResourceInCopySrcState(); + return m_resourceWrapper->GetCopySrcResource(); } - ID3D12Resource* GetResourceInCopyDstState() const + ID3D12Resource* GetCopyDstResource() const { - return m_resourceWrapper->GetResourceInCopyDstState(); + return m_resourceWrapper->GetCopyDstResource(); + } + + D3D12_RESOURCE_STATES GetDefaultUavState() const + { + return m_resourceWrapper->GetDefaultUavState(); + } + + D3D12_RESOURCE_STATES GetDefaultCopySrcState() const + { + return m_resourceWrapper->GetDefaultCopySrcState(); + } + + D3D12_RESOURCE_STATES GetDefaultCopyDstState() const + { + return m_resourceWrapper->GetDefaultCopyDstState(); } ComPtr DetachResourceWrapper() const diff 
--git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc deleted file mode 100644 index 8d6fbd0551083..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "precomp.h" -#include "DmlBufferRegion.h" - -namespace Dml -{ - -D3D12BufferRegion::D3D12BufferRegion( - uint64_t offset, - uint64_t size_in_bytes, - ID3D12Resource* resource_uav_state, - ID3D12Resource* resource_copy_src_state, - ID3D12Resource* resource_copy_dst_state) - : resource_uav_state_(resource_uav_state), - resource_copy_src_state_(resource_copy_src_state), - resource_copy_dst_state_(resource_copy_dst_state), - offset_(offset), - size_in_bytes_(size_in_bytes) -{ - // Get a raw pointer to the first non-null resource passed in. At least one - // resource must be provided. - first_valid_resource_ = resource_uav_state_; - if (!first_valid_resource_) - { - first_valid_resource_ = resource_copy_src_state_; - } - if (!first_valid_resource_) - { - first_valid_resource_ = resource_copy_dst_state_; - } - ORT_THROW_HR_IF(E_UNEXPECTED, first_valid_resource_ == nullptr); - - // Regions cannot be empty. - ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes_ == 0); - - // Regions cannot extend beyond the size of the resource. - uint64_t buffer_size = first_valid_resource_->GetDesc().Width; - ORT_THROW_HR_IF(E_UNEXPECTED, offset_ >= buffer_size); - ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes_ > buffer_size - offset); - - // All three resources, if provided, must be identical aside from state. 
- assert( - first_valid_resource_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER); - assert( - !resource_uav_state || - (resource_uav_state->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_uav_state->GetDesc().Width == buffer_size)); - assert( - !resource_copy_src_state_ || - (resource_copy_src_state_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_copy_src_state_->GetDesc().Width == buffer_size)); - assert( - !resource_copy_dst_state_ || - (resource_copy_dst_state_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_copy_dst_state_->GetDesc().Width == buffer_size)); -} - -D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) -{ - std::swap(this->resource_uav_state_, that.resource_uav_state_); - std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); - std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); - std::swap(this->offset_, that.offset_); - std::swap(this->size_in_bytes_, that.size_in_bytes_); - std::swap(this->first_valid_resource_, that.first_valid_resource_); -} - -D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) -{ - std::swap(this->resource_uav_state_, that.resource_uav_state_); - std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); - std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); - std::swap(this->offset_, that.offset_); - std::swap(this->size_in_bytes_, that.size_in_bytes_); - std::swap(this->first_valid_resource_, that.first_valid_resource_); - return *this; -} - -ID3D12Resource* D3D12BufferRegion::GetResourceInUavState() const -{ - return resource_uav_state_; -} - -ID3D12Resource* D3D12BufferRegion::GetResourceInCopySrcState() const -{ - return resource_copy_src_state_; -} - -ID3D12Resource* D3D12BufferRegion::GetResourceInCopyDstState() const -{ - return resource_copy_dst_state_; -} - -uint64_t D3D12BufferRegion::Offset() const -{ - return 
first_valid_resource_ ? offset_ : 0; -} - -uint64_t D3D12BufferRegion::SizeInBytes() const -{ - return first_valid_resource_ ? size_in_bytes_ : 0; -} - -DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const -{ - if (!resource_uav_state_) - { - return DML_BUFFER_BINDING{}; - } - - return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_}; -} - -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h deleted file mode 100644 index f8c1033261c56..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -namespace Dml -{ - -class D3D12HeapAllocator; - -// Represents a region of a D3D12 buffer resource. A buffer region has an -// underlying ID3D12Resource* (of D3D12_RESOURCE_DIMENSION_BUFFER), an offset in -// bytes from the beginning of that buffer, and a size in bytes of the region. -class D3D12BufferRegion -{ - public: - D3D12BufferRegion() = default; - - // References a region of a buffer. The respective ID3D12Resource objects - // must be in the appropriate states. Each resource is optional, but if more - // than one are provided they must map to the same region of memory. - D3D12BufferRegion( - uint64_t offset, - uint64_t size_in_bytes, - ID3D12Resource* resource_uav_state, - ID3D12Resource* resource_copy_src_state, - ID3D12Resource* resource_copy_dst_state); - - // Move-only - D3D12BufferRegion(const D3D12BufferRegion&) = delete; - D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete; - D3D12BufferRegion(D3D12BufferRegion&&); - D3D12BufferRegion& operator=(D3D12BufferRegion&&); - - ID3D12Resource* GetResourceInUavState() const; - - // NOTE: may be any state that is valid as a copy source (COPY_SRC, - // GENERIC_READ, or COMMON). 
- ID3D12Resource* GetResourceInCopySrcState() const; - - ID3D12Resource* GetResourceInCopyDstState() const; - - uint64_t Offset() const; - uint64_t SizeInBytes() const; - - DML_BUFFER_BINDING GetBufferBinding() const; - - explicit operator bool() const { return first_valid_resource_ != nullptr; } - - // Creates a subregion at an offset from the start of this region. If no - // size is provided the region runs to the end of the current region. - inline D3D12BufferRegion Subregion( - uint64_t offset, - uint64_t size_in_bytes = 0) const - { - // start of subregion must be within current region - ORT_THROW_HR_IF(E_UNEXPECTED, offset >= size_in_bytes_); - size_in_bytes = - size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; - // end of subregion must be within current region - ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes > size_in_bytes_ - offset); - - return D3D12BufferRegion( - offset_ + offset, - size_in_bytes, - resource_uav_state_, - resource_copy_src_state_, - resource_copy_dst_state_); - } - - private: - ID3D12Resource* resource_uav_state_ = nullptr; - ID3D12Resource* resource_copy_src_state_ = nullptr; - ID3D12Resource* resource_copy_dst_state_ = nullptr; - uint64_t offset_ = 0; - uint64_t size_in_bytes_ = 0; - - // Pointer to the first resource above that isn't null. - ID3D12Resource* first_valid_resource_ = nullptr; -}; - -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index d16c0201743db..7f2fdafbbeb60 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -62,12 +62,15 @@ void DmlCommandRecorder::InitializeOperator( // Allocate and immediately free a temporary buffer. The buffer resource will still be // alive (managed by the pool); freeing allows the resource to be shared with other operators. 
void* tempResourceHandle = allocator->Alloc(static_cast(temporaryResourceSize), AllocatorRoundingMode::Enabled); + + + if (!tempResourceHandle) { ORT_THROW_HR(E_OUTOFMEMORY); } - ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResourceInUavState(); + ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetUavResource(); allocator->Free(tempResourceHandle); // Bind the temporary resource. @@ -145,7 +148,7 @@ void DmlCommandRecorder::ExecuteOperator( ORT_THROW_HR(E_OUTOFMEMORY); } - ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResourceInUavState(); + ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetUavResource(); allocator->Free(tempResourceHandle); // Bind the temporary resource. diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h index e86ca4b52b4f2..f786cca837f06 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h @@ -9,9 +9,15 @@ namespace Dml { public: DmlCommittedResourceWrapper(ComPtr&& d3d12Resource) : m_d3d12Resource(std::move(d3d12Resource)) {} - ID3D12Resource* GetResourceInUavState() const final { return m_d3d12Resource.Get(); } - ID3D12Resource* GetResourceInCopySrcState() const final { return nullptr; } - ID3D12Resource* GetResourceInCopyDstState() const final { return nullptr; } + + // Committed resources use the same resource for all states and use barriers to transition between states + ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); } + ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); } + ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); } + + D3D12_RESOURCE_STATES 
GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } + D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } + D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } private: ComPtr m_d3d12Resource; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp index 2ba44de85b2a8..bdda99ae6f91a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp @@ -3,8 +3,6 @@ #include "precomp.h" #include "DmlHeapAllocator.h" -#include "DmlTaggedPointer.h" -#include "DmlBufferRegion.h" #include "DmlReservedResourceWrapper.h" namespace Dml @@ -245,83 +243,4 @@ Microsoft::WRL::ComPtr D3D12HeapAllocator::Alloc(size_t size return resourceWrapper; } -void D3D12HeapAllocator::Free(void* ptr, uint64_t size_in_bytes) -{ - ORT_THROW_HR_IF(E_UNEXPECTED, ptr == nullptr); - - TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); - ORT_THROW_HR_IF(E_UNEXPECTED, tagged_ptr.offset != 0); - - // We need to access (mutable) state after this point, so we need to lock - std::unique_lock lock(mutex_); - - auto it = allocations_by_id_.find(tagged_ptr.allocation_id); - - ORT_THROW_HR_IF(E_UNEXPECTED, it == allocations_by_id_.end()); - - ReleaseAllocationID(tagged_ptr.allocation_id); - - // Frees the ID3D12Heap - allocations_by_id_.erase(it); -} - -D3D12BufferRegion D3D12HeapAllocator::CreateBufferRegion( - const void* ptr, - uint64_t size_in_bytes) -{ - ORT_THROW_HR_IF(E_UNEXPECTED, ptr == nullptr); - - TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); - - // We need to access (mutable) state after this point, so we need to lock - std::unique_lock lock(mutex_); - - // Find the allocation corresponding to this pointer - auto it = 
allocations_by_id_.find(tagged_ptr.allocation_id); - ORT_THROW_HR_IF(E_UNEXPECTED, it == allocations_by_id_.end()); - - Allocation* allocation = &it->second; - - return D3D12BufferRegion( - tagged_ptr.offset, - size_in_bytes, - allocation->resource_uav_state.Get(), - allocation->resource_copy_src_state.Get(), - allocation->resource_copy_dst_state.Get()); -} - -absl::optional D3D12HeapAllocator::TryReserveAllocationID() -{ - // The mutex must already be held - assert(!mutex_.try_lock()); - - if (!free_allocation_ids_.empty()) - { - // Return a free ID from the pool - uint32_t id = free_allocation_ids_.back(); - free_allocation_ids_.pop_back(); - return id; - } - - static constexpr uint32_t kMaxAllocationID = - (1 << TaggedPointer::kAllocationIDBits) - 1; - if (current_allocation_id_ == kMaxAllocationID) - { - // We've reached the maximum number of allocations! - return absl::nullopt; - } - - ++current_allocation_id_; - return current_allocation_id_; -} - -void D3D12HeapAllocator::ReleaseAllocationID(uint32_t id) -{ - // The mutex must already be held - assert(!mutex_.try_lock()); - - // Add it to the pool of free IDs - free_allocation_ids_.push_back(id); -} - } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h index b15eeff3575fe..ad86107d6b05c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h @@ -3,9 +3,7 @@ #pragma once -#include "absl/container/flat_hash_map.h" #include "DmlSubAllocator.h" -#include "DmlBufferRegion.h" namespace Dml { @@ -63,12 +61,6 @@ class D3D12HeapAllocator : public DmlSubAllocator // local video memory fragmentation without requiring lots of heaps. static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512; - // The largest single allocation supported by this allocator. 
We use 4GB - // minus a MB to avoid edge cases in hw/drivers that aren't expecting such - // large allocations. - static constexpr uint64_t kDefaultMaxAllocationSizeInBytes = - (1ull << 32) - (1ull << 20); - D3D12HeapAllocator( ID3D12Device* device, ID3D12CommandQueue* queue, @@ -77,20 +69,8 @@ class D3D12HeapAllocator : public DmlSubAllocator D3D12_RESOURCE_FLAGS resource_flags, D3D12_RESOURCE_STATES initial_state); - // Creates a reserved or placed resource buffer over the given memory range. - // The physical D3D12 resource may be larger than the requested size, so - // callers must ensure to use the offset/size returned in the - // D3D12BufferRegion else risk out of bounds access. Note that in practice - // the ID3D12Resource is cached, so this call typically has a lower cost - // than a call to ID3D12Device::CreatePlacedResource or - // CreateReservedResource. - D3D12BufferRegion CreateBufferRegion( - const void* ptr, - uint64_t size_in_bytes); - Microsoft::WRL::ComPtr Alloc(size_t size_in_bytes) final; uint64_t ComputeRequiredSize(size_t size) final; - void Free(void* ptr, uint64_t size_in_bytes); bool TilingEnabled() const { return tiling_enabled_; }; private: @@ -105,25 +85,6 @@ class D3D12HeapAllocator : public DmlSubAllocator bool tiling_enabled_; uint64_t max_heap_size_in_tiles_; - // The largest allocation ID we've returned so far (or 0 if we've never done - // so). Note that our allocation IDs start at 1 (not 0) to ensure that it - // isn't possible for a valid allocation to have a pointer value of - // 0x00000000. - uint32_t current_allocation_id_ = 0; - - // A list of unused allocation IDs. This is for re-use of IDs once they get - // freed. We only bump the max_allocation_id_ once there are no more free - // IDs. - std::vector free_allocation_ids_; - - absl::flat_hash_map allocations_by_id_; - - // Retrieves a free allocation ID, or nullopt if no more IDs are available. 
- absl::optional TryReserveAllocationID(); - - // Releases an allocation ID back to the pool of IDs. - void ReleaseAllocationID(uint32_t id); - private: absl::optional TryCreateTiledAllocation(uint64_t size_in_bytes); absl::optional TryCreateUntiledAllocation( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h index 9d52c4e8c0445..413ade92daf51 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h @@ -2,7 +2,6 @@ // Licensed under the MIT License. #include "DmlResourceWrapper.h" -#include "DmlBufferRegion.h" #include "DmlHeapAllocator.h" namespace Dml @@ -11,9 +10,13 @@ namespace Dml { public: DmlReservedResourceWrapper(Allocation&& allocation) : m_allocation(std::move(allocation)) {} - ID3D12Resource* GetResourceInUavState() const final { return m_allocation.resource_uav_state.Get(); } - ID3D12Resource* GetResourceInCopySrcState() const final { return m_allocation.resource_copy_src_state.Get(); } - ID3D12Resource* GetResourceInCopyDstState() const final { return m_allocation.resource_copy_dst_state.Get(); } + ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); } + ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); } + ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); } + + D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } + D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; } + D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; } private: Allocation m_allocation; diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h index e600cee0589d0..03e9f762b7eb4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h @@ -11,9 +11,12 @@ namespace Dml DmlResourceWrapper : public IUnknown { public: - virtual ID3D12Resource* GetResourceInUavState() const = 0; - virtual ID3D12Resource* GetResourceInCopySrcState() const = 0; - virtual ID3D12Resource* GetResourceInCopyDstState() const = 0; + virtual ID3D12Resource* GetUavResource() const = 0; + virtual ID3D12Resource* GetCopySrcResource() const = 0; + virtual ID3D12Resource* GetCopyDstResource() const = 0; + virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0; + virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0; + virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0; virtual ~DmlResourceWrapper(){} }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp deleted file mode 100644 index ba3f4cb85697e..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include "precomp.h" -#include "DmlTaggedPointer.h" -#include - -namespace Dml -{ -/*static*/ TaggedPointer TaggedPointer::Unpack(const void* ptr) -{ - uint64_t ptr_val = reinterpret_cast(ptr); - - static constexpr uint64_t kAllocationIDMask = - (1ull << kAllocationIDBits) - 1; - static constexpr uint64_t kOffsetMask = (1ull << kOffsetBits) - 1; - - TaggedPointer tagged_ptr; - tagged_ptr.allocation_id = (ptr_val >> kOffsetBits) & kAllocationIDMask; - tagged_ptr.offset = (ptr_val & kOffsetMask); - - return tagged_ptr; -} - -/*static*/ void* TaggedPointer::Pack(uint32_t allocation_id, uint64_t offset) -{ - assert(allocation_id < (1ull << kAllocationIDBits)); - assert(offset < (1ull << kOffsetBits)); - uint64_t ptr = ((uint64_t)allocation_id << kOffsetBits) | offset; - - return reinterpret_cast(ptr); -} -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h deleted file mode 100644 index a161007a138ea..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include -#include - -namespace Dml -{ - -// D3D12HeapAllocator and D3D12DescriptorHeapAllocator encode the allocation ID -// into the high bits of the pointers it returns, while the low bits are used as -// an offset into the allocation. Note that since the layout of bitfields is -// implementation-defined, you can't just cast a void* into a TaggedPointer: it -// must be done using masks and shifts. 
-struct TaggedPointer -{ - static constexpr uint64_t kAllocationIDBits = 24; - static constexpr uint64_t kOffsetBits = 40; - - uint64_t allocation_id : kAllocationIDBits; - uint64_t offset : kOffsetBits; - - static void* Pack(uint32_t allocation_id, uint64_t offset); - static TaggedPointer Unpack(const void* ptr); -}; - -static_assert( - sizeof(TaggedPointer) == sizeof(void*), - "DML requires a 64-bit architecture"); -static_assert(TaggedPointer::kAllocationIDBits + TaggedPointer::kOffsetBits == sizeof(void*) * CHAR_BIT, - "DML requires a 64-bit architecture"); -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 6dc6f046727ab..fddd3267d9770 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -123,7 +123,7 @@ namespace Dml const auto* allocInfo = m_allocator->DecodeDataHandle(allocation.Get()); - ComPtr resource = allocInfo->GetResourceInUavState(); + ComPtr resource = allocInfo->GetUavResource(); resource.CopyTo(d3dResource); *pooledResource = allocation.Detach(); return S_OK; @@ -136,7 +136,7 @@ namespace Dml ORT_TRY { const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(allocation); - return allocInfo->GetResourceInUavState(); + return allocInfo->GetUavResource(); } ORT_CATCH_GENERIC { @@ -342,7 +342,7 @@ namespace Dml { assert(tensor->IsDataInterface()); const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(tensor).GetDataInterface().Get()); - ID3D12Resource* resource = allocInfo->GetResourceInUavState(); + ID3D12Resource* resource = allocInfo->GetUavResource(); D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc(); bufferBindings.push_back({ resource, 0, resourceDesc.Width }); bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() }); @@ -433,15 +433,8 @@ 
namespace Dml // const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get()); - ID3D12Resource* dstData = dstAllocInfo->GetResourceInCopyDstState() == nullptr - ? dstAllocInfo->GetResourceInUavState() - : dstAllocInfo->GetResourceInCopyDstState(); - - // When resources in dst state exist (e.g. reserved resources), we can avoid barriers. Otherwise, - // take the slower path of adding a barrier (e.g. committed resources). - const auto dstState = dstAllocInfo->GetResourceInCopyDstState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_DEST; + ID3D12Resource* dstData = dstAllocInfo->GetCopyDstResource(); + const auto dstState = dstAllocInfo->GetDefaultCopyDstState(); const void* srcData = src->GetData(); @@ -457,15 +450,8 @@ namespace Dml void* dstData = dst->GetData(); const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get()); - ID3D12Resource* srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr - ? srcAllocInfo->GetResourceInUavState() - : srcAllocInfo->GetResourceInCopySrcState(); - - // When resources in src state exist (e.g. reserved resources), we can avoid barriers. Otherwise, - // take the slower path of adding a barrier (e.g. committed resources). - const auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + ID3D12Resource* srcData = srcAllocInfo->GetCopySrcResource(); + const auto srcState = srcAllocInfo->GetDefaultCopySrcState(); const uint64_t srcOffset = 0; @@ -480,23 +466,11 @@ namespace Dml const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get()); const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get()); - ID3D12Resource* srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr - ? 
srcAllocInfo->GetResourceInUavState() - : srcAllocInfo->GetResourceInCopySrcState(); - - ID3D12Resource* dstData = dstAllocInfo->GetResourceInCopyDstState() == nullptr - ? dstAllocInfo->GetResourceInUavState() - : dstAllocInfo->GetResourceInCopyDstState(); + ID3D12Resource* srcData = srcAllocInfo->GetCopySrcResource(); + const auto srcState = srcAllocInfo->GetDefaultCopySrcState(); - // When resources in src and dst state exist (e.g. reserved resources), we can avoid barriers. Otherwise, - // take the slower path of adding a barrier (e.g. committed resources). - const auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; - - const auto dstState = dstAllocInfo->GetResourceInCopyDstState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_DEST; + ID3D12Resource* dstData = dstAllocInfo->GetCopyDstResource(); + const auto dstState = dstAllocInfo->GetDefaultCopyDstState(); m_context->CopyBufferRegion(dstData, 0, dstState, srcData, 0, srcState, dataSizeInBytes); } @@ -522,7 +496,7 @@ namespace Dml if (mlTensor != nullptr) { const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(mlTensor.Get()); - ID3D12Resource* dstData = dstAllocInfo->GetResourceInUavState(); + ID3D12Resource* dstData = dstAllocInfo->GetUavResource(); m_context->FillBufferWithPattern(dstData, rawValue); } @@ -818,13 +792,8 @@ namespace Dml dstDatas.push_back(dstWrapper.GetData()); const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(&srcWrapper).GetDataInterface().Get()); - auto srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr - ? srcAllocInfo->GetResourceInUavState() - : srcAllocInfo->GetResourceInCopySrcState(); - - auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr - ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + auto srcData = srcAllocInfo->GetCopySrcResource(); + auto srcState = srcAllocInfo->GetDefaultCopySrcState(); srcDatas.push_back(srcData); srcStates.push_back(srcState); @@ -886,10 +855,10 @@ namespace Dml else { #ifdef _GAMING_XBOX - ComPtr wrappedResource = Microsoft::WRL::Make(m_allocator->DecodeDataHandle(data)->GetResourceInUavState()); + ComPtr wrappedResource = Microsoft::WRL::Make(m_allocator->DecodeDataHandle(data)->GetUavResource()); *abiData = wrappedResource.Detach(); #else - ComPtr resource = m_allocator->DecodeDataHandle(data)->GetResourceInUavState(); + ComPtr resource = m_allocator->DecodeDataHandle(data)->GetUavResource(); *abiData = resource.Detach(); #endif } @@ -1026,7 +995,7 @@ namespace Dml ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr) { Dml::BucketizedBufferAllocator* pAllocationInfo = static_cast(allocator); - return pAllocationInfo->DecodeDataHandle(ptr)->GetResourceInUavState(); + return pAllocationInfo->DecodeDataHandle(ptr)->GetUavResource(); } void FlushContext(onnxruntime::IExecutionProvider* provider) From 92f51a33835b1f2654f2f8294b9979ef2a39c28c Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 23 Jan 2023 10:32:14 -0800 Subject: [PATCH 07/76] Remove sub allocator --- .../src/BucketizedBufferAllocator.cpp | 4 ++-- .../src/BucketizedBufferAllocator.h | 6 +++--- .../DmlExecutionProvider/src/DmlHeapAllocator.h | 8 ++++---- .../DmlExecutionProvider/src/DmlSubAllocator.h | 17 ----------------- 4 files changed, 9 insertions(+), 26 deletions(-) delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 18c747079f183..79a195529679d 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -6,7 +6,7 @@ #include "core/session/onnxruntime_c_api.h" #include "BucketizedBufferAllocator.h" -#include "DmlSubAllocator.h" +#include "DmlHeapAllocator.h" // #define PRINT_OUTSTANDING_ALLOCATIONS namespace Dml @@ -37,7 +37,7 @@ namespace Dml BucketizedBufferAllocator::BucketizedBufferAllocator( ID3D12Device* device, std::shared_ptr context, - std::unique_ptr&& subAllocator + std::unique_ptr&& subAllocator ) : onnxruntime::IAllocator( OrtMemoryInfo( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 75025a4af0f8b..254631652cc47 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -9,7 +9,7 @@ namespace Dml { - class DmlSubAllocator; + class D3D12HeapAllocator; class CPUAllocator : public onnxruntime::IAllocator { @@ -120,7 +120,7 @@ namespace Dml BucketizedBufferAllocator( ID3D12Device* device, std::shared_ptr context, - std::unique_ptr&& subAllocator); + std::unique_ptr&& subAllocator); // Returns the information associated with an opaque allocation handle returned by IAllocator::Alloc. 
const AllocationInfo* DecodeDataHandle(const void* opaqueHandle); @@ -168,7 +168,7 @@ namespace Dml uint64_t m_currentResourceId = 0; AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled; std::shared_ptr m_context; - std::unique_ptr m_subAllocator; + std::unique_ptr m_subAllocator; #if _DEBUG // Useful for debugging; keeps track of all allocations that haven't been freed yet diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h index ad86107d6b05c..6e13ad71f5877 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h @@ -3,7 +3,7 @@ #pragma once -#include "DmlSubAllocator.h" +#include "DmlResourceWrapper.h" namespace Dml { @@ -53,7 +53,7 @@ struct Allocation // this case it is better make more but smaller allocations (resulting in // smaller heaps); this fallback path is only retained as a last resort for // older hardware. -class D3D12HeapAllocator : public DmlSubAllocator +class D3D12HeapAllocator { public: // Maximum size of a heap (in tiles) when allocations are tiled. 
Each tile @@ -69,8 +69,8 @@ class D3D12HeapAllocator : public DmlSubAllocator D3D12_RESOURCE_FLAGS resource_flags, D3D12_RESOURCE_STATES initial_state); - Microsoft::WRL::ComPtr Alloc(size_t size_in_bytes) final; - uint64_t ComputeRequiredSize(size_t size) final; + Microsoft::WRL::ComPtr Alloc(size_t size_in_bytes); + uint64_t ComputeRequiredSize(size_t size); bool TilingEnabled() const { return tiling_enabled_; }; private: diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h deleted file mode 100644 index 033fb15388066..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -namespace Dml -{ - struct DmlResourceWrapper; - - class DmlSubAllocator - { - public: - virtual Microsoft::WRL::ComPtr Alloc(size_t size) = 0; - virtual uint64_t ComputeRequiredSize(size_t size) = 0; - virtual ~DmlSubAllocator(){} - }; -} From c0cbcaeb687100a2e5008233ea45416a0e8ae358 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 24 Jan 2023 00:00:36 -0800 Subject: [PATCH 08/76] WIP --- .../inc/DmlExecutionProvider.h | 6 +- .../inc/IWinmlExecutionProvider.h | 25 +- .../src/BucketizedBufferAllocator.cpp | 452 +++++++++++++----- .../src/BucketizedBufferAllocator.h | 213 ++++----- .../src/DmlAllocationInfo.cpp | 19 + .../src/DmlAllocationInfo.h | 80 ++++ .../src/DmlBfcAllocator.h | 29 ++ .../src/DmlBufferRegion.cc | 120 +++++ .../src/DmlBufferRegion.h | 79 +++ .../src/DmlCommandRecorder.cpp | 38 +- .../src/DmlCommandRecorder.h | 16 +- .../src/DmlCpuAllocator.cpp | 38 ++ .../src/DmlCpuAllocator.h | 20 + .../src/DmlGpuAllocator.h | 39 ++ .../src/DmlGraphFusionHelper.cpp | 6 +- .../src/DmlHeapAllocation.h | 29 ++ .../src/DmlHeapAllocator.cpp | 246 ---------- .../src/DmlHeapAllocator.h | 96 ---- 
.../src/DmlManagedBufferRegion.h | 26 + .../src/DmlReservedResourceWrapper.h | 13 +- .../src/DmlTaggedPointer.cpp | 41 ++ .../src/DmlTaggedPointer.h | 43 ++ .../src/ExecutionContext.cpp | 5 +- .../src/ExecutionContext.h | 13 +- .../src/ExecutionProvider.cpp | 184 +++---- .../src/ExecutionProvider.h | 33 +- .../src/FusedGraphKernel.cpp | 22 +- .../src/IExecutionProvider.h | 8 +- .../src/MLOperatorAuthorImpl.cpp | 22 +- .../src/Operators/DmlOperator.cpp | 31 +- .../DmlExecutionProvider/src/ReadbackHeap.cpp | 3 +- .../DmlExecutionProvider/src/ReadbackHeap.h | 1 + .../MLOperatorAuthorHelper.h | 4 - 33 files changed, 1222 insertions(+), 778 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index 9dfbd0e7ea0e0..fe07ccf08899e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -31,12 +31,12 @@ namespace Dml bool enableMetacommands = true); ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr); - void FlushContext(onnxruntime::IExecutionProvider* provider); + void FlushContext(onnxruntime::IExecutionProvider* provider); void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode); void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider); - + onnxruntime::common::Status CopyTensor( - onnxruntime::IExecutionProvider* provider, + onnxruntime::IExecutionProvider* provider, const onnxruntime::Tensor& src, onnxruntime::Tensor& dst ); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index 501a66bdfa711..52f5a104b0379 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -22,6 +22,11 @@ namespace onnxruntime class Node; } +namespace Dml +{ + class DmlManagedBufferRegion; +} + namespace Windows::AI::MachineLearning::Adapter { interface __declspec(uuid("5b19a18a-5ed5-4df2-a363-21b89380a698")) @@ -29,7 +34,7 @@ namespace Windows::AI::MachineLearning::Adapter { public: // Hold a reference to an object until preceding work in the queue is complete. 
This - // only needs to be handled by providers which hide the asynchronous nature of + // only needs to be handled by providers which hide the asynchronous nature of // computation, and involve resoures which cannot be automatically by work in the // the provider's underlying queues. virtual void QueueReference(IUnknown *object) = 0; @@ -40,12 +45,16 @@ namespace Windows::AI::MachineLearning::Adapter IUnknown** dataCopy) const = 0; virtual void GetABIDataInterface( - bool isInternalOperator, - IUnknown* data, + void* data, IUnknown** abiData) const = 0; - + + virtual void GetManagedBufferRegion( + void* data, + uint64_t size, + Dml::DmlManagedBufferRegion** abiData) const = 0; + virtual uint64_t TryGetPooledAllocationId( - IUnknown* data, + void* data, bool isInternalOperator) = 0; virtual void GetABIExecutionInterfaceAndInvalidateState( @@ -63,7 +72,7 @@ namespace Windows::AI::MachineLearning::Adapter uint32_t resourceCount, IUnknown** resources) = 0; - // Waits for flushed work, discards unflushed work, and discards associated references to + // Waits for flushed work, discards unflushed work, and discards associated references to // prevent circular references. Must be the last call on the object before destruction. 
virtual void Close() = 0; }; @@ -89,7 +98,7 @@ namespace Windows::AI::MachineLearning::Adapter }; using GraphNodeFactory = std::function>; -} \ No newline at end of file +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 79a195529679d..df12c1567d5be 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -4,21 +4,13 @@ #include "precomp.h" #include "core/session/onnxruntime_c_api.h" - #include "BucketizedBufferAllocator.h" -#include "DmlHeapAllocator.h" -// #define PRINT_OUTSTANDING_ALLOCATIONS +#include "DmlReservedResourceWrapper.h" +#include "DmlBufferRegion.h" +#include "DmlManagedBufferRegion.h" namespace Dml { - AllocationInfo::~AllocationInfo() - { - if (m_owner) - { - m_owner->FreeResource(this, m_pooledResourceId); - } - } - BucketizedBufferAllocator::~BucketizedBufferAllocator() { #ifdef PRINT_OUTSTANDING_ALLOCATIONS @@ -34,24 +26,6 @@ namespace Dml #endif } - BucketizedBufferAllocator::BucketizedBufferAllocator( - ID3D12Device* device, - std::shared_ptr context, - std::unique_ptr&& subAllocator - ) - : onnxruntime::IAllocator( - OrtMemoryInfo( - "DML", - OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0) - ) - ), - m_device(device), - m_context(context), - m_subAllocator(std::move(subAllocator)) - { - } - /*static*/ gsl::index BucketizedBufferAllocator::GetBucketIndexFromSize(uint64_t size) { assert(size != 0); @@ -72,88 +46,287 @@ namespace Dml return (1ull << (index + c_minResourceSizeExponent)); } - void* BucketizedBufferAllocator::Alloc(size_t size) + void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { - return Alloc(size, m_defaultRoundingMode); + m_defaultRoundingMode = roundingMode; 
} - void* BucketizedBufferAllocator::Alloc(size_t size, AllocatorRoundingMode roundingMode) + static bool GetTilingEnabled(ID3D12Device* device) { - // For some reason lotus likes requesting 0 bytes of memory - size = std::max(1, size); + D3D12_FEATURE_DATA_D3D12_OPTIONS options = {}; + if (SUCCEEDED(device->CheckFeatureSupport( + D3D12_FEATURE_D3D12_OPTIONS, + &options, + sizeof(options)))) + { + return options.TiledResourcesTier >= D3D12_TILED_RESOURCES_TIER_1; + } - ComPtr resourceWrapper; - uint64_t resourceId = 0; + return false; + } - // Find the bucket for this allocation size - gsl::index bucketIndex = GetBucketIndexFromSize(size); + static uint64_t GetMaxHeapSizeInTiles() + { + return BucketizedBufferAllocator::kDefaultMaxHeapSizeInTiles; + } - // Some sub allocators have their own rounding mechanisms or alignment requirements of resources - uint64_t bucketSize = m_subAllocator->ComputeRequiredSize(GetBucketSizeFromIndex(bucketIndex)); + BucketizedBufferAllocator::BucketizedBufferAllocator( + ID3D12Device* device, + ID3D12CommandQueue* queue, + const D3D12_HEAP_PROPERTIES& heap_props, + D3D12_HEAP_FLAGS heap_flags, + D3D12_RESOURCE_FLAGS resource_flags, + D3D12_RESOURCE_STATES initial_state) + : device_(device), + queue_(queue), + heap_properties_(heap_props), + heap_flags_(heap_flags), + resource_flags_(resource_flags), + initial_state_(initial_state), + tiling_enabled_(GetTilingEnabled(device)), + max_heap_size_in_tiles_(GetMaxHeapSizeInTiles()) + { + } - // Use a pooled resource if the size (post rounding, if requested) matches a bucket size - if (m_defaultRoundingMode == AllocatorRoundingMode::Enabled || size == bucketSize) + absl::optional BucketizedBufferAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes) + { + DmlHeapAllocation allocation = {}; + + // The allocation may be larger than the requested size to ensure a whole + // number of tiles. 
+ const uint64_t resource_size_in_tiles = + 1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + const uint64_t resource_size_in_bytes = + resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + auto resource_desc = + CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_); + + ID3D12Resource** resources[] = { + &allocation.resource_uav_state, + &allocation.resource_copy_src_state, + &allocation.resource_copy_dst_state}; + + D3D12_RESOURCE_STATES states[] = { + initial_state_, + D3D12_RESOURCE_STATE_COPY_SOURCE, + D3D12_RESOURCE_STATE_COPY_DEST}; + + for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++) { - Bucket* bucket = nullptr; + HRESULT create_resource_hr = device_->CreateReservedResource( + &resource_desc, + states[i], + nullptr, + IID_PPV_ARGS(resources[i])); - if (gsl::narrow_cast(m_pool.size()) <= bucketIndex) + if (create_resource_hr == E_OUTOFMEMORY) { - // Ensure there are sufficient buckets - m_pool.resize(bucketIndex + 1); + return absl::nullopt; } + ORT_THROW_IF_FAILED(create_resource_hr); + } + + // Reserve enough heaps to store all tiles in the resource. + const uint64_t heap_count = + 1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_; + allocation.heaps.resize(heap_count); - bucket = &m_pool[bucketIndex]; + // Create heaps and map them to the primary reserved resource. + D3D12_TILED_RESOURCE_COORDINATE resource_region_start_coordinates = {}; + uint64_t unmapped_resource_tiles = resource_size_in_tiles; + for (uint64_t i = 0; i < heap_count; i++) + { + // Create heap. The last heap of the allocation may have fewer tiles to + // avoid wasting space. 
+ uint64_t heap_size_in_tiles = std::min( + unmapped_resource_tiles, + max_heap_size_in_tiles_); + uint64_t heap_size_in_bytes = + heap_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + auto heap_desc = CD3DX12_HEAP_DESC( + heap_size_in_bytes, + heap_properties_, + 0, + heap_flags_); - if (bucket->resources.empty()) + HRESULT create_heap_hr = + device_->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i])); + if (create_heap_hr == E_OUTOFMEMORY) { - // No more resources in this bucket - allocate a new one - resourceWrapper = m_subAllocator->Alloc(bucketSize); - resourceId = ++m_currentResourceId; + return absl::nullopt; } - else + ORT_THROW_IF_FAILED(create_heap_hr); + + // Source region in the resource to map. + D3D12_TILE_REGION_SIZE resource_region_size = {}; + resource_region_size.NumTiles = static_cast(heap_size_in_tiles); + + // Target range in the current heap to map. + const D3D12_TILE_RANGE_FLAGS tile_range_flags = + D3D12_TILE_RANGE_FLAG_NONE; + const uint32_t heap_range_start_offset = 0; + const uint32_t heap_range_tile_count = static_cast(heap_size_in_tiles); + + constexpr uint32_t numResourceRegions = 1; + constexpr uint32_t numHeapRanges = 1; + + // This is a brand new allocation/resource, so the tile mappings are + // guaranteed to be set (on the GPU timeline) by the time any code can + // reference the returned resource. We only execute operations on a + // single hardware queue so there is no need to wait or signal. + // + // All resources have identical tile mappings. The repeated call to + // UpdateTileMappings on all resources instead of using CopyTileMappings + // is intentional: the latter API is not supported by all versions of + // PIX. 
+ for (auto resource : + {allocation.resource_uav_state.Get(), + allocation.resource_copy_src_state.Get(), + allocation.resource_copy_dst_state.Get()}) { - // Retrieve a resource from the bucket - resourceWrapper = std::move(bucket->resources.back().resource); - resourceId = bucket->resources.back().resourceId; - bucket->resources.pop_back(); + queue_->UpdateTileMappings( + resource, + numResourceRegions, + &resource_region_start_coordinates, + &resource_region_size, + allocation.heaps[i].Get(), + numHeapRanges, + &tile_range_flags, + &heap_range_start_offset, + &heap_range_tile_count, + D3D12_TILE_MAPPING_FLAG_NONE); } + + resource_region_start_coordinates.X += static_cast(heap_size_in_tiles); + unmapped_resource_tiles -= heap_size_in_tiles; } - else + + assert(unmapped_resource_tiles == 0); + + return allocation; + } + + absl::optional BucketizedBufferAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes) + { + DmlHeapAllocation allocation = {}; + + // Create the allocation's sole heap. The allocation may be larger than the + // requested size to ensure a whole number of tiles. + allocation.heaps.resize(1); + D3D12_HEAP_DESC heap_desc = + CD3DX12_HEAP_DESC(size_in_bytes, heap_properties_, 0, heap_flags_); + HRESULT create_heap_hr = device_->CreateHeap( + &heap_desc, + IID_PPV_ARGS(&allocation.heaps.front())); + if (create_heap_hr == E_OUTOFMEMORY) { - // The allocation will not be pooled. Construct a new one - bucketSize = m_subAllocator->ComputeRequiredSize(size); - resourceWrapper = m_subAllocator->Alloc(bucketSize); - resourceId = ++m_currentResourceId; + return absl::nullopt; } - assert(resourceWrapper != nullptr); - assert(resourceWrapper->GetUavResource()->GetDesc().Width == bucketSize); + // Create large placed resource that spans the heap. 
+ D3D12_RESOURCE_DESC resource_desc = + CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_); + + ID3D12Resource** resources[] = { + &allocation.resource_uav_state, + &allocation.resource_copy_src_state, + &allocation.resource_copy_dst_state}; + D3D12_RESOURCE_STATES states[] = { + initial_state_, + D3D12_RESOURCE_STATE_COPY_SOURCE, + D3D12_RESOURCE_STATE_COPY_DEST}; + + for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++) + { + HRESULT create_resource_hr = device_->CreatePlacedResource( + allocation.heaps.front().Get(), + 0, + &resource_desc, + states[i], + nullptr, + IID_PPV_ARGS(resources[i])); + if (create_resource_hr == E_OUTOFMEMORY) + { + return absl::nullopt; + } + ORT_THROW_IF_FAILED(create_resource_hr); + } + + return allocation; + } + + uint64_t BucketizedBufferAllocator::ComputeRequiredSize(size_t size) + { + const uint64_t resource_size_in_tiles = + 1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + const uint64_t resource_size_in_bytes = + resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + + return resource_size_in_bytes; + } + + void* BucketizedBufferAllocator::Alloc(size_t size_in_bytes) + { + // For some reason lotus likes requesting 0 bytes of memory + size_in_bytes = std::max(1, size_in_bytes); + + // The D3D12 device is thread-safe so we don't need to hold the lock while + // creating an allocation. + absl::optional allocation = + tiling_enabled_ ? 
TryCreateTiledAllocation(size_in_bytes) + : TryCreateUntiledAllocation(size_in_bytes); + + ORT_THROW_HR_IF(E_INVALIDARG, !allocation); + // We need to access (mutable) state after this point, so we need to lock + std::unique_lock lock(mutex_); + + absl::optional allocationId = TryReserveAllocationID(); + ORT_THROW_HR_IF(E_INVALIDARG, !allocationId); + + auto resourceWrapper = wil::MakeOrThrow(std::move(*allocation)); ComPtr allocInfo = wil::MakeOrThrow( this, ++m_currentAllocationId, - resourceId, + ++m_currentResourceId, resourceWrapper.Get(), - size + size_in_bytes ); + allocations_by_id_.emplace(*allocationId, allocInfo); + + lock.unlock(); + #if _DEBUG m_outstandingAllocationsById[allocInfo->GetId()] = allocInfo.Get(); #endif - return allocInfo.Detach(); + // DML only has a single device in ORT at the moment + const uint64_t device_id = 0; + const uint64_t offset = 0; + return TaggedPointer::Pack(device_id, *allocationId, offset); } - void BucketizedBufferAllocator::Free(void* p) + void BucketizedBufferAllocator::Free(void* ptr) { - // Release Lotus's reference on the allocation. 
The allocation - // also inherits IUnknown, and once its final reference reaches zero - // it will call FreeResource - ComPtr allocInfo; - allocInfo.Attach(static_cast(p)); + ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr); + + TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); + ORT_THROW_HR_IF(E_INVALIDARG, tagged_ptr.offset != 0); + + // We need to access (mutable) state after this point, so we need to lock + std::unique_lock lock(mutex_); + + auto it = allocations_by_id_.find(tagged_ptr.allocation_id); + ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end()); + + ReleaseAllocationID(tagged_ptr.allocation_id); + + // Frees the ID3D12Heap + allocations_by_id_.erase(it); } - void BucketizedBufferAllocator::FreeResource(void* p, uint64_t pooledResourceId) + void BucketizedBufferAllocator::FreeResource(void* p, uint64_t pooledResourceId) { AllocationInfo *allocInfo = static_cast(p); @@ -165,31 +338,12 @@ namespace Dml ORT_THROW_HR(E_INVALIDARG); } - // Free the resource to the pool if its size matches a bucket size - gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize()); - if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetUavResource()->GetDesc().Width) - { - if (gsl::narrow_cast(m_pool.size()) <= bucketIndex) - { - // Ensure there are sufficient buckets - m_pool.resize(bucketIndex + 1); - } - - // Return the resource to the bucket - Bucket* bucket = &m_pool[bucketIndex]; - - Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId}; - bucket->resources.push_back(resource); - } - else - { - // Free the underlying allocation once queued work has completed. + // Free the underlying allocation once queued work has completed. 
#ifdef _GAMING_XBOX - m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->DetachResourceWrapper().Get()).Get()); + m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->DetachResourceWrapper().Get()).Get()); #else - m_context->QueueReference(allocInfo->DetachResourceWrapper().Get()); + m_context->QueueReference(allocInfo->DetachResourceWrapper().Get()); #endif - } #if _DEBUG assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo); @@ -199,58 +353,100 @@ namespace Dml // The allocation info is already destructing at this point } - - const AllocationInfo* BucketizedBufferAllocator::DecodeDataHandle(const void* opaqueHandle) + absl::optional BucketizedBufferAllocator::TryReserveAllocationID() { - if (opaqueHandle == nullptr) + // The mutex must already be held + assert(!mutex_.try_lock()); + + if (!free_allocation_ids_.empty()) { - // There is no memory allocated which needs to be decoded. - ORT_THROW_HR(E_INVALIDARG); + // Return a free ID from the pool + uint32_t id = free_allocation_ids_.back(); + free_allocation_ids_.pop_back(); + return id; } - const auto* allocInfo = static_cast(opaqueHandle); - auto owner = allocInfo->GetOwner(); - //The owner can be null if the resource was wrapped via CreateGPUAllocationFromD3DResource - if (owner != nullptr && owner != this) + static constexpr uint32_t kMaxAllocationID = + (1 << TaggedPointer::kAllocationIDBits) - 1; + if (current_allocation_id_ == kMaxAllocationID) { - // This allocation doesn't belong to this allocator! - ORT_THROW_HR(E_INVALIDARG); + // We've reached the maximum number of allocations! 
+ return absl::nullopt; } - return allocInfo; + ++current_allocation_id_; + return current_allocation_id_; } - void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) + void BucketizedBufferAllocator::ReleaseAllocationID(uint32_t id) { - m_defaultRoundingMode = roundingMode; + // The mutex must already be held + assert(!mutex_.try_lock()); + + // Add it to the pool of free IDs + free_allocation_ids_.push_back(id); } - CPUAllocator::CPUAllocator(OrtMemType memType) - : onnxruntime::IAllocator( - OrtMemoryInfo( - "DML CPU", - OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0), - 0, - memType - ) - ) + D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion( + const void* ptr, + uint64_t size_in_bytes) { + ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr); + + TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); + + // We need to access (mutable) state after this point, so we need to lock + std::unique_lock lock(mutex_); + + // Find the allocation corresponding to this pointer + auto it = allocations_by_id_.find(tagged_ptr.allocation_id); + ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end()); + + return D3D12BufferRegion( + tagged_ptr.offset, + size_in_bytes, + it->second->GetUavResource(), + it->second->GetCopySrcResource(), + it->second->GetCopyDstResource()); } - void* CPUAllocator::Alloc(size_t size) + ComPtr BucketizedBufferAllocator::CreateManagedBufferRegion( + const void* ptr, + uint64_t size_in_bytes) { - if (size <= 0) - { - return nullptr; - } - void* p = malloc(size); - return p; + ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr); + + TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); + + // We need to access (mutable) state after this point, so we need to lock + std::unique_lock lock(mutex_); + + // Find the allocation corresponding to this pointer + auto it = allocations_by_id_.find(tagged_ptr.allocation_id); + ORT_THROW_HR_IF(E_INVALIDARG, it == 
allocations_by_id_.end()); + + D3D12BufferRegion bufferRegion( + tagged_ptr.offset, + size_in_bytes, + it->second->GetUavResource(), + it->second->GetCopySrcResource(), + it->second->GetCopyDstResource()); + + return wil::MakeOrThrow(it->second, std::move(bufferRegion)); } - void CPUAllocator::Free(void* p) + AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(const void* ptr) { - free(p); + ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr); + + TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); + + // We need to access (mutable) state after this point, so we need to lock + std::unique_lock lock(mutex_); + + // Find the allocation corresponding to this pointer + auto it = allocations_by_id_.find(tagged_ptr.allocation_id); + return it->second.Get(); } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 254631652cc47..f21d174500fcb 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -3,116 +3,76 @@ #pragma once -#include "core/framework/allocator.h" #include "ExecutionContext.h" -#include "DmlResourceWrapper.h" +#include "DmlAllocationInfo.h" +#include "DmlBufferRegion.h" namespace Dml { - class D3D12HeapAllocator; - - class CPUAllocator : public onnxruntime::IAllocator - { - public: - explicit CPUAllocator(OrtMemType memType); - - void* Alloc(size_t size) override; - void Free(void* p) override; - }; - + class BucketizedBufferAllocator; class BucketizedBufferAllocator; - class AllocationInfo : public Microsoft::WRL::RuntimeClass< - Microsoft::WRL::RuntimeClassFlags, IUnknown> + // An allocator that makes logically contiguous allocations backed by D3D heaps. + // + // Heaps must fit entirely in either local or non-local memory. 
Larger heaps + // have a greater chance of getting demoted into non-local memory, which can be + // disastrous for performance. This problem is compounded by the fact that heaps + // may be demoted even if overall local memory usage is within the process' + // budget. Heaps are not necessarily mappable to discontiguous regions of + // physical memory, which means physical memory fragmentation *may* make it + // extremely difficult to accommodate larger heaps. + // + // On D3D hardware that supports tiled resource tier 1+, this class implements + // large allocations through tiling. Each allocation is backed by however many + // small heaps are necessary to cover the requested allocation size. Buffer + // regions retrieved through this allocator are reserved resources that span the + // full collection of heaps assigned to an individual allocation. Tile mappings + // are static. + // + // On hardware that doesn't support tiled resources, each allocation is backed by + // a single heap. Buffer regions retrieved through this allocator are placed + // resources that span the full heap assigned to an individual allocation. In + // this case it is better to make more but smaller allocations (resulting in + // smaller heaps); this fallback path is only retained as a last resort for + // older hardware.
+ class BucketizedBufferAllocator { public: - AllocationInfo( - BucketizedBufferAllocator* owner, - size_t id, - uint64_t pooledResourceId, - DmlResourceWrapper* resourceWrapper, - size_t requestedSize) - : m_owner(owner) - , m_allocationId(id) - , m_pooledResourceId(pooledResourceId) - , m_resourceWrapper(resourceWrapper) - , m_requestedSize(requestedSize) - {} - - ~AllocationInfo(); - - BucketizedBufferAllocator* GetOwner() const - { - return m_owner; - } - - ID3D12Resource* GetUavResource() const - { - return m_resourceWrapper->GetUavResource(); - } - - ID3D12Resource* GetCopySrcResource() const - { - return m_resourceWrapper->GetCopySrcResource(); - } - - ID3D12Resource* GetCopyDstResource() const - { - return m_resourceWrapper->GetCopyDstResource(); - } - - D3D12_RESOURCE_STATES GetDefaultUavState() const - { - return m_resourceWrapper->GetDefaultUavState(); - } - - D3D12_RESOURCE_STATES GetDefaultCopySrcState() const - { - return m_resourceWrapper->GetDefaultCopySrcState(); - } - - D3D12_RESOURCE_STATES GetDefaultCopyDstState() const - { - return m_resourceWrapper->GetDefaultCopyDstState(); - } - - ComPtr DetachResourceWrapper() const - { - return std::move(m_resourceWrapper); - } - - size_t GetRequestedSize() const - { - return m_requestedSize; - } + // Maximum size of a heap (in tiles) when allocations are tiled. Each tile + // is 64KB. A default size of 512 tiles (32MB) does a good job of handling + // local video memory fragmentation without requiring lots of heaps. 
+ static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512; - size_t GetId() const - { - return m_allocationId; - } - - uint64_t GetPooledResourceId() const - { - return m_pooledResourceId; - } - - private: - BucketizedBufferAllocator* m_owner; - size_t m_allocationId; // For debugging purposes - uint64_t m_pooledResourceId = 0; - ComPtr m_resourceWrapper; - - // The size requested during Alloc(), which may be smaller than the physical resource size - size_t m_requestedSize; - }; + BucketizedBufferAllocator( + ID3D12Device* device, + ID3D12CommandQueue* queue, + const D3D12_HEAP_PROPERTIES& heap_props, + D3D12_HEAP_FLAGS heap_flags, + D3D12_RESOURCE_FLAGS resource_flags, + D3D12_RESOURCE_STATES initial_state); + + // Creates a reserved or placed resource buffer over the given memory range. + // The physical D3D12 resource may be larger than the requested size, so + // callers must ensure to use the offset/size returned in the + // D3D12BufferRegion else risk out of bounds access. Note that in practice + // the ID3D12Resource is cached, so this call typically has a lower cost + // than a call to ID3D12Device::CreatePlacedResource or + // CreateReservedResource. + D3D12BufferRegion CreateBufferRegion( + const void* ptr, + uint64_t size_in_bytes); + + ComPtr CreateManagedBufferRegion( + const void* ptr, + uint64_t size_in_bytes); + + AllocationInfo* GetAllocationInfo(const void* ptr); + + void* Alloc(size_t size_in_bytes); + void Free(void* ptr); + uint64_t ComputeRequiredSize(size_t size); + bool TilingEnabled() const { return tiling_enabled_; }; - // Implements a Lotus allocator for D3D12 heap buffers, using a bucket allocation strategy. The allocator - // maintains a set of fixed-size buckets, with each bucket containing one or more D3D12 buffers of that fixed size. 
- // All requested allocation sizes are rounded up to the nearest bucket size, which ensures minimal fragmentation - // while providing an upper bound on the amount of memory "wasted" with each allocation. - class BucketizedBufferAllocator : public onnxruntime::IAllocator - { - public: ~BucketizedBufferAllocator(); // Constructs a BucketizedBufferAllocator which allocates D3D12 committed resources with the specified heap properties, @@ -120,18 +80,10 @@ namespace Dml BucketizedBufferAllocator( ID3D12Device* device, std::shared_ptr context, - std::unique_ptr&& subAllocator); - - // Returns the information associated with an opaque allocation handle returned by IAllocator::Alloc. - const AllocationInfo* DecodeDataHandle(const void* opaqueHandle); + std::unique_ptr&& subAllocator); void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode); - public: // onnxruntime::IAllocator - void* Alloc(size_t size, AllocatorRoundingMode roundingMode); - void* Alloc(size_t size) final; - void Free(void* p) final; - private: static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB @@ -152,12 +104,6 @@ namespace Dml static gsl::index GetBucketIndexFromSize(uint64_t size); static uint64_t GetBucketSizeFromIndex(gsl::index index); - AllocationInfo* DecodeDataHandleInternal(void* opaqueHandle) - { - // Implement in terms of const version - return const_cast(DecodeDataHandle(static_cast(opaqueHandle))); - } - friend class AllocationInfo; void FreeResource(void* p, uint64_t resourceId); @@ -168,12 +114,47 @@ namespace Dml uint64_t m_currentResourceId = 0; AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled; std::shared_ptr m_context; - std::unique_ptr m_subAllocator; + std::unique_ptr m_subAllocator; #if _DEBUG // Useful for debugging; keeps track of all allocations that haven't been freed yet std::map m_outstandingAllocationsById; #endif + + std::mutex mutex_; + + Microsoft::WRL::ComPtr device_; + Microsoft::WRL::ComPtr queue_; + const 
D3D12_HEAP_PROPERTIES heap_properties_; + const D3D12_HEAP_FLAGS heap_flags_; + const D3D12_RESOURCE_FLAGS resource_flags_; + const D3D12_RESOURCE_STATES initial_state_; + bool tiling_enabled_; + uint64_t max_heap_size_in_tiles_; + + // The largest allocation ID we've returned so far (or 0 if we've never done + // so). Note that our allocation IDs start at 1 (not 0) to ensure that it + // isn't possible for a valid allocation to have a pointer value of + // 0x00000000. + uint32_t current_allocation_id_ = 0; + + // A list of unused allocation IDs. This is for re-use of IDs once they get + // freed. We only bump the max_allocation_id_ once there are no more free + // IDs. + std::vector free_allocation_ids_; + + absl::optional TryCreateTiledAllocation(uint64_t size_in_bytes); + absl::optional TryCreateUntiledAllocation(uint64_t size_in_bytes); + + friend class D3D12BufferRegion; + + absl::flat_hash_map> allocations_by_id_; + + // Retrieves a free allocation ID, or nullopt if no more IDs are available. + absl::optional TryReserveAllocationID(); + + // Releases an allocation ID back to the pool of IDs. + void ReleaseAllocationID(uint32_t id); }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp new file mode 100644 index 0000000000000..044e9e854d700 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp @@ -0,0 +1,19 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "precomp.h" +#include "DmlAllocationInfo.h" +#include "BucketizedBufferAllocator.h" + +namespace Dml +{ + + AllocationInfo::~AllocationInfo() + { + if (m_owner) + { + m_owner->FreeResource(this, m_pooledResourceId); + } + } + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h new file mode 100644 index 0000000000000..977de7c4887e2 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "DmlReservedResourceWrapper.h" + +namespace Dml +{ + class BucketizedBufferAllocator; + + class AllocationInfo : public Microsoft::WRL::RuntimeClass< + Microsoft::WRL::RuntimeClassFlags, IUnknown> + { + public: + AllocationInfo( + BucketizedBufferAllocator* owner, + size_t id, + uint64_t pooledResourceId, + DmlResourceWrapper* resourceWrapper, + size_t requestedSize) + : m_owner(owner) + , m_allocationId(id) + , m_pooledResourceId(pooledResourceId) + , m_resourceWrapper(resourceWrapper) + , m_requestedSize(requestedSize) + {} + + ~AllocationInfo(); + + BucketizedBufferAllocator* GetOwner() const + { + return m_owner; + } + + ID3D12Resource* GetUavResource() const + { + return m_resourceWrapper->GetUavResource(); + } + + ID3D12Resource* GetCopySrcResource() const + { + return m_resourceWrapper->GetCopySrcResource(); + } + + ID3D12Resource* GetCopyDstResource() const + { + return m_resourceWrapper->GetCopyDstResource(); + } + + ComPtr DetachResourceWrapper() const + { + return std::move(m_resourceWrapper); + } + + size_t GetRequestedSize() const + { + return m_requestedSize; + } + + size_t GetId() const + { + return m_allocationId; + } + + uint64_t GetPooledResourceId() const + { + return m_pooledResourceId; + } + + private: + BucketizedBufferAllocator* 
m_owner; + size_t m_allocationId; // For debugging purposes + uint64_t m_pooledResourceId = 0; + Microsoft::WRL::ComPtr m_resourceWrapper; + + // The size requested during Alloc(), which may be smaller than the physical resource size + size_t m_requestedSize; + }; +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h new file mode 100644 index 0000000000000..458a65e63c0c4 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/framework/allocator.h" +#include "BucketizedBufferAllocator.h" + +namespace Dml +{ + class DmlBfcAllocator : public onnxruntime::IAllocator + { + public: + DmlBfcAllocator(BucketizedBufferAllocator* subAllocator) + : onnxruntime::IAllocator( + OrtMemoryInfo( + "DML", + OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0) + ) + ), + m_subAllocator(subAllocator) {} + + void* Alloc(size_t size_in_bytes) { return m_subAllocator->Alloc(size_in_bytes); } + void Free(void* ptr) { m_subAllocator->Free(ptr); } + private: + BucketizedBufferAllocator* m_subAllocator; + }; +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc new file mode 100644 index 0000000000000..3240042b5b6a6 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc @@ -0,0 +1,120 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "precomp.h" +#include "DmlBufferRegion.h" + +namespace Dml +{ + + D3D12BufferRegion::D3D12BufferRegion( + uint64_t offset, + uint64_t size_in_bytes, + ID3D12Resource* resource_uav_state, + ID3D12Resource* resource_copy_src_state, + ID3D12Resource* resource_copy_dst_state) + : resource_uav_state_(resource_uav_state), + resource_copy_src_state_(resource_copy_src_state), + resource_copy_dst_state_(resource_copy_dst_state), + offset_(offset), + size_in_bytes_(size_in_bytes) + { + // Get a raw pointer to the first non-null resource passed in. At least one + // resource must be provided. + first_valid_resource_ = resource_uav_state_; + if (!first_valid_resource_) + { + first_valid_resource_ = resource_copy_src_state_; + } + if (!first_valid_resource_) + { + first_valid_resource_ = resource_copy_dst_state_; + } + ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr); + + // Regions cannot be empty. + ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0); + + // Regions cannot extend beyond the size of the resource. + uint64_t buffer_size = first_valid_resource_->GetDesc().Width; + ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size); + ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset); + + // All three resources, if provided, must be identical aside from state. 
+ assert( + first_valid_resource_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER); + assert( + !resource_uav_state || + (resource_uav_state->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_uav_state->GetDesc().Width == buffer_size)); + assert( + !resource_copy_src_state_ || + (resource_copy_src_state_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_copy_src_state_->GetDesc().Width == buffer_size)); + assert( + !resource_copy_dst_state_ || + (resource_copy_dst_state_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_copy_dst_state_->GetDesc().Width == buffer_size)); + } + + D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) + { + std::swap(this->resource_uav_state_, that.resource_uav_state_); + std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); + std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); + std::swap(this->offset_, that.offset_); + std::swap(this->size_in_bytes_, that.size_in_bytes_); + std::swap(this->first_valid_resource_, that.first_valid_resource_); + } + + D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) + { + std::swap(this->resource_uav_state_, that.resource_uav_state_); + std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); + std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); + std::swap(this->offset_, that.offset_); + std::swap(this->size_in_bytes_, that.size_in_bytes_); + std::swap(this->first_valid_resource_, that.first_valid_resource_); + return *this; + } + + ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const + { + return resource_uav_state_; + } + + ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const + { + return resource_copy_src_state_; + } + + ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const + { + return resource_copy_dst_state_; + } + + uint64_t D3D12BufferRegion::Offset() const + { + return 
first_valid_resource_ ? offset_ : 0; + } + + uint64_t D3D12BufferRegion::SizeInBytes() const + { + return first_valid_resource_ ? size_in_bytes_ : 0; + } + + DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const + { + if (!resource_uav_state_) + { + return DML_BUFFER_BINDING{}; + } + + return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_}; + } + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h new file mode 100644 index 0000000000000..29a6bf6f7c775 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h @@ -0,0 +1,79 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +namespace Dml +{ + // Represents a region of a D3D12 buffer resource. A buffer region has an + // underlying ID3D12Resource* (of D3D12_RESOURCE_DIMENSION_BUFFER), an offset in + // bytes from the beginning of that buffer, and a size in bytes of the region. + class D3D12BufferRegion + { + public: + D3D12BufferRegion() = default; + + // References a region of a buffer. The respective ID3D12Resource objects + // must be in the appropriate states. Each resource is optional, but if more + // than one are provided they must map to the same region of memory. + D3D12BufferRegion( + uint64_t offset, + uint64_t size_in_bytes, + ID3D12Resource* resource_uav_state, + ID3D12Resource* resource_copy_src_state, + ID3D12Resource* resource_copy_dst_state); + + // Move-only + D3D12BufferRegion(const D3D12BufferRegion&) = delete; + D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete; + D3D12BufferRegion(D3D12BufferRegion&&); + D3D12BufferRegion& operator=(D3D12BufferRegion&&); + + ID3D12Resource* ResourceInUavState() const; + + // NOTE: may be any state that is valid as a copy source (COPY_SRC, + // GENERIC_READ, or COMMON). 
+ ID3D12Resource* ResourceInCopySrcState() const; + + ID3D12Resource* ResourceInCopyDstState() const; + + uint64_t Offset() const; + uint64_t SizeInBytes() const; + + DML_BUFFER_BINDING GetBufferBinding() const; + + explicit operator bool() const { return first_valid_resource_ != nullptr; } + + // Creates a subregion at an offset from the start of this region. If no + // size is provided the region runs to the end of the current region. + inline D3D12BufferRegion Subregion( + uint64_t offset, + uint64_t size_in_bytes = 0) const + { + // start of subregion must be within current region + ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_); + size_in_bytes = + size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; + // end of subregion must be within current region + ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset); + + return D3D12BufferRegion( + offset_ + offset, + size_in_bytes, + resource_uav_state_, + resource_copy_src_state_, + resource_copy_dst_state_); + } + + private: + ID3D12Resource* resource_uav_state_ = nullptr; + ID3D12Resource* resource_copy_src_state_ = nullptr; + ID3D12Resource* resource_copy_dst_state_ = nullptr; + uint64_t offset_ = 0; + uint64_t size_in_bytes_ = 0; + + // Pointer to the first resource above that isn't null. 
+ ID3D12Resource* first_valid_resource_ = nullptr; + }; + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index 7f2fdafbbeb60..22161a6a58cbf 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -5,6 +5,7 @@ #include "DmlCommandRecorder.h" #include "CommandQueue.h" #include "BucketizedBufferAllocator.h" +#include "absl/cleanup/cleanup.h" using namespace Dml; @@ -22,9 +23,14 @@ DmlCommandRecorder::DmlCommandRecorder( ORT_THROW_IF_FAILED(dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_recorder))); } -void DmlCommandRecorder::SetAllocator(std::weak_ptr allocator) +void DmlCommandRecorder::SetAllocator(std::weak_ptr allocator) { - m_bufferAllocator = allocator; + m_allocator = allocator; +} + +void DmlCommandRecorder::SetSubAllocator(std::weak_ptr subAllocator) +{ + m_subAllocator = subAllocator; } void DmlCommandRecorder::InitializeOperator( @@ -57,26 +63,25 @@ void DmlCommandRecorder::InitializeOperator( UINT64 temporaryResourceSize = initBindingProps.TemporaryResourceSize; if (temporaryResourceSize > 0) { - auto allocator = m_bufferAllocator.lock(); + auto allocator = m_allocator.lock(); // Allocate and immediately free a temporary buffer. The buffer resource will still be // alive (managed by the pool); freeing allows the resource to be shared with other operators. 
- void* tempResourceHandle = allocator->Alloc(static_cast(temporaryResourceSize), AllocatorRoundingMode::Enabled); - - - + void* tempResourceHandle = allocator->Alloc(static_cast(temporaryResourceSize)); if (!tempResourceHandle) { ORT_THROW_HR(E_OUTOFMEMORY); } + absl::Cleanup([allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); }); - ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetUavResource(); - allocator->Free(tempResourceHandle); + auto subAllocator = m_subAllocator.lock(); + auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize); // Bind the temporary resource. - DML_BUFFER_BINDING bufferBinding = { buffer, 0, temporaryResourceSize }; + DML_BUFFER_BINDING bufferBinding = bufferRegion.GetBufferBinding(); DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding }; bindingTable->BindTemporaryResource(&bindingDesc); + allocator->Free(tempResourceHandle); } // Bind inputs, if provided. @@ -138,21 +143,22 @@ void DmlCommandRecorder::ExecuteOperator( UINT64 temporaryResourceSize = execBindingProps.TemporaryResourceSize; if (temporaryResourceSize > 0) { - auto allocator = m_bufferAllocator.lock(); + auto allocator = m_allocator.lock(); // Allocate and immediately free a temporary buffer. The buffer resource will still be // alive (managed by the pool); freeing allows the resource to be shared with other operators. 
- void* tempResourceHandle = allocator->Alloc(static_cast(temporaryResourceSize), AllocatorRoundingMode::Enabled); + void* tempResourceHandle = allocator->Alloc(static_cast(temporaryResourceSize)); if (!tempResourceHandle) { ORT_THROW_HR(E_OUTOFMEMORY); } + absl::Cleanup([allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); }); - ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetUavResource(); - allocator->Free(tempResourceHandle); + auto subAllocator = m_subAllocator.lock(); + auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize); // Bind the temporary resource. - DML_BUFFER_BINDING bufferBinding = { buffer, 0, temporaryResourceSize }; + DML_BUFFER_BINDING bufferBinding = bufferRegion.GetBufferBinding(); DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding }; bindingTable->BindTemporaryResource(&bindingDesc); } @@ -196,6 +202,7 @@ void DmlCommandRecorder::CopyBufferRegion( void DmlCommandRecorder::FillBufferWithPattern( ID3D12Resource* dstBuffer, + uint64_t offset, gsl::span value /* Data type agnostic value, treated as raw bits */) { // The fill pattern for ClearUnorderedAccessViewUint is 16 bytes. 
@@ -226,6 +233,7 @@ void DmlCommandRecorder::FillBufferWithPattern( D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {}; uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER; uavDesc.Format = DXGI_FORMAT_R32_TYPELESS; + uavDesc.Buffer.FirstElement = gsl::narrow(offset / sizeof(uint32_t)); uavDesc.Buffer.NumElements = gsl::narrow(dstBuffer->GetDesc().Width / sizeof(uint32_t)); uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h index 7ad7032317d77..2bf23062a49f7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h @@ -5,6 +5,7 @@ #include "ICommandRecorder.h" #include "CommandAllocatorRing.h" +#include "core/framework/allocator.h" namespace Dml { @@ -16,7 +17,7 @@ namespace Dml public: DmlCommandRecorder( ID3D12Device* d3dDevice, - IDMLDevice* device, + IDMLDevice* device, std::shared_ptr commandQueue); void InitializeOperator( @@ -39,6 +40,7 @@ namespace Dml void FillBufferWithPattern( ID3D12Resource* dstBuffer, + uint64_t offset, gsl::span value /* Data type agnostic value, treated as raw bits */); void ExecuteCommandList( @@ -47,14 +49,15 @@ namespace Dml _Out_ uint64_t* completionValue); ComPtr GetCommandList(); - + void ResourceBarrier(gsl::span barriers); void AddUAVBarrier(); void Open() final; void CloseAndExecute() final; - - void SetAllocator(std::weak_ptr allocator); + + void SetAllocator(std::weak_ptr allocator); + void SetSubAllocator(std::weak_ptr allocator); bool HasUnsubmittedWork() override { @@ -81,7 +84,8 @@ namespace Dml ID3D12DescriptorHeap* m_currentDescriptorHeap = nullptr; // The weak pointer avoids a circular reference from context->recorder->allocator->context - std::weak_ptr m_bufferAllocator; + std::weak_ptr m_allocator; + std::weak_ptr m_subAllocator; 
CommandAllocatorRing<2> m_commandAllocatorRing; @@ -89,7 +93,7 @@ namespace Dml ComPtr m_currentCommandList; bool m_operationsRecordedInCurrentCommandList = false; - // Command lists which have been batched up for execution. The values in + // Command lists which have been batched up for execution. The values in // m_pendingCommandListsCacheable indicate whether they can be moved into this // class's cache after execution, versus if they belong to the caller and were // passed to ExecuteCommandList. diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.cpp new file mode 100644 index 0000000000000..a9ba854a45747 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.cpp @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "precomp.h" +#include "DmlCpuAllocator.h" + +namespace Dml +{ + +DmlCpuAllocator::DmlCpuAllocator(OrtMemType memType) + : onnxruntime::IAllocator( + OrtMemoryInfo( + "DML CPU", + OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0), + 0, + memType + ) + ) +{ +} + +void* DmlCpuAllocator::Alloc(size_t size) +{ + if (size <= 0) + { + return nullptr; + } + void* p = malloc(size); + return p; +} + +void DmlCpuAllocator::Free(void* p) +{ + free(p); +} + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.h new file mode 100644 index 0000000000000..2f81975d2c4cd --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.h @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/framework/allocator.h" + +namespace Dml +{ + +class DmlCpuAllocator : public onnxruntime::IAllocator +{ +public: + explicit DmlCpuAllocator(OrtMemType memType); + + void* Alloc(size_t size) override; + void Free(void* p) override; +}; + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h new file mode 100644 index 0000000000000..554a4dca8e550 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/framework/allocator.h" +#include "BucketizedBufferAllocator.h" + +namespace Dml +{ + class DmlGpuAllocator : public onnxruntime::IAllocator + { + public: + DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, BucketizedBufferAllocator* subAllocator) + : onnxruntime::IAllocator( + OrtMemoryInfo( + "DML", + OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0) + ) + ), + m_bfcAllocator(bfcAllocator), + m_subAllocator(subAllocator) {} + + void* Alloc(size_t size_in_bytes) { return m_bfcAllocator->Alloc(size_in_bytes); } + void Free(void* ptr) { m_bfcAllocator->Free(ptr); } + + BucketizedBufferAllocator* GetSubAllocator() const { return m_subAllocator; } + + private: + // This allocator is managed by ORT and should be used to allocate/free memory in order + // to utilize the BFC capabilities + onnxruntime::IAllocator* m_bfcAllocator; + + // This allocator is specific to DML and is used to decode the opaque data returned by the BFC + // allocator into objects that DML understands + BucketizedBufferAllocator* m_subAllocator; + }; +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index 9d0ba9dc7ea51..890c5aa1ae384 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -94,11 +94,11 @@ namespace DmlGraphFusionHelper ID3D12Resource** resource, uint64_t* allocId) { - IUnknown* allocationUnk = static_cast(const_cast(tensor->DataRaw())); + void* opaqueData = const_cast(tensor->DataRaw()); Microsoft::WRL::ComPtr resourceUnk; - winmlProvider->GetABIDataInterface(false, allocationUnk, &resourceUnk); + winmlProvider->GetABIDataInterface(opaqueData, &resourceUnk); - *allocId = winmlProvider->TryGetPooledAllocationId(allocationUnk, 0); + *allocId = winmlProvider->TryGetPooledAllocationId(opaqueData, 0); ORT_THROW_IF_FAILED(resourceUnk->QueryInterface(resource)); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h new file mode 100644 index 0000000000000..6de78a47b6d8b --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +namespace Dml +{ + struct DmlHeapAllocation + { + Microsoft::WRL::ComPtr heap; + + // Heaps backing the memory for the allocation. If tiling is supported + // an allocation may comprise multiple heaps. If tiling is not supported + // an allocation will only have a single heap. + std::vector> heaps; + + // Resources created over this allocation's heaps. All three resources + // are identical aside from being fixed in a single resource state: UAV, + // COPY_SRC, and COPY_DST respectively. The purpose of duplicate + // resources is to enable overlapping resources in different states for + // copying data. 
Most callers will not (and should not) interact + // directly with these resources; all three are wrapped by the buffer + // regions returned from this allocator, and the appropriate resource + // will be used automatically when performing buffer copies. + Microsoft::WRL::ComPtr resource_uav_state; + Microsoft::WRL::ComPtr resource_copy_src_state; + Microsoft::WRL::ComPtr resource_copy_dst_state; + }; +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp deleted file mode 100644 index bdda99ae6f91a..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "precomp.h" -#include "DmlHeapAllocator.h" -#include "DmlReservedResourceWrapper.h" - -namespace Dml -{ - -static bool GetTilingEnabled(ID3D12Device* device) -{ - D3D12_FEATURE_DATA_D3D12_OPTIONS options = {}; - if (SUCCEEDED(device->CheckFeatureSupport( - D3D12_FEATURE_D3D12_OPTIONS, - &options, - sizeof(options)))) - { - return options.TiledResourcesTier >= D3D12_TILED_RESOURCES_TIER_1; - } - - return false; -} - -static uint64_t GetMaxHeapSizeInTiles() -{ - return D3D12HeapAllocator::kDefaultMaxHeapSizeInTiles; -} - -D3D12HeapAllocator::D3D12HeapAllocator( - ID3D12Device* device, - ID3D12CommandQueue* queue, - const D3D12_HEAP_PROPERTIES& heap_props, - D3D12_HEAP_FLAGS heap_flags, - D3D12_RESOURCE_FLAGS resource_flags, - D3D12_RESOURCE_STATES initial_state) - : device_(device), - queue_(queue), - heap_properties_(heap_props), - heap_flags_(heap_flags), - resource_flags_(resource_flags), - initial_state_(initial_state), - tiling_enabled_(GetTilingEnabled(device)), - max_heap_size_in_tiles_(GetMaxHeapSizeInTiles()) -{ -} - -absl::optional D3D12HeapAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes) -{ - 
Allocation allocation = {}; - - // The allocation may be larger than the requested size to ensure a whole - // number of tiles. - const uint64_t resource_size_in_tiles = - 1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - const uint64_t resource_size_in_bytes = - resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - auto resource_desc = - CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_); - - ID3D12Resource** resources[] = { - &allocation.resource_uav_state, - &allocation.resource_copy_src_state, - &allocation.resource_copy_dst_state}; - - D3D12_RESOURCE_STATES states[] = { - initial_state_, - D3D12_RESOURCE_STATE_COPY_SOURCE, - D3D12_RESOURCE_STATE_COPY_DEST}; - - for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++) - { - HRESULT create_resource_hr = device_->CreateReservedResource( - &resource_desc, - states[i], - nullptr, - IID_PPV_ARGS(resources[i])); - - if (create_resource_hr == E_OUTOFMEMORY) - { - return absl::nullopt; - } - ORT_THROW_IF_FAILED(create_resource_hr); - } - - // Reserve enough heaps to store all tiles in the resource. - const uint64_t heap_count = - 1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_; - allocation.heaps.resize(heap_count); - - // Create heaps and map them to the primary reserved resource. - D3D12_TILED_RESOURCE_COORDINATE resource_region_start_coordinates = {}; - uint64_t unmapped_resource_tiles = resource_size_in_tiles; - for (uint64_t i = 0; i < heap_count; i++) - { - // Create heap. The last heap of the allocation may have fewer tiles to - // avoid wasting space. 
- uint64_t heap_size_in_tiles = std::min( - unmapped_resource_tiles, - max_heap_size_in_tiles_); - uint64_t heap_size_in_bytes = - heap_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - auto heap_desc = CD3DX12_HEAP_DESC( - heap_size_in_bytes, - heap_properties_, - 0, - heap_flags_); - - HRESULT create_heap_hr = - device_->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i])); - if (create_heap_hr == E_OUTOFMEMORY) - { - return absl::nullopt; - } - ORT_THROW_IF_FAILED(create_heap_hr); - - // Source region in the resource to map. - D3D12_TILE_REGION_SIZE resource_region_size = {}; - resource_region_size.NumTiles = static_cast(heap_size_in_tiles); - - // Target range in the current heap to map. - const D3D12_TILE_RANGE_FLAGS tile_range_flags = - D3D12_TILE_RANGE_FLAG_NONE; - const uint32_t heap_range_start_offset = 0; - const uint32_t heap_range_tile_count = static_cast(heap_size_in_tiles); - - constexpr uint32_t numResourceRegions = 1; - constexpr uint32_t numHeapRanges = 1; - - // This is a brand new allocation/resource, so the tile mappings are - // guaranteed to be set (on the GPU timeline) by the time any code can - // reference the returned resource. We only execute operations on a - // single hardware queue so there is no need to wait or signal. - // - // All resources have identical tile mappings. The repeated call to - // UpdateTileMappings on all resources instead of using CopyTileMappings - // is intentional: the latter API is not supported by all versions of - // PIX. 
- for (auto resource : - {allocation.resource_uav_state.Get(), - allocation.resource_copy_src_state.Get(), - allocation.resource_copy_dst_state.Get()}) - { - queue_->UpdateTileMappings( - resource, - numResourceRegions, - &resource_region_start_coordinates, - &resource_region_size, - allocation.heaps[i].Get(), - numHeapRanges, - &tile_range_flags, - &heap_range_start_offset, - &heap_range_tile_count, - D3D12_TILE_MAPPING_FLAG_NONE); - } - - resource_region_start_coordinates.X += static_cast(heap_size_in_tiles); - unmapped_resource_tiles -= heap_size_in_tiles; - } - - assert(unmapped_resource_tiles == 0); - - return allocation; -} - -absl::optional D3D12HeapAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes) -{ - Allocation allocation = {}; - - // Create the allocation's sole heap. The allocation may be larger than the - // requested size to ensure a whole number of tiles. - allocation.heaps.resize(1); - D3D12_HEAP_DESC heap_desc = - CD3DX12_HEAP_DESC(size_in_bytes, heap_properties_, 0, heap_flags_); - HRESULT create_heap_hr = device_->CreateHeap( - &heap_desc, - IID_PPV_ARGS(&allocation.heaps.front())); - if (create_heap_hr == E_OUTOFMEMORY) - { - return absl::nullopt; - } - - // Create large placed resource that spans the heap. 
- D3D12_RESOURCE_DESC resource_desc = - CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_); - - ID3D12Resource** resources[] = { - &allocation.resource_uav_state, - &allocation.resource_copy_src_state, - &allocation.resource_copy_dst_state}; - D3D12_RESOURCE_STATES states[] = { - initial_state_, - D3D12_RESOURCE_STATE_COPY_SOURCE, - D3D12_RESOURCE_STATE_COPY_DEST}; - - for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++) - { - HRESULT create_resource_hr = device_->CreatePlacedResource( - allocation.heaps.front().Get(), - 0, - &resource_desc, - states[i], - nullptr, - IID_PPV_ARGS(resources[i])); - if (create_resource_hr == E_OUTOFMEMORY) - { - return absl::nullopt; - } - ORT_THROW_IF_FAILED(create_resource_hr); - } - - return allocation; -} - -uint64_t D3D12HeapAllocator::ComputeRequiredSize(size_t size) -{ - const uint64_t resource_size_in_tiles = - 1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - const uint64_t resource_size_in_bytes = - resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - - return resource_size_in_bytes; -} - -Microsoft::WRL::ComPtr D3D12HeapAllocator::Alloc(size_t size_in_bytes) -{ - if (size_in_bytes == 0) - { - return nullptr; - } - - // The D3D12 device is thread-safe so we don't need to hold the lock while - // creating an allocation. - absl::optional allocation = - tiling_enabled_ ? 
TryCreateTiledAllocation(size_in_bytes) - : TryCreateUntiledAllocation(size_in_bytes); - - ORT_THROW_HR_IF(E_UNEXPECTED, !allocation); - - auto reservedResourceWrapper = wil::MakeOrThrow(std::move(*allocation)); - Microsoft::WRL::ComPtr resourceWrapper; - reservedResourceWrapper.As(&resourceWrapper); - return resourceWrapper; -} - -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h deleted file mode 100644 index 6e13ad71f5877..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "DmlResourceWrapper.h" - -namespace Dml -{ - -struct Allocation -{ - Microsoft::WRL::ComPtr heap; - - // Heaps backing the memory for the allocation. If tiling is supported - // an allocation may comprise multiple heaps. If tiling is not supported - // an allocation will only have a single heap. - std::vector> heaps; - - // Resources created over this allocation's heaps. All three resources - // are identical aside from being fixed in a single resource state: UAV, - // COPY_SRC, and COPY_DST respectively. The purpose of duplicate - // resources is to enable overlapping resources in different states for - // copying data. Most callers will not (and should not) interact - // directly with these resources; all three are wrapped by the buffer - // regions returned from this allocator, and the appropriate resource - // will be used automatically when performing buffer copies. - Microsoft::WRL::ComPtr resource_uav_state; - Microsoft::WRL::ComPtr resource_copy_src_state; - Microsoft::WRL::ComPtr resource_copy_dst_state; -}; - -// An allocator that makes logically contiguous allocations backed by D3D heaps. -// -// Heaps must fit entirely in either local or non-local memory. 
Larger heaps -// have a greater chance of getting demoted into non-local memory, which can be -// disastrous for performance. This problem is compounded by the fact that heaps -// may be demoted even if overall local memory usage is within the process' -// budget. Heaps are not necessarily mappable to discontiguous regions of -// physical memory, which means physical memory fragmentation *may* make it -// extremely difficult to accommodate larger heaps. -// -// On D3D hardware that supports tiled resource tier 1+ this class implements -// large allocations through tiling. Each allocation is backed by however many -// small heaps are necessary to cover the requested allocation size. Buffer -// regions retrieved through this allocator are reserved resources that span the -// full collection of heaps assigned to an individual allocation. Tile mappings -// are static. -// -// On hardware that doesn't support tiled resources each allocation is backed by -// a single heap. Buffer regions retrieved through this allocator are placed -// resources that span the full heap assigned to an individual allocation. In -// this case it is better make more but smaller allocations (resulting in -// smaller heaps); this fallback path is only retained as a last resort for -// older hardware. -class D3D12HeapAllocator -{ - public: - // Maximum size of a heap (in tiles) when allocations are tiled. Each tile - // is 64KB. A default size of 512 tiles (32MB) does a good job of handling - // local video memory fragmentation without requiring lots of heaps. 
- static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512; - - D3D12HeapAllocator( - ID3D12Device* device, - ID3D12CommandQueue* queue, - const D3D12_HEAP_PROPERTIES& heap_props, - D3D12_HEAP_FLAGS heap_flags, - D3D12_RESOURCE_FLAGS resource_flags, - D3D12_RESOURCE_STATES initial_state); - - Microsoft::WRL::ComPtr Alloc(size_t size_in_bytes); - uint64_t ComputeRequiredSize(size_t size); - bool TilingEnabled() const { return tiling_enabled_; }; - - private: - std::mutex mutex_; - - Microsoft::WRL::ComPtr device_; - Microsoft::WRL::ComPtr queue_; - const D3D12_HEAP_PROPERTIES heap_properties_; - const D3D12_HEAP_FLAGS heap_flags_; - const D3D12_RESOURCE_FLAGS resource_flags_; - const D3D12_RESOURCE_STATES initial_state_; - bool tiling_enabled_; - uint64_t max_heap_size_in_tiles_; - - private: - absl::optional TryCreateTiledAllocation(uint64_t size_in_bytes); - absl::optional TryCreateUntiledAllocation( - uint64_t size_in_bytes); - - friend class D3D12BufferRegion; -}; - -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h new file mode 100644 index 0000000000000..de39f0890f998 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "DmlBufferRegion.h" +#include "DmlAllocationInfo.h" + +namespace Dml +{ + class DmlManagedBufferRegion : public Microsoft::WRL::RuntimeClass, IUnknown> + { + public: + DmlManagedBufferRegion(Microsoft::WRL::ComPtr allocation, D3D12BufferRegion&& bufferRegion) + : m_allocation(std::move(allocation)), + m_bufferRegion(std::move(bufferRegion)) + { + } + + const D3D12BufferRegion& GetBufferRegion() const { return m_bufferRegion; } + + private: + Microsoft::WRL::ComPtr m_allocation; + D3D12BufferRegion m_bufferRegion; + }; +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h index 413ade92daf51..68feab568ca45 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h @@ -1,15 +1,22 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#pragma once + #include "DmlResourceWrapper.h" -#include "DmlHeapAllocator.h" +#include "DmlHeapAllocation.h" +#include "DmlTaggedPointer.h" namespace Dml { class DmlReservedResourceWrapper : public Microsoft::WRL::RuntimeClass, DmlResourceWrapper> { public: - DmlReservedResourceWrapper(Allocation&& allocation) : m_allocation(std::move(allocation)) {} + DmlReservedResourceWrapper(DmlHeapAllocation&& allocation) + : m_allocation(std::move(allocation)) + { + } + ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); } ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); } ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); } @@ -19,6 +26,6 @@ namespace Dml D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; } private: - Allocation m_allocation; + DmlHeapAllocation m_allocation; }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp new file mode 100644 index 0000000000000..da5ed6df2ff4c --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "DmlTaggedPointer.h" +#include + +namespace Dml +{ +/*static*/ TaggedPointer TaggedPointer::Unpack(const void* ptr) +{ + uint64_t ptr_val = reinterpret_cast(ptr); + + static constexpr uint64_t kAllocationIDMask = + (1ull << kAllocationIDBits) - 1; + static constexpr uint64_t kOffsetMask = (1ull << kOffsetBits) - 1; + + TaggedPointer tagged_ptr; + tagged_ptr.device_id = (ptr_val >> (kAllocationIDBits + kOffsetBits)); + tagged_ptr.allocation_id = (ptr_val >> kOffsetBits) & kAllocationIDMask; + tagged_ptr.offset = (ptr_val & kOffsetMask); + + return tagged_ptr; +} + +/*static*/ void* TaggedPointer::Pack( + uint32_t device_id, + uint32_t allocation_id, + uint64_t offset) +{ + assert(device_id < (1ull << kDeviceIDBits)); + assert(allocation_id < (1ull << kAllocationIDBits)); + assert(offset < (1ull << kOffsetBits)); + + // Store the device ID in the upper bits of the pointer, followed by the + // allocation id and the offset in the lower bits + uint64_t ptr = ((uint64_t)device_id << (kAllocationIDBits + kOffsetBits)) | + ((uint64_t)allocation_id << kOffsetBits) | offset; + + return reinterpret_cast(ptr); +} +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h new file mode 100644 index 0000000000000..96b0eb318ad48 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
Note that since the layout of bitfields is +// implementation-defined, you can't just cast a void* into a TaggedPointer: it +// must be done using masks and shifts. +struct TaggedPointer +{ + static constexpr uint64_t kDeviceIDBits = 4; + static constexpr uint64_t kAllocationIDBits = 20; + static constexpr uint64_t kOffsetBits = 40; + + uint64_t device_id : kDeviceIDBits; + uint64_t allocation_id : kAllocationIDBits; + uint64_t offset : kOffsetBits; + + static void* Pack( + uint32_t device_id, + uint32_t allocation_id, + uint64_t offset); + static TaggedPointer Unpack(const void* ptr); +}; + +static_assert( + sizeof(TaggedPointer) == sizeof(void*), + "DML requires a 64-bit architecture"); +static_assert( + TaggedPointer::kDeviceIDBits + TaggedPointer::kAllocationIDBits + + TaggedPointer::kOffsetBits == + sizeof(void*) * CHAR_BIT, + "DML requires a 64-bit architecture"); + +} // namespace tfdml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index 1d41d26cf0062..c3415c4b9ea49 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -18,7 +18,7 @@ namespace Dml ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf()))); } - void ExecutionContext::SetAllocator(std::weak_ptr allocator) + void ExecutionContext::SetAllocator(std::weak_ptr allocator) { m_dmlRecorder.SetAllocator(allocator); } @@ -68,10 +68,11 @@ namespace Dml void ExecutionContext::FillBufferWithPattern( ID3D12Resource* dstBuffer, + uint64_t offset, gsl::span value /* Data type agnostic value, treated as raw bits */) { SetCommandRecorder(&m_dmlRecorder); - m_dmlRecorder.FillBufferWithPattern(dstBuffer, value); + m_dmlRecorder.FillBufferWithPattern(dstBuffer, offset, value); } void ExecutionContext::ExecuteCommandList( diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h index b06f11a5efd0a..6625ae83ffd1e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h @@ -20,13 +20,13 @@ namespace Dml public: // Constructs an ExecutionContext that executes on the supplied queue. ExecutionContext( - ID3D12Device* d3d12Device, - IDMLDevice* dmlDevice, + ID3D12Device* d3d12Device, + IDMLDevice* dmlDevice, ID3D12CommandQueue* queue); - void SetAllocator(std::weak_ptr allocator); + void SetAllocator(std::weak_ptr allocator); - // Waits for flushed work, discards unflushed work, and discards associated references to + // Waits for flushed work, discards unflushed work, and discards associated references to // prevent circular references. Must be the last call on the object before destruction. void Close(); @@ -44,6 +44,7 @@ namespace Dml void FillBufferWithPattern( ID3D12Resource* dstBuffer, + uint64_t offset, gsl::span value /* Data type agnostic value, treated as raw bits */); void InitializeOperator( @@ -75,12 +76,12 @@ namespace Dml // Returns an event which will become signaled when everything submitted to the execution context thus far has // completed execution on the GPU, including work that has yet to be flushed to the queue. GpuEvent GetCurrentCompletionEvent(); - + // Adds a reference which will be released when queued GPU work is completed void QueueReference(IUnknown* object); // Release any accumulated references who corresponding GPU fence values have - // been reached. + // been reached. 
void ReleaseCompletedReferences(); D3D12_COMMAND_LIST_TYPE GetCommandListTypeForQueue() const; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index fddd3267d9770..ca9080e4fe665 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -9,6 +9,7 @@ #include "ReadbackHeap.h" #include "ExecutionContext.h" #include "BucketizedBufferAllocator.h" +#include "DmlCpuAllocator.h" #include "MLOperatorAuthorImpl.h" #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h" #include "core/providers/dml/OperatorAuthorHelper/OperatorHelper.h" @@ -18,7 +19,10 @@ #include "core/framework/compute_capability.h" #include "core/framework/fallback_cpu_capability.h" #include "DmlCommittedResourceWrapper.h" -#include "DmlHeapAllocator.h" +#include "DmlBufferRegion.h" +#include "DmlManagedBufferRegion.h" +#include "DmlBfcAllocator.h" +#include "DmlGpuAllocator.h" #ifdef ERROR #undef ERROR @@ -111,32 +115,32 @@ namespace Dml HRESULT __stdcall ExecutionProviderImpl::AllocatePooledResource( size_t size, - AllocatorRoundingMode roundingMode, - ID3D12Resource **d3dResource, - IUnknown** pooledResource + DmlManagedBufferRegion** managedBufferRegion ) const noexcept { ORT_TRY { - ComPtr allocation; - allocation.Attach(static_cast(m_allocator->Alloc(size, roundingMode))); - - const auto* allocInfo = m_allocator->DecodeDataHandle(allocation.Get()); - - ComPtr resource = allocInfo->GetUavResource(); - resource.CopyTo(d3dResource); - *pooledResource = allocation.Detach(); + void* opaqueData = m_bfcAllocator->Alloc(size); + auto bufferRegion = m_subAllocator->CreateManagedBufferRegion(opaqueData, size); + bufferRegion.CopyTo(managedBufferRegion); return S_OK; } ORT_CATCH_RETURN } - ID3D12Resource* __stdcall 
ExecutionProviderImpl::DecodeResource(void* allocation) const noexcept + D3D12BufferRegion ExecutionProviderImpl::GetBufferForTensor(IMLOperatorTensor* tensor) const + { + MLOperatorTensor mlOperatorTensor(tensor); + void* data = mlOperatorTensor.GetByteData(); + auto sizeInBytes = mlOperatorTensor.GetUnalignedTensorByteSize(); + return m_subAllocator->CreateBufferRegion(data, sizeInBytes); + } + + ID3D12Resource* __stdcall ExecutionProviderImpl::DecodeResource(IMLOperatorTensor* tensor) const noexcept { ORT_TRY { - const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(allocation); - return allocInfo->GetUavResource(); + return GetBufferForTensor(tensor).ResourceInUavState(); } ORT_CATCH_GENERIC { @@ -178,7 +182,7 @@ namespace Dml m_context = std::make_shared(m_d3d12Device.Get(), m_dmlDevice.Get(), queue); - auto heapAllocator = std::make_unique( + m_subAllocator = std::make_shared( m_d3d12Device.Get(), queue, CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), @@ -186,21 +190,25 @@ namespace Dml D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - // Create an allocator for D3D12 buffers used to hold tensor data. The returned buffers from the allocator - // should be DEFAULT heap buffers which can be used as UAVs, and which start in UAV state. 
- m_allocator = std::make_shared( - m_d3d12Device.Get(), - m_context, - std::move(heapAllocator)); + // Create a BFC allocator that encapsulates our allocator + onnxruntime::AllocatorCreationInfo memoryInfo( + [this](OrtDevice::DeviceId id) { + return std::make_unique(m_subAllocator.get()); + }); + + m_bfcAllocator = onnxruntime::CreateAllocator(memoryInfo); + + // Wrap the BFC allocator into our own allocator + m_gpuAllocator = std::make_shared(m_bfcAllocator.get(), m_subAllocator.get()); - m_context->SetAllocator(m_allocator); + m_context->SetAllocator(m_bfcAllocator); m_uploadHeap = std::make_unique(m_d3d12Device.Get(), m_context); m_readbackHeap = std::make_unique(m_d3d12Device.Get(), m_context); // CPU Allocator used to create buffers for the MemcpyFromHost, Shape and Size operators. - m_cpuInputAllocator = std::make_shared(OrtMemType::OrtMemTypeCPUInput); - m_cpuOutputAllocator = std::make_shared(OrtMemType::OrtMemTypeCPUOutput); + m_cpuInputAllocator = std::make_shared(OrtMemType::OrtMemTypeCPUInput); + m_cpuOutputAllocator = std::make_shared(OrtMemType::OrtMemTypeCPUOutput); CreateDmlKernelRegistry(&m_kernelRegistry, &m_internalRegInfoMap); } @@ -341,10 +349,8 @@ namespace Dml if (tensor) { assert(tensor->IsDataInterface()); - const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(tensor).GetDataInterface().Get()); - ID3D12Resource* resource = allocInfo->GetUavResource(); - D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc(); - bufferBindings.push_back({ resource, 0, resourceDesc.Width }); + auto bufferRegion = GetBufferForTensor(tensor); + bufferBindings.push_back(bufferRegion.GetBufferBinding()); bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() }); } else @@ -431,46 +437,61 @@ namespace Dml // // CPU -> GPU copy (upload) // - const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get()); + auto dstBufferRegion = GetBufferForTensor(dst); - 
ID3D12Resource* dstData = dstAllocInfo->GetCopyDstResource(); - const auto dstState = dstAllocInfo->GetDefaultCopyDstState(); + ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr + ? dstBufferRegion.ResourceInUavState() + : dstBufferRegion.ResourceInCopyDstState(); - const void* srcData = src->GetData(); + const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr + ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_DEST; - const uint64_t dstOffset = 0; - m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(srcData, dataSizeInBytes)); + const uint64_t dstOffset = dstBufferRegion.Offset(); + m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes)); } else if (!src->IsCpuData() && dst->IsCpuData()) { // // GPU -> CPU copy (readback) // + auto srcBufferRegion = GetBufferForTensor(src); - void* dstData = dst->GetData(); - const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get()); + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); - ID3D12Resource* srcData = srcAllocInfo->GetCopySrcResource(); - const auto srcState = srcAllocInfo->GetDefaultCopySrcState(); + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; - const uint64_t srcOffset = 0; - - // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - m_readbackHeap->ReadbackFromGpu(AsByteSpan(dstData, dataSizeInBytes), srcData, srcOffset, srcState); + const uint64_t srcOffset = srcBufferRegion.Offset(); + m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState); } else if (!src->IsCpuData() && !dst->IsCpuData()) { // // GPU -> GPU copy // - const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get()); - const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get()); + auto srcBufferRegion = GetBufferForTensor(src); + + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); + + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; + + auto dstBufferRegion = GetBufferForTensor(dst); - ID3D12Resource* srcData = srcAllocInfo->GetCopySrcResource(); - const auto srcState = srcAllocInfo->GetDefaultCopySrcState(); + ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr + ? dstBufferRegion.ResourceInUavState() + : dstBufferRegion.ResourceInCopyDstState(); - ID3D12Resource* dstData = dstAllocInfo->GetCopyDstResource(); - const auto dstState = dstAllocInfo->GetDefaultCopyDstState(); + const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_DEST; m_context->CopyBufferRegion(dstData, 0, dstState, srcData, 0, srcState, dataSizeInBytes); } @@ -495,9 +516,8 @@ namespace Dml auto mlTensor = MLOperatorTensor(dst).GetDataInterface(); if (mlTensor != nullptr) { - const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(mlTensor.Get()); - ID3D12Resource* dstData = dstAllocInfo->GetUavResource(); - m_context->FillBufferWithPattern(dstData, rawValue); + auto dstBufferRegion = GetBufferForTensor(dst); + m_context->FillBufferWithPattern(dstBufferRegion.ResourceInUavState(), dstBufferRegion.Offset(), rawValue); } return S_OK; @@ -747,6 +767,9 @@ namespace Dml std::vector srcStates; srcStates.reserve(src_dst_pairs.size()); + std::vector srcOffsets; + srcOffsets.reserve(src_dst_pairs.size()); + std::vector dstDatas; dstDatas.reserve(src_dst_pairs.size()); @@ -790,19 +813,24 @@ namespace Dml ORT_THROW_HR_IF(E_INVALIDARG, dataSizesInBytes[i] != ComputeByteSizeFromTensor(srcWrapper)); // Tensors must be the same size dstDatas.push_back(dstWrapper.GetData()); - const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(&srcWrapper).GetDataInterface().Get()); - auto srcData = srcAllocInfo->GetCopySrcResource(); - auto srcState = srcAllocInfo->GetDefaultCopySrcState(); + auto srcBufferRegion = GetBufferForTensor(&srcWrapper); + + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); + + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; srcDatas.push_back(srcData); srcStates.push_back(srcState); + srcOffsets.push_back(srcBufferRegion.Offset()); } - const uint64_t srcOffset = 0; - // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcStates); + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); return onnxruntime::common::Status::OK(); } @@ -815,7 +843,7 @@ namespace Dml void ExecutionProviderImpl::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { - m_allocator->SetDefaultRoundingMode(roundingMode); + m_subAllocator->SetDefaultRoundingMode(roundingMode); } void ExecutionProviderImpl::ReleaseCompletedReferences() @@ -840,36 +868,22 @@ namespace Dml data->AddRef(); } - void ExecutionProviderImpl::GetABIDataInterface( - bool isInternalOperator, - IUnknown* data, - IUnknown** abiData) const + void ExecutionProviderImpl::GetABIDataInterface(void* data, IUnknown** abiData) const { assert(!m_closed); + *abiData = m_subAllocator->GetAllocationInfo(data)->GetUavResource(); + } - if (isInternalOperator) - { - *abiData = data; - data->AddRef(); - } - else - { -#ifdef _GAMING_XBOX - ComPtr wrappedResource = Microsoft::WRL::Make(m_allocator->DecodeDataHandle(data)->GetUavResource()); - *abiData = wrappedResource.Detach(); -#else - ComPtr resource = m_allocator->DecodeDataHandle(data)->GetUavResource(); - *abiData = resource.Detach(); -#endif - } + void ExecutionProviderImpl::GetManagedBufferRegion(void* data, uint64_t size, DmlManagedBufferRegion** abiData) const + { + auto managedBufferRegion = m_subAllocator->CreateManagedBufferRegion(data, size); + ORT_THROW_IF_FAILED(managedBufferRegion.CopyTo(abiData)); } - uint64_t ExecutionProviderImpl::TryGetPooledAllocationId( - IUnknown* data, - bool isInternalOperator) + uint64_t 
ExecutionProviderImpl::TryGetPooledAllocationId(void* data, bool isInternalOperator) { assert(!isInternalOperator); - return m_allocator->DecodeDataHandle(data)->GetPooledResourceId(); + return m_subAllocator->GetAllocationInfo(data)->GetPooledResourceId(); } void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState( @@ -957,7 +971,7 @@ namespace Dml std::shared_ptr ExecutionProviderImpl::GetGpuAllocator() { - return m_allocator; + return m_bfcAllocator; } std::shared_ptr ExecutionProviderImpl::GetCpuInputAllocator() @@ -994,8 +1008,8 @@ namespace Dml ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr) { - Dml::BucketizedBufferAllocator* pAllocationInfo = static_cast(allocator); - return pAllocationInfo->DecodeDataHandle(ptr)->GetUavResource(); + Dml::DmlGpuAllocator* pAllocationInfo = static_cast(allocator); + return pAllocationInfo->GetSubAllocator()->GetAllocationInfo(ptr)->GetUavResource(); } void FlushContext(onnxruntime::IExecutionProvider* provider) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 048230f12723a..22a9aed5dfd48 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -5,6 +5,7 @@ #include "GraphTransformer.h" #include "core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h" +#include "DmlBufferRegion.h" #include #include @@ -23,8 +24,10 @@ namespace Dml class ReadbackHeap; class ExecutionContext; class BucketizedBufferAllocator; - class CPUAllocator; + class DmlCpuAllocator; class ExecutionProvider; + class DmlManagedBufferRegion; + class DmlGpuAllocator; class ExecutionProviderImpl : public WRL::Base @@ -100,13 +103,15 @@ namespace Dml IUnknown** dataCopy) const override; void GetABIDataInterface( - bool isInternalOperator, - IUnknown* 
data, + void* data, IUnknown** abiData) const override; - uint64_t TryGetPooledAllocationId( - IUnknown* data, - bool isInternalOperator) override; + void GetManagedBufferRegion( + void* data, + uint64_t size, + DmlManagedBufferRegion** abiData) const; + + uint64_t TryGetPooledAllocationId(void* data, bool isInternalOperator) override; void GetABIExecutionInterfaceAndInvalidateState( bool isInternalOperator, @@ -136,12 +141,10 @@ namespace Dml // Allocate a resource from pools. Releasing pooledResource returns it to the pool. STDMETHOD(AllocatePooledResource)( size_t size, - AllocatorRoundingMode roundingMode, - ID3D12Resource **d3dResource, - IUnknown* *pooledResource + DmlManagedBufferRegion** managedBufferRegion ) const noexcept final; - STDMETHOD_(ID3D12Resource*, DecodeResource)(void* allocation) const noexcept final; + STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept final; std::shared_ptr GetKernelRegistry() const { @@ -179,6 +182,8 @@ namespace Dml uint32_t supportedDeviceDataTypeMask // Each bit corresponds to each DML_TENSOR_DATA_TYPE. 
) const; + D3D12BufferRegion GetBufferForTensor(IMLOperatorTensor* tensor) const; + ComPtr m_d3d12Device; ComPtr m_dmlDevice; bool m_isMcdmDevice = false; @@ -186,9 +191,11 @@ namespace Dml std::shared_ptr m_context; std::unique_ptr m_uploadHeap; std::unique_ptr m_readbackHeap; - std::shared_ptr m_allocator; - std::shared_ptr m_cpuInputAllocator; - std::shared_ptr m_cpuOutputAllocator; + std::shared_ptr m_bfcAllocator; + std::shared_ptr m_subAllocator; + std::shared_ptr m_gpuAllocator; + std::shared_ptr m_cpuInputAllocator; + std::shared_ptr m_cpuOutputAllocator; std::shared_ptr m_kernelRegistry; std::shared_ptr m_internalRegInfoMap; mutable uint64_t m_partitionKernelPrefixVal = 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index e809a20cc0f4b..8ff33debe2474 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -6,6 +6,7 @@ #include "MLOperatorAuthorImpl.h" #include "FusedGraphKernel.h" #include "DmlGraphFusionHelper.h" +#include "DmlManagedBufferRegion.h" using namespace Windows::AI::MachineLearning::Adapter; @@ -63,13 +64,14 @@ namespace Dml UINT64 persistentResourceSize = m_compiledExecutionPlanOperator->GetBindingProperties().PersistentResourceSize; if (persistentResourceSize > 0) { + ComPtr managedBufferRegion; ORT_THROW_IF_FAILED(m_provider->AllocatePooledResource( static_cast(persistentResourceSize), - AllocatorRoundingMode::Disabled, - m_persistentResource.GetAddressOf(), - m_persistentResourceAllocatorUnk.GetAddressOf())); + managedBufferRegion.GetAddressOf())); - m_persistentResourceBinding = DML_BUFFER_BINDING { m_persistentResource.Get(), 0, persistentResourceSize }; + managedBufferRegion.As(&m_persistentResourceAllocatorUnk); + m_persistentResource = 
managedBufferRegion->GetBufferRegion().ResourceInUavState(); + m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding(); } ORT_THROW_IF_FAILED(m_provider->InitializeOperator( @@ -128,7 +130,7 @@ namespace Dml else if (!m_isInputsUploadedByDmlEP[i]) { ORT_THROW_IF_FAILED(contextWrapper.GetInputTensor(i, inputTensors[i].GetAddressOf())); - inputPtrs[i] = m_provider->DecodeResource(MLOperatorTensor(inputTensors[i].Get()).GetDataInterface().Get()); + inputPtrs[i] = m_provider->DecodeResource(inputTensors[i].Get()); } } @@ -166,7 +168,7 @@ namespace Dml if (tensor) { assert(tensor->IsDataInterface()); - ID3D12Resource* resource = m_provider->DecodeResource(MLOperatorTensor(tensor).GetDataInterface().Get()); + ID3D12Resource* resource = m_provider->DecodeResource(tensor); D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc(); bufferBindings.push_back({ resource, 0, resourceDesc.Width }); bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() }); @@ -363,13 +365,11 @@ namespace Dml uint64_t tempAllocId = 0; ORT_THROW_IF_FAILED(contextWrapper.AllocateTemporaryData(static_cast(execBindingProps.TemporaryResourceSize), tempAlloc.GetAddressOf(), &tempAllocId)); - ComPtr tempResourceUnk; - m_winmlProvider->GetABIDataInterface(false, tempAlloc.Get(), &tempResourceUnk); + ComPtr managedBufferRegion; + m_winmlProvider->GetManagedBufferRegion(tempAlloc.Get(), execBindingProps.TemporaryResourceSize, &managedBufferRegion); // Bind the temporary resource. 
- ComPtr tempResource; - ORT_THROW_IF_FAILED(tempResourceUnk->QueryInterface(tempResource.GetAddressOf())); - DML_BUFFER_BINDING tempBufferBinding = {tempResource.Get(), 0, execBindingProps.TemporaryResourceSize}; + DML_BUFFER_BINDING tempBufferBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding(); DML_BINDING_DESC tempBindingDesc = { DML_BINDING_TYPE_BUFFER, &tempBufferBinding }; if (!tempAllocId || m_tempBindingAllocId != tempAllocId) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h index b4baf62ab73f5..4bef0652763a9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h @@ -6,6 +6,8 @@ namespace Dml { + class DmlManagedBufferRegion; + struct Binding { // Non-null if required at the stage where it is used, i.e. Initialization @@ -37,7 +39,7 @@ namespace Dml _In_opt_ const DML_BUFFER_BINDING* persistentResourceBinding, gsl::span inputTensors ) const noexcept = 0; - + STDMETHOD(ExecuteOperator)( IDMLCompiledOperator* op, _In_opt_ const DML_BUFFER_BINDING* persistentResourceBinding, @@ -64,8 +66,8 @@ namespace Dml STDMETHOD_(D3D12_COMMAND_LIST_TYPE, GetCommandListTypeForQueue)() const noexcept = 0; STDMETHOD_(void, Flush)() const noexcept = 0; - STDMETHOD_(ID3D12Resource*, DecodeResource)(void* allocation) const noexcept = 0; - STDMETHOD(AllocatePooledResource(size_t size, AllocatorRoundingMode roundingMode, ID3D12Resource **d3dResource, IUnknown* *pooledResource)) const noexcept = 0; + STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept = 0; + STDMETHOD(AllocatePooledResource(size_t size, DmlManagedBufferRegion** pooledResource)) const noexcept = 0; STDMETHOD_(bool, IsMcdmDevice)() const noexcept = 0; STDMETHOD_(bool, MetacommandsEnabled)() const noexcept = 0; diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 197c62283fba9..0e63f2c5be0f9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -104,20 +104,11 @@ namespace Windows::AI::MachineLearning::Adapter // kernels are registered. void TranslateAllocationDataToAbi( IWinmlExecutionProvider* winmlProvider, - bool isInternalOperator, const ::OrtMemoryInfo& allocInfo, - IUnknown* allocation, + void* opaqueData, IUnknown** abiAllocation) { - if (winmlProvider) - { - winmlProvider->GetABIDataInterface(isInternalOperator, allocation, abiAllocation); - } - else - { - ComPtr tmp = allocation; - *abiAllocation = tmp.Detach(); - } + winmlProvider->GetABIDataInterface(opaqueData, abiAllocation); } // @@ -1143,7 +1134,7 @@ namespace Windows::AI::MachineLearning::Adapter if (operatorGraphDesc->nodesAsOpDesc) { m_graphNodeCreateInfo->nodesAsOperatorDesc = std::vector>(); - for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) + for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) { auto* node = operatorGraphDesc->nodesAsOpDesc[nodeIndex]; assert(node != nullptr); @@ -1154,7 +1145,7 @@ namespace Windows::AI::MachineLearning::Adapter else { m_graphNodeCreateInfo->nodesAsIDMLOperator = std::vector>(); - for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) + for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) { auto* node = operatorGraphDesc->nodesAsIDMLOperator[nodeIndex]; assert(node != nullptr); @@ -1301,7 +1292,6 @@ namespace Windows::AI::MachineLearning::Adapter // kernels (i.e. ID3D12Resource, versus something that tracks the layout). 
TranslateAllocationDataToAbi( m_winmlExecutionProvider.Get(), - m_internalOperator, m_impl->Location(), m_dataInterfaceOrShadowCopy ? m_dataInterfaceOrShadowCopy.Get() : m_dataInterface.Get(), m_abiDataInterface.GetAddressOf()); @@ -1667,7 +1657,7 @@ namespace Windows::AI::MachineLearning::Adapter *allocId = m_winmlProvider->TryGetPooledAllocationId(allocation.Get(), 0); - TranslateAllocationDataToAbi(m_winmlProvider.Get(), m_internalOperator, alloc->Info(), allocation.Get(), abiAllocation); + TranslateAllocationDataToAbi(m_winmlProvider.Get(), alloc->Info(), allocation.Get(), abiAllocation); if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) { @@ -2307,7 +2297,7 @@ namespace Windows::AI::MachineLearning::Adapter } std::tuple, size_t> UnpackTensor( - const onnx::TensorProto& initializer, + const onnx::TensorProto& initializer, const onnxruntime::Path& modelPath) { std::unique_ptr unpackedTensor; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index 3ae29629efbcd..2d99c8a6dd6df 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -3,6 +3,7 @@ #include "precomp.h" #include "DmlOperator.h" +#include "../DmlManagedBufferRegion.h" namespace Dml { @@ -93,13 +94,14 @@ namespace Dml UINT64 persistentResourceSize = m_compiledOperator->GetBindingProperties().PersistentResourceSize; if (persistentResourceSize > 0) { + ComPtr managedBufferRegion; ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource( static_cast(persistentResourceSize), - AllocatorRoundingMode::Enabled, - m_persistentResource.GetAddressOf(), - m_persistentResourcePoolingUnk.GetAddressOf())); + managedBufferRegion.GetAddressOf())); - m_persistentResourceBinding = DML_BUFFER_BINDING{ m_persistentResource.Get(), 0, 
persistentResourceSize }; + managedBufferRegion.As(&m_persistentResourcePoolingUnk); + m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState(); + m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding(); } std::vector initializationInputBindings(m_kernelInputIndices.size()); @@ -192,13 +194,14 @@ namespace Dml UINT64 persistentResourceSize = m_compiledOperator->GetBindingProperties().PersistentResourceSize; if (persistentResourceSize > 0) { + ComPtr managedBufferRegion; ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource( static_cast(persistentResourceSize), - AllocatorRoundingMode::Enabled, - m_persistentResource.GetAddressOf(), - m_persistentResourcePoolingUnk.GetAddressOf())); + managedBufferRegion.GetAddressOf())); - m_persistentResourceBinding = DML_BUFFER_BINDING{ m_persistentResource.Get(), 0, persistentResourceSize }; + managedBufferRegion.As(&m_persistentResourcePoolingUnk); + m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState(); + m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding(); } std::vector initializationInputBindings(m_kernelInputIndices.size()); @@ -229,14 +232,16 @@ namespace Dml if (!m_persistentResource || m_persistentResource->GetDesc().Width < persistentResourceSize) { m_persistentResource = nullptr; + + ComPtr managedBufferRegion; ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource( static_cast(persistentResourceSize), - AllocatorRoundingMode::Enabled, - m_persistentResource.GetAddressOf(), - m_persistentResourcePoolingUnk.GetAddressOf())); - } + managedBufferRegion.GetAddressOf())); - m_persistentResourceBinding = DML_BUFFER_BINDING{ m_persistentResource.Get(), 0, persistentResourceSize }; + managedBufferRegion.As(&m_persistentResourcePoolingUnk); + m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState(); + m_persistentResourceBinding = 
managedBufferRegion->GetBufferRegion().GetBufferBinding(); + } } ORT_THROW_IF_FAILED(m_executionProvider->InitializeOperator( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp index 590dffef488e4..a91886c3b5863 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp @@ -105,6 +105,7 @@ namespace Dml gsl::span dst, gsl::span dstSizes, gsl::span src, + gsl::span srcOffsets, gsl::span srcStates) { assert(dst.size() == src.size()); @@ -132,7 +133,7 @@ namespace Dml offset, D3D12_RESOURCE_STATE_COPY_DEST, src[i], - 0, + srcOffsets[i], srcStates[i], dstSizes[i]); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h index 9727dc6ac8752..f888f0a55ac48 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h @@ -27,6 +27,7 @@ namespace Dml gsl::span dst, gsl::span dstSizes, gsl::span src, + gsl::span srcOffsets, gsl::span srcStates); private: diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h index d79b2fb4e7c2a..04b30f75b340e 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h @@ -455,15 +455,11 @@ class MLOperatorTensor // needing to agnostically copy memory. 
const void* GetByteData() const { - ML_CHECK_BOOL(!IsDataInterface()); - return m_impl->GetData(); } void* GetByteData() { - ML_CHECK_BOOL(!IsDataInterface()); - return m_impl->GetData(); } From 76328becb9e23d528c813553f2b9baa02c523995 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 24 Jan 2023 18:20:02 -0800 Subject: [PATCH 09/76] WIP --- .../inc/IWinmlExecutionProvider.h | 5 --- .../src/BucketizedBufferAllocator.cpp | 12 +++--- .../src/BucketizedBufferAllocator.h | 7 ++-- .../src/ExecutionProvider.cpp | 16 ++------ .../src/ExecutionProvider.h | 7 +--- .../src/MLOperatorAuthorImpl.cpp | 40 +++++-------------- .../src/MLOperatorAuthorImpl.h | 2 - 7 files changed, 26 insertions(+), 63 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index 52f5a104b0379..a56f03e50a9e1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -39,11 +39,6 @@ namespace Windows::AI::MachineLearning::Adapter // the provider's underlying queues. 
virtual void QueueReference(IUnknown *object) = 0; - virtual void GetShadowCopyIfRequired( - bool isInternalOperator, - IUnknown* data, - IUnknown** dataCopy) const = 0; - virtual void GetABIDataInterface( void* data, IUnknown** abiData) const = 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index df12c1567d5be..66405dd5d2989 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -72,12 +72,14 @@ namespace Dml BucketizedBufferAllocator::BucketizedBufferAllocator( ID3D12Device* device, + std::shared_ptr context, ID3D12CommandQueue* queue, const D3D12_HEAP_PROPERTIES& heap_props, D3D12_HEAP_FLAGS heap_flags, D3D12_RESOURCE_FLAGS resource_flags, D3D12_RESOURCE_STATES initial_state) - : device_(device), + : m_device(device), + m_context(context), queue_(queue), heap_properties_(heap_props), heap_flags_(heap_flags), @@ -113,7 +115,7 @@ namespace Dml for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++) { - HRESULT create_resource_hr = device_->CreateReservedResource( + HRESULT create_resource_hr = m_device->CreateReservedResource( &resource_desc, states[i], nullptr, @@ -150,7 +152,7 @@ namespace Dml heap_flags_); HRESULT create_heap_hr = - device_->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i])); + m_device->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i])); if (create_heap_hr == E_OUTOFMEMORY) { return absl::nullopt; @@ -215,7 +217,7 @@ namespace Dml allocation.heaps.resize(1); D3D12_HEAP_DESC heap_desc = CD3DX12_HEAP_DESC(size_in_bytes, heap_properties_, 0, heap_flags_); - HRESULT create_heap_hr = device_->CreateHeap( + HRESULT create_heap_hr = m_device->CreateHeap( &heap_desc, IID_PPV_ARGS(&allocation.heaps.front())); if (create_heap_hr == E_OUTOFMEMORY) @@ -238,7 +240,7 
@@ namespace Dml for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++) { - HRESULT create_resource_hr = device_->CreatePlacedResource( + HRESULT create_resource_hr = m_device->CreatePlacedResource( allocation.heaps.front().Get(), 0, &resource_desc, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index f21d174500fcb..b28bdba544766 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -45,6 +45,7 @@ namespace Dml BucketizedBufferAllocator( ID3D12Device* device, + std::shared_ptr context, ID3D12CommandQueue* queue, const D3D12_HEAP_PROPERTIES& heap_props, D3D12_HEAP_FLAGS heap_flags, @@ -107,13 +108,10 @@ namespace Dml friend class AllocationInfo; void FreeResource(void* p, uint64_t resourceId); - ComPtr m_device; - std::vector m_pool; size_t m_currentAllocationId = 0; uint64_t m_currentResourceId = 0; AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled; - std::shared_ptr m_context; std::unique_ptr m_subAllocator; #if _DEBUG @@ -123,7 +121,8 @@ namespace Dml std::mutex mutex_; - Microsoft::WRL::ComPtr device_; + Microsoft::WRL::ComPtr m_device; + std::shared_ptr m_context; Microsoft::WRL::ComPtr queue_; const D3D12_HEAP_PROPERTIES heap_properties_; const D3D12_HEAP_FLAGS heap_flags_; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index ca9080e4fe665..67027e64c5a7b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -184,6 +184,7 @@ namespace Dml m_subAllocator = std::make_shared( m_d3d12Device.Get(), + m_context, queue, 
CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS, @@ -857,21 +858,12 @@ namespace Dml m_context->QueueReference(object); } - void ExecutionProviderImpl::GetShadowCopyIfRequired( - bool isInternalOperator, - IUnknown* data, - IUnknown** dataCopy) const - { - assert(!m_closed); - - *dataCopy = data; - data->AddRef(); - } - void ExecutionProviderImpl::GetABIDataInterface(void* data, IUnknown** abiData) const { assert(!m_closed); - *abiData = m_subAllocator->GetAllocationInfo(data)->GetUavResource(); + auto uavResource = m_subAllocator->GetAllocationInfo(data)->GetUavResource(); + uavResource->AddRef(); + *abiData = uavResource; } void ExecutionProviderImpl::GetManagedBufferRegion(void* data, uint64_t size, DmlManagedBufferRegion** abiData) const diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 22a9aed5dfd48..eec8f08848833 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -97,11 +97,6 @@ namespace Dml // IWinmlExecutionProvider methods void QueueReference(IUnknown* object) override; - void GetShadowCopyIfRequired( - bool isInternalOperator, - IUnknown* data, - IUnknown** dataCopy) const override; - void GetABIDataInterface( void* data, IUnknown** abiData) const override; @@ -191,8 +186,8 @@ namespace Dml std::shared_ptr m_context; std::unique_ptr m_uploadHeap; std::unique_ptr m_readbackHeap; - std::shared_ptr m_bfcAllocator; std::shared_ptr m_subAllocator; + std::shared_ptr m_bfcAllocator; std::shared_ptr m_gpuAllocator; std::shared_ptr m_cpuInputAllocator; std::shared_ptr m_cpuOutputAllocator; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 
0e63f2c5be0f9..d601a2b3b4025 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1273,39 +1273,21 @@ namespace Windows::AI::MachineLearning::Adapter { if (impl) { + m_tensorData = m_impl->MutableDataRaw(); + if (isDataInterface) { - // We assume that all data handles derive from IUnknown as their first base. - m_dataInterface = static_cast(m_impl->MutableDataRaw()); - - if (m_dataInterface) + if (m_tensorData) { - if (m_winmlExecutionProvider) - { - // The resource may require conversion to the layout expected according to the kernel options. - // This will return either the original object or a shadow copy which uses a different layout. - // This pattern assumes that Lotus is not re-using tensor allocations, so each output is - // a fresh allocation which will not trigger a conversion in the provider. - m_winmlExecutionProvider->GetShadowCopyIfRequired(m_internalOperator, m_dataInterface.Get(), m_dataInterfaceOrShadowCopy.GetAddressOf()); - - // Get the actual object to be returned from the ABI, which varies for internal and external - // kernels (i.e. ID3D12Resource, versus something that tracks the layout). - TranslateAllocationDataToAbi( - m_winmlExecutionProvider.Get(), - m_impl->Location(), - m_dataInterfaceOrShadowCopy ? m_dataInterfaceOrShadowCopy.Get() : m_dataInterface.Get(), - m_abiDataInterface.GetAddressOf()); - } - else - { - m_abiDataInterface = m_dataInterface; - } + // Get the actual object to be returned from the ABI, which varies for internal and external + // kernels (i.e. ID3D12Resource, versus something that tracks the layout). 
+ TranslateAllocationDataToAbi( + m_winmlExecutionProvider.Get(), + m_impl->Location(), + m_tensorData, + m_abiDataInterface.GetAddressOf()); } } - else - { - m_tensorData = m_impl->MutableDataRaw(); - } } } @@ -1383,7 +1365,7 @@ namespace Windows::AI::MachineLearning::Adapter return nullptr; } - return m_isDataInterface ? nullptr : m_tensorData; + return m_tensorData; } void STDMETHODCALLTYPE TensorWrapper::GetDataInterface(IUnknown** dataInterface) noexcept diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index dd1b743587ab5..7e308989791f8 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -283,12 +283,10 @@ class TensorWrapper : public WRL::Base, public Closable bool m_internalOperator = false; void* m_tensorData = nullptr; - ComPtr m_dataInterface; bool m_isDataInterface = false; // The returned data may be a converted shadow copy, and the piece of it which // is returned may vary according to kernel registration options. 
- ComPtr m_dataInterfaceOrShadowCopy; ComPtr m_abiDataInterface; }; From 7bd0983b7b9e8ae174376638cb2e233d4a372581 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 24 Jan 2023 18:26:54 -0800 Subject: [PATCH 10/76] WIP --- .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 67027e64c5a7b..3a28ea6f7b47b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -202,7 +202,7 @@ namespace Dml // Wrap the BFC allocator into our own allocator m_gpuAllocator = std::make_shared(m_bfcAllocator.get(), m_subAllocator.get()); - m_context->SetAllocator(m_bfcAllocator); + m_context->SetAllocator(m_gpuAllocator); m_uploadHeap = std::make_unique(m_d3d12Device.Get(), m_context); m_readbackHeap = std::make_unique(m_d3d12Device.Get(), m_context); From 0c35fc2f5df93b9ff196d5b6b82ca35a99d5b9c9 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 24 Jan 2023 18:51:19 -0800 Subject: [PATCH 11/76] WIP --- .../DmlExecutionProvider/src/DmlGpuAllocator.h | 17 ++++++++++------- .../src/ExecutionProvider.cpp | 14 +++++++------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h index 554a4dca8e550..1d4b35506afcb 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h @@ -11,7 +11,7 @@ namespace Dml class DmlGpuAllocator : public onnxruntime::IAllocator { public: - DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, BucketizedBufferAllocator* subAllocator) + 
DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator) : onnxruntime::IAllocator( OrtMemoryInfo( "DML", @@ -20,12 +20,15 @@ namespace Dml ) ), m_bfcAllocator(bfcAllocator), - m_subAllocator(subAllocator) {} + m_subAllocator(std::move(subAllocator)) {} - void* Alloc(size_t size_in_bytes) { return m_bfcAllocator->Alloc(size_in_bytes); } - void Free(void* ptr) { m_bfcAllocator->Free(ptr); } - - BucketizedBufferAllocator* GetSubAllocator() const { return m_subAllocator; } + void* Alloc(size_t size_in_bytes) final { return m_bfcAllocator->Alloc(size_in_bytes); } + void Free(void* ptr) final { m_bfcAllocator->Free(ptr); } + D3D12BufferRegion CreateBufferRegion(const void* ptr, uint64_t size_in_bytes) { return m_subAllocator->CreateBufferRegion(ptr, size_in_bytes); } + ComPtr CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes) { return m_subAllocator->CreateManagedBufferRegion(ptr, size_in_bytes); } + AllocationInfo* GetAllocationInfo(const void* ptr) { return m_subAllocator->GetAllocationInfo(ptr); } + void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { m_subAllocator->SetDefaultRoundingMode(roundingMode); } + BucketizedBufferAllocator* GetSubAllocator() const { return m_subAllocator.get(); } private: // This allocator is managed by ORT and should be used to allocate/free memory in order @@ -34,6 +37,6 @@ namespace Dml // This allocator is specific to DML and is used to decode the opaque data returned by the BFC // allocator into objects that DML understands - BucketizedBufferAllocator* m_subAllocator; + std::shared_ptr m_subAllocator; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 3a28ea6f7b47b..85ffbddb84989 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -121,7 +121,7 @@ namespace Dml ORT_TRY { void* opaqueData = m_bfcAllocator->Alloc(size); - auto bufferRegion = m_subAllocator->CreateManagedBufferRegion(opaqueData, size); + auto bufferRegion = m_gpuAllocator->CreateManagedBufferRegion(opaqueData, size); bufferRegion.CopyTo(managedBufferRegion); return S_OK; } @@ -133,7 +133,7 @@ namespace Dml MLOperatorTensor mlOperatorTensor(tensor); void* data = mlOperatorTensor.GetByteData(); auto sizeInBytes = mlOperatorTensor.GetUnalignedTensorByteSize(); - return m_subAllocator->CreateBufferRegion(data, sizeInBytes); + return m_gpuAllocator->CreateBufferRegion(data, sizeInBytes); } ID3D12Resource* __stdcall ExecutionProviderImpl::DecodeResource(IMLOperatorTensor* tensor) const noexcept @@ -200,7 +200,7 @@ namespace Dml m_bfcAllocator = onnxruntime::CreateAllocator(memoryInfo); // Wrap the BFC allocator into our own allocator - m_gpuAllocator = std::make_shared(m_bfcAllocator.get(), m_subAllocator.get()); + m_gpuAllocator = std::make_shared(m_bfcAllocator.get(), m_subAllocator); m_context->SetAllocator(m_gpuAllocator); @@ -844,7 +844,7 @@ namespace Dml void ExecutionProviderImpl::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { - m_subAllocator->SetDefaultRoundingMode(roundingMode); + m_gpuAllocator->SetDefaultRoundingMode(roundingMode); } void ExecutionProviderImpl::ReleaseCompletedReferences() @@ -861,21 +861,21 @@ namespace Dml void ExecutionProviderImpl::GetABIDataInterface(void* data, IUnknown** abiData) const { assert(!m_closed); - auto uavResource = m_subAllocator->GetAllocationInfo(data)->GetUavResource(); + auto uavResource = m_gpuAllocator->GetAllocationInfo(data)->GetUavResource(); uavResource->AddRef(); *abiData = uavResource; } void ExecutionProviderImpl::GetManagedBufferRegion(void* data, uint64_t size, DmlManagedBufferRegion** abiData) const { - auto managedBufferRegion = 
m_subAllocator->CreateManagedBufferRegion(data, size); + auto managedBufferRegion = m_gpuAllocator->CreateManagedBufferRegion(data, size); ORT_THROW_IF_FAILED(managedBufferRegion.CopyTo(abiData)); } uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(void* data, bool isInternalOperator) { assert(!isInternalOperator); - return m_subAllocator->GetAllocationInfo(data)->GetPooledResourceId(); + return m_gpuAllocator->GetAllocationInfo(data)->GetPooledResourceId(); } void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState( From 43c47b99d87831af751e1a367a8ea13dc253f7f4 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 24 Jan 2023 21:27:04 -0800 Subject: [PATCH 12/76] WIP --- .../src/DmlCommandRecorder.cpp | 11 ++-- .../src/DmlCommandRecorder.h | 7 ++- .../src/DmlGpuAllocator.cpp | 53 +++++++++++++++++++ .../src/DmlGpuAllocator.h | 30 +++++------ .../src/ExecutionContext.cpp | 3 +- .../src/ExecutionContext.h | 3 +- .../src/ExecutionProvider.cpp | 12 ++--- 7 files changed, 81 insertions(+), 38 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index 22161a6a58cbf..f60d11fcebf4d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -4,7 +4,6 @@ #include "precomp.h" #include "DmlCommandRecorder.h" #include "CommandQueue.h" -#include "BucketizedBufferAllocator.h" #include "absl/cleanup/cleanup.h" using namespace Dml; @@ -23,15 +22,11 @@ DmlCommandRecorder::DmlCommandRecorder( ORT_THROW_IF_FAILED(dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_recorder))); } -void DmlCommandRecorder::SetAllocator(std::weak_ptr allocator) +void DmlCommandRecorder::SetAllocator(std::weak_ptr allocator) { m_allocator = 
allocator; } -void DmlCommandRecorder::SetSubAllocator(std::weak_ptr subAllocator) -{ - m_subAllocator = subAllocator; -} void DmlCommandRecorder::InitializeOperator( IDMLCompiledOperator* op, @@ -74,7 +69,7 @@ void DmlCommandRecorder::InitializeOperator( } absl::Cleanup([allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); }); - auto subAllocator = m_subAllocator.lock(); + auto subAllocator = m_allocator.lock(); auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize); // Bind the temporary resource. @@ -154,7 +149,7 @@ void DmlCommandRecorder::ExecuteOperator( } absl::Cleanup([allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); }); - auto subAllocator = m_subAllocator.lock(); + auto subAllocator = m_allocator.lock(); auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize); // Bind the temporary resource. diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h index 2bf23062a49f7..e442df1f1df6c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h @@ -6,6 +6,7 @@ #include "ICommandRecorder.h" #include "CommandAllocatorRing.h" #include "core/framework/allocator.h" +#include "DmlGpuAllocator.h" namespace Dml { @@ -56,8 +57,7 @@ namespace Dml void Open() final; void CloseAndExecute() final; - void SetAllocator(std::weak_ptr allocator); - void SetSubAllocator(std::weak_ptr allocator); + void SetAllocator(std::weak_ptr allocator); bool HasUnsubmittedWork() override { @@ -84,8 +84,7 @@ namespace Dml ID3D12DescriptorHeap* m_currentDescriptorHeap = nullptr; // The weak pointer avoids a circular reference from context->recorder->allocator->context - std::weak_ptr m_allocator; - std::weak_ptr m_subAllocator; + std::weak_ptr 
m_allocator; CommandAllocatorRing<2> m_commandAllocatorRing; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp new file mode 100644 index 0000000000000..8e8db740b41de --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp @@ -0,0 +1,53 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "precomp.h" +#include "DmlGpuAllocator.h" +#include "core/framework/allocator.h" +#include "BucketizedBufferAllocator.h" + +namespace Dml +{ + DmlGpuAllocator::DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator) + : onnxruntime::IAllocator( + OrtMemoryInfo( + "DML", + OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0) + ) + ), + m_bfcAllocator(bfcAllocator), + m_subAllocator(std::move(subAllocator)) {} + + void* DmlGpuAllocator::Alloc(size_t size_in_bytes) + { + return m_bfcAllocator->Alloc(size_in_bytes); + } + + void DmlGpuAllocator::Free(void* ptr) + { + m_bfcAllocator->Free(ptr); + } + + D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(const void* ptr, uint64_t size_in_bytes) + { + return m_subAllocator->CreateBufferRegion(ptr, size_in_bytes); + } + + ComPtr DmlGpuAllocator::CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes) + { + return m_subAllocator->CreateManagedBufferRegion(ptr, size_in_bytes); + } + + AllocationInfo* DmlGpuAllocator::GetAllocationInfo(const void* ptr) + { + return m_subAllocator->GetAllocationInfo(ptr); + } + + void DmlGpuAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) + { + m_subAllocator->SetDefaultRoundingMode(roundingMode); + } +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h index 1d4b35506afcb..b12c990d44565 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h @@ -4,31 +4,25 @@ #pragma once #include "core/framework/allocator.h" -#include "BucketizedBufferAllocator.h" +#include "DmlBufferRegion.h" +#include "DmlManagedBufferRegion.h" namespace Dml { + class BucketizedBufferAllocator; + class AllocationInfo; + class DmlGpuAllocator : public onnxruntime::IAllocator { public: - DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator) - : onnxruntime::IAllocator( - OrtMemoryInfo( - "DML", - OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0) - ) - ), - m_bfcAllocator(bfcAllocator), - m_subAllocator(std::move(subAllocator)) {} + DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator); - void* Alloc(size_t size_in_bytes) final { return m_bfcAllocator->Alloc(size_in_bytes); } - void Free(void* ptr) final { m_bfcAllocator->Free(ptr); } - D3D12BufferRegion CreateBufferRegion(const void* ptr, uint64_t size_in_bytes) { return m_subAllocator->CreateBufferRegion(ptr, size_in_bytes); } - ComPtr CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes) { return m_subAllocator->CreateManagedBufferRegion(ptr, size_in_bytes); } - AllocationInfo* GetAllocationInfo(const void* ptr) { return m_subAllocator->GetAllocationInfo(ptr); } - void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { m_subAllocator->SetDefaultRoundingMode(roundingMode); } - BucketizedBufferAllocator* GetSubAllocator() const { return m_subAllocator.get(); } + void* Alloc(size_t size_in_bytes) final; + void Free(void* ptr) final; + D3D12BufferRegion CreateBufferRegion(const void* ptr, uint64_t size_in_bytes); + ComPtr CreateManagedBufferRegion(const void* ptr, uint64_t 
size_in_bytes); + AllocationInfo* GetAllocationInfo(const void* ptr); + void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode); private: // This allocator is managed by ORT and should be used to allocate/free memory in order diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index c3415c4b9ea49..6a30e6cd1ad56 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -4,6 +4,7 @@ #include "precomp.h" #include "ExecutionContext.h" #include "CommandQueue.h" +#include "DmlGpuAllocator.h" namespace Dml { @@ -18,7 +19,7 @@ namespace Dml ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf()))); } - void ExecutionContext::SetAllocator(std::weak_ptr allocator) + void ExecutionContext::SetAllocator(std::weak_ptr allocator) { m_dmlRecorder.SetAllocator(allocator); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h index 6625ae83ffd1e..6e2d205f48ebd 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h @@ -10,6 +10,7 @@ namespace Dml { class CommandQueue; + class DmlGpuAllocator; // Asynchronously performs GPU work, and automatically manages command list recording and submission to queues. 
// Work submitted to the ExecutionContext is typically recorded onto a command list and may not immediately begin @@ -24,7 +25,7 @@ namespace Dml IDMLDevice* dmlDevice, ID3D12CommandQueue* queue); - void SetAllocator(std::weak_ptr allocator); + void SetAllocator(std::weak_ptr allocator); // Waits for flushed work, discards unflushed work, and discards associated references to // prevent circular references. Must be the last call on the object before destruction. diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 85ffbddb84989..1f6aafda1cc9b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -120,7 +120,7 @@ namespace Dml { ORT_TRY { - void* opaqueData = m_bfcAllocator->Alloc(size); + void* opaqueData = m_gpuAllocator->Alloc(size); auto bufferRegion = m_gpuAllocator->CreateManagedBufferRegion(opaqueData, size); bufferRegion.CopyTo(managedBufferRegion); return S_OK; @@ -182,7 +182,7 @@ namespace Dml m_context = std::make_shared(m_d3d12Device.Get(), m_dmlDevice.Get(), queue); - m_subAllocator = std::make_shared( + auto subAllocator = std::make_shared( m_d3d12Device.Get(), m_context, queue, @@ -193,14 +193,14 @@ namespace Dml // Create a BFC allocator that encapsulates our allocator onnxruntime::AllocatorCreationInfo memoryInfo( - [this](OrtDevice::DeviceId id) { - return std::make_unique(m_subAllocator.get()); + [subAllocator](OrtDevice::DeviceId id) { + return std::make_unique(subAllocator.get()); }); m_bfcAllocator = onnxruntime::CreateAllocator(memoryInfo); // Wrap the BFC allocator into our own allocator - m_gpuAllocator = std::make_shared(m_bfcAllocator.get(), m_subAllocator); + m_gpuAllocator = std::make_shared(m_bfcAllocator.get(), subAllocator); m_context->SetAllocator(m_gpuAllocator); @@ -1001,7 +1001,7 @@ 
namespace Dml ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr) { Dml::DmlGpuAllocator* pAllocationInfo = static_cast(allocator); - return pAllocationInfo->GetSubAllocator()->GetAllocationInfo(ptr)->GetUavResource(); + return pAllocationInfo->GetAllocationInfo(ptr)->GetUavResource(); } void FlushContext(onnxruntime::IExecutionProvider* provider) From d0eb5da576ae4fb24241115a80cba7176bfc7c9e Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 24 Jan 2023 22:11:22 -0800 Subject: [PATCH 13/76] WIP --- .../src/BucketizedBufferAllocator.cpp | 27 ------------------- .../src/BucketizedBufferAllocator.h | 1 - .../src/DmlAllocationInfo.cpp | 4 --- .../src/DmlBfcAllocator.h | 6 ++--- .../src/DmlGpuAllocator.cpp | 2 +- .../src/ExecutionProvider.cpp | 2 +- 6 files changed, 5 insertions(+), 37 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 66405dd5d2989..d3dbe19599bb9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -328,33 +328,6 @@ namespace Dml allocations_by_id_.erase(it); } - void BucketizedBufferAllocator::FreeResource(void* p, uint64_t pooledResourceId) - { - AllocationInfo *allocInfo = static_cast(p); - - assert(allocInfo != nullptr); // Can't free nullptr - - if (allocInfo->GetOwner() != this) - { - // This allocation doesn't belong to this allocator! - ORT_THROW_HR(E_INVALIDARG); - } - - // Free the underlying allocation once queued work has completed. 
-#ifdef _GAMING_XBOX - m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->DetachResourceWrapper().Get()).Get()); -#else - m_context->QueueReference(allocInfo->DetachResourceWrapper().Get()); -#endif - - #if _DEBUG - assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo); - m_outstandingAllocationsById.erase(allocInfo->GetId()); - #endif - - // The allocation info is already destructing at this point - } - absl::optional BucketizedBufferAllocator::TryReserveAllocationID() { // The mutex must already be held diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index b28bdba544766..16fc28049a583 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -106,7 +106,6 @@ namespace Dml static uint64_t GetBucketSizeFromIndex(gsl::index index); friend class AllocationInfo; - void FreeResource(void* p, uint64_t resourceId); std::vector m_pool; size_t m_currentAllocationId = 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp index 044e9e854d700..9af6933cd3ed7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp @@ -10,10 +10,6 @@ namespace Dml AllocationInfo::~AllocationInfo() { - if (m_owner) - { - m_owner->FreeResource(this, m_pooledResourceId); - } } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h index 458a65e63c0c4..f43aa769af0a9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h @@ -11,7 +11,7 @@ namespace Dml class DmlBfcAllocator : public onnxruntime::IAllocator { public: - DmlBfcAllocator(BucketizedBufferAllocator* subAllocator) + DmlBfcAllocator(std::shared_ptr subAllocator) : onnxruntime::IAllocator( OrtMemoryInfo( "DML", @@ -19,11 +19,11 @@ namespace Dml OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0) ) ), - m_subAllocator(subAllocator) {} + m_subAllocator(std::move(subAllocator)) {} void* Alloc(size_t size_in_bytes) { return m_subAllocator->Alloc(size_in_bytes); } void Free(void* ptr) { m_subAllocator->Free(ptr); } private: - BucketizedBufferAllocator* m_subAllocator; + std::shared_ptr m_subAllocator; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp index 8e8db740b41de..44df1c79aacbe 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp @@ -19,7 +19,7 @@ namespace Dml ) ), m_bfcAllocator(bfcAllocator), - m_subAllocator(std::move(subAllocator)) {} + m_subAllocator(subAllocator) {} void* DmlGpuAllocator::Alloc(size_t size_in_bytes) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 1f6aafda1cc9b..a9046f91c76c8 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -194,7 +194,7 @@ namespace Dml // Create a BFC allocator that encapsulates our allocator onnxruntime::AllocatorCreationInfo memoryInfo( [subAllocator](OrtDevice::DeviceId id) { - return std::make_unique(subAllocator.get()); + return std::make_unique(subAllocator); }); m_bfcAllocator = 
onnxruntime::CreateAllocator(memoryInfo); From 3385d20a7027023be873e70ad653146256682649 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 26 Jan 2023 11:20:18 -0800 Subject: [PATCH 14/76] Add buffer region size alignment --- .../DmlExecutionProvider/src/BucketizedBufferAllocator.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index d3dbe19599bb9..8438393544740 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -377,6 +377,11 @@ namespace Dml auto it = allocations_by_id_.find(tagged_ptr.allocation_id); ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end()); + // Make sure that we are aligned to 4 bytes to satisfy DML's requirements + constexpr uint64_t DML_ALIGNMENT = 4; + size_in_bytes = + (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; + return D3D12BufferRegion( tagged_ptr.offset, size_in_bytes, From 7e5622d29ec62e6498d2b46c5dd139ea47e802ea Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 26 Jan 2023 15:14:00 -0800 Subject: [PATCH 15/76] WIP --- .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp | 4 +++- .../DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index a9046f91c76c8..363f5897c98a9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -494,7 +494,9 @@ namespace Dml ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS : D3D12_RESOURCE_STATE_COPY_DEST; - m_context->CopyBufferRegion(dstData, 0, dstState, srcData, 0, srcState, dataSizeInBytes); + const uint64_t srcOffset = srcBufferRegion.Offset(); + const uint64_t dstOffset = dstBufferRegion.Offset(); + m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes); } else { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp index af983b26772d9..002a8f9192b31 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp @@ -50,13 +50,13 @@ class DmlOperatorCopy : public DmlOperator MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0); // Avoid self copying. - if (inputTensor.GetDataInterface().Get() != outputTensor.GetDataInterface().Get()) - { + // if (inputTensor.GetDataInterface().Get() != outputTensor.GetDataInterface().Get()) + // { // Copy elements from input tensor to output tensor. 
ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor( outputTensor.GetInterface().Get(), inputTensor.GetInterface().Get())); - } + // } } }; From e6897c50e80038f1b610c9c8932af9e6d4a6078b Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 26 Jan 2023 15:14:41 -0800 Subject: [PATCH 16/76] WIP --- .../src/Operators/DmlOperatorCopy.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp index 002a8f9192b31..4ca51633d23e7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp @@ -49,14 +49,10 @@ class DmlOperatorCopy : public DmlOperator // Reshape the output tensor. MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0); - // Avoid self copying. - // if (inputTensor.GetDataInterface().Get() != outputTensor.GetDataInterface().Get()) - // { - // Copy elements from input tensor to output tensor. - ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor( - outputTensor.GetInterface().Get(), - inputTensor.GetInterface().Get())); - // } + // Copy elements from input tensor to output tensor. 
+ ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor( + outputTensor.GetInterface().Get(), + inputTensor.GetInterface().Get())); } }; From 2064baa0888443a49c5e6905ae38e76f8f8c02c6 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Fri, 27 Jan 2023 02:24:52 -0800 Subject: [PATCH 17/76] WIP --- .../inc/IWinmlExecutionProvider.h | 9 +- .../src/BucketizedBufferAllocator.cpp | 44 +++---- .../src/BucketizedBufferAllocator.h | 5 +- .../src/DmlAllocationInfo.cpp | 4 + .../DmlExecutionProvider/src/DmlBuffer.cpp | 76 +++++++++++ .../dml/DmlExecutionProvider/src/DmlBuffer.h | 47 +++++++ ...DmlBufferRegion.cc => DmlBufferRegion.cpp} | 0 .../src/DmlCommandRecorder.cpp | 26 +--- .../src/DmlGpuAllocator.cpp | 11 +- .../src/DmlGpuAllocator.h | 4 +- .../src/DmlGraphFusionHelper.cpp | 7 +- .../src/DmlManagedBuffer.h | 20 +++ .../src/DmlManagedBufferRegion.h | 26 ---- .../src/ExecutionContext.h | 2 + .../src/ExecutionProvider.cpp | 31 +---- .../src/ExecutionProvider.h | 18 +-- .../src/FusedGraphKernel.cpp | 43 ++----- .../src/IExecutionProvider.h | 4 +- .../src/MLOperatorAuthorImpl.cpp | 120 +++--------------- .../src/MLOperatorAuthorImpl.h | 11 +- .../src/Operators/DmlDFT.h | 32 ----- .../src/Operators/DmlOperator.cpp | 39 ++---- .../src/Operators/DmlOperator.h | 4 +- .../src/dml_buffer_region.h | 97 ++++++++++++++ 24 files changed, 342 insertions(+), 338 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h rename onnxruntime/core/providers/dml/DmlExecutionProvider/src/{DmlBufferRegion.cc => DmlBufferRegion.cpp} (100%) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBuffer.h delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index a56f03e50a9e1..ccde56e5d712d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -39,14 +39,7 @@ namespace Windows::AI::MachineLearning::Adapter // the provider's underlying queues. virtual void QueueReference(IUnknown *object) = 0; - virtual void GetABIDataInterface( - void* data, - IUnknown** abiData) const = 0; - - virtual void GetManagedBufferRegion( - void* data, - uint64_t size, - Dml::DmlManagedBufferRegion** abiData) const = 0; + virtual ID3D12Resource* GetABIDataInterface(void* data) const = 0; virtual uint64_t TryGetPooledAllocationId( void* data, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 8438393544740..b0ddd29e6bf46 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -7,7 +7,6 @@ #include "BucketizedBufferAllocator.h" #include "DmlReservedResourceWrapper.h" #include "DmlBufferRegion.h" -#include "DmlManagedBufferRegion.h" namespace Dml { @@ -328,6 +327,24 @@ namespace Dml allocations_by_id_.erase(it); } + void BucketizedBufferAllocator::FreeResource(AllocationInfo* allocInfo, uint64_t pooledResourceId) + { + // Since this allocator is warapped by ORT's BFC allocator, it's possible that the context is already + // close at this point if the application is winding down. + if (!m_context->Closed()) + { + assert(allocInfo != nullptr); // Can't free nullptr + + if (allocInfo->GetOwner() != this) + { + // This allocation doesn't belong to this allocator! 
+ ORT_THROW_HR(E_INVALIDARG); + } + + m_context->QueueReference(allocInfo); + } + } + absl::optional BucketizedBufferAllocator::TryReserveAllocationID() { // The mutex must already be held @@ -390,31 +407,6 @@ namespace Dml it->second->GetCopyDstResource()); } - ComPtr BucketizedBufferAllocator::CreateManagedBufferRegion( - const void* ptr, - uint64_t size_in_bytes) - { - ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr); - - TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); - - // We need to access (mutable) state after this point, so we need to lock - std::unique_lock lock(mutex_); - - // Find the allocation corresponding to this pointer - auto it = allocations_by_id_.find(tagged_ptr.allocation_id); - ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end()); - - D3D12BufferRegion bufferRegion( - tagged_ptr.offset, - size_in_bytes, - it->second->GetUavResource(), - it->second->GetCopySrcResource(), - it->second->GetCopyDstResource()); - - return wil::MakeOrThrow(it->second, std::move(bufferRegion)); - } - AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(const void* ptr) { ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 16fc28049a583..f2c09dfa0cfc4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -63,14 +63,11 @@ namespace Dml const void* ptr, uint64_t size_in_bytes); - ComPtr CreateManagedBufferRegion( - const void* ptr, - uint64_t size_in_bytes); - AllocationInfo* GetAllocationInfo(const void* ptr); void* Alloc(size_t size_in_bytes); void Free(void* ptr); + void FreeResource(AllocationInfo* allocInfo, uint64_t pooledResourceId); uint64_t ComputeRequiredSize(size_t size); bool TilingEnabled() const { return tiling_enabled_; 
}; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp index 9af6933cd3ed7..044e9e854d700 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp @@ -10,6 +10,10 @@ namespace Dml AllocationInfo::~AllocationInfo() { + if (m_owner) + { + m_owner->FreeResource(this, m_pooledResourceId); + } } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp new file mode 100644 index 0000000000000..6f587261553e6 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp @@ -0,0 +1,76 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "precomp.h" +#include "DmlBuffer.h" +#include "DmlGpuAllocator.h" + +namespace Dml +{ + +/*explicit*/ DmlBuffer::DmlBuffer(DmlGpuAllocator* allocator, uint64_t size_in_bytes) + : allocator_(allocator) +{ + m_opaqueData = allocator_->Alloc(size_in_bytes); + ORT_THROW_HR_IF(E_OUTOFMEMORY, m_opaqueData == nullptr); + + buffer_region_ = allocator_->CreateBufferRegion(m_opaqueData, size_in_bytes); +} + +DmlBuffer::~DmlBuffer() +{ + if (m_opaqueData != nullptr) + { + allocator_->Free(m_opaqueData); + } +} + +DmlBuffer::DmlBuffer(DmlBuffer&& other) +{ + m_opaqueData = other.m_opaqueData; + allocator_ = other.allocator_; + buffer_region_ = std::move(other.buffer_region_); + other.m_opaqueData = nullptr; +} + +DmlBuffer& DmlBuffer::operator=(DmlBuffer&& other) +{ + m_opaqueData = other.m_opaqueData; + allocator_ = other.allocator_; + buffer_region_ = std::move(other.buffer_region_); + other.m_opaqueData = nullptr; + return *this; +} + +ID3D12Resource* DmlBuffer::ResourceInUavState() const +{ + return 
buffer_region_.ResourceInUavState(); +} + +ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const +{ + return buffer_region_.ResourceInCopySrcState(); +} + +ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const +{ + return buffer_region_.ResourceInCopyDstState(); +} + +uint64_t DmlBuffer::Offset() const +{ + return buffer_region_ ? buffer_region_.Offset() : 0; +} + +uint64_t DmlBuffer::SizeInBytes() const +{ + return buffer_region_ ? buffer_region_.SizeInBytes() : 0; +} + +DML_BUFFER_BINDING DmlBuffer::GetBufferBinding() const +{ + return buffer_region_ ? buffer_region_.GetBufferBinding() + : DML_BUFFER_BINDING{}; +} + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h new file mode 100644 index 0000000000000..b98ae727e1a65 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "DmlBufferRegion.h" + +namespace Dml +{ + +class DmlGpuAllocator; +class OpKernelContext; + +// Owns a D3D12 default heap buffer allocated using the DML device's +// allocator. This is essentially a convenience wrapper over a device memory +// allocation as well as the buffer region that spans it. When this object is +// destructed, the device memory is freed to the allocator. 
+class DmlBuffer +{ + public: + explicit DmlBuffer(DmlGpuAllocator* allocator, uint64_t size_in_bytes); + ~DmlBuffer(); + + // Move-only + DmlBuffer(const DmlBuffer&) = delete; + DmlBuffer& operator=(const DmlBuffer&) = delete; + DmlBuffer(DmlBuffer&&); + DmlBuffer& operator=(DmlBuffer&&); + + ID3D12Resource* ResourceInUavState() const; + ID3D12Resource* ResourceInCopySrcState() const; + ID3D12Resource* ResourceInCopyDstState() const; + uint64_t Offset() const; + uint64_t SizeInBytes() const; + const D3D12BufferRegion& Region() const { return buffer_region_; } + + DML_BUFFER_BINDING GetBufferBinding() const; + + explicit operator bool() const { return !!buffer_region_; } + + private: + DmlGpuAllocator* allocator_; + D3D12BufferRegion buffer_region_; + void* m_opaqueData; +}; + +} // namespace tfdml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp similarity index 100% rename from onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc rename to onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index f60d11fcebf4d..af625334b7720 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -4,7 +4,6 @@ #include "precomp.h" #include "DmlCommandRecorder.h" #include "CommandQueue.h" -#include "absl/cleanup/cleanup.h" using namespace Dml; @@ -62,21 +61,12 @@ void DmlCommandRecorder::InitializeOperator( // Allocate and immediately free a temporary buffer. The buffer resource will still be // alive (managed by the pool); freeing allows the resource to be shared with other operators. 
- void* tempResourceHandle = allocator->Alloc(static_cast(temporaryResourceSize)); - if (!tempResourceHandle) - { - ORT_THROW_HR(E_OUTOFMEMORY); - } - absl::Cleanup([allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); }); - - auto subAllocator = m_allocator.lock(); - auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize); + auto buffer = allocator->AllocateDefaultBuffer(temporaryResourceSize); // Bind the temporary resource. - DML_BUFFER_BINDING bufferBinding = bufferRegion.GetBufferBinding(); + DML_BUFFER_BINDING bufferBinding = buffer.GetBufferBinding(); DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding }; bindingTable->BindTemporaryResource(&bindingDesc); - allocator->Free(tempResourceHandle); } // Bind inputs, if provided. @@ -142,18 +132,10 @@ void DmlCommandRecorder::ExecuteOperator( // Allocate and immediately free a temporary buffer. The buffer resource will still be // alive (managed by the pool); freeing allows the resource to be shared with other operators. - void* tempResourceHandle = allocator->Alloc(static_cast(temporaryResourceSize)); - if (!tempResourceHandle) - { - ORT_THROW_HR(E_OUTOFMEMORY); - } - absl::Cleanup([allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); }); - - auto subAllocator = m_allocator.lock(); - auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize); + auto buffer = allocator->AllocateDefaultBuffer(temporaryResourceSize); // Bind the temporary resource. 
- DML_BUFFER_BINDING bufferBinding = bufferRegion.GetBufferBinding(); + DML_BUFFER_BINDING bufferBinding = buffer.GetBufferBinding(); DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding }; bindingTable->BindTemporaryResource(&bindingDesc); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp index 44df1c79aacbe..13e0d8dfe96f7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp @@ -36,11 +36,6 @@ namespace Dml return m_subAllocator->CreateBufferRegion(ptr, size_in_bytes); } - ComPtr DmlGpuAllocator::CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes) - { - return m_subAllocator->CreateManagedBufferRegion(ptr, size_in_bytes); - } - AllocationInfo* DmlGpuAllocator::GetAllocationInfo(const void* ptr) { return m_subAllocator->GetAllocationInfo(ptr); @@ -50,4 +45,10 @@ namespace Dml { m_subAllocator->SetDefaultRoundingMode(roundingMode); } + + DmlBuffer DmlGpuAllocator::AllocateDefaultBuffer(uint64_t num_bytes) + { + return DmlBuffer(this, num_bytes); + } + } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h index b12c990d44565..5ef9ea855753f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h @@ -5,7 +5,7 @@ #include "core/framework/allocator.h" #include "DmlBufferRegion.h" -#include "DmlManagedBufferRegion.h" +#include "DmlBuffer.h" namespace Dml { @@ -20,9 +20,9 @@ namespace Dml void* Alloc(size_t size_in_bytes) final; void Free(void* ptr) final; D3D12BufferRegion CreateBufferRegion(const void* ptr, uint64_t size_in_bytes); - ComPtr 
CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes); AllocationInfo* GetAllocationInfo(const void* ptr); void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode); + DmlBuffer AllocateDefaultBuffer(uint64_t num_bytes); private: // This allocator is managed by ORT and should be used to allocate/free memory in order diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index 890c5aa1ae384..ffd388f91cace 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -95,12 +95,11 @@ namespace DmlGraphFusionHelper uint64_t* allocId) { void* opaqueData = const_cast(tensor->DataRaw()); - Microsoft::WRL::ComPtr resourceUnk; - winmlProvider->GetABIDataInterface(opaqueData, &resourceUnk); + ID3D12Resource* abiDataInterface = winmlProvider->GetABIDataInterface(opaqueData); + abiDataInterface->AddRef(); *allocId = winmlProvider->TryGetPooledAllocationId(opaqueData, 0); - - ORT_THROW_IF_FAILED(resourceUnk->QueryInterface(resource)); + *resource = abiDataInterface; } void ProcessInputData( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBuffer.h new file mode 100644 index 0000000000000..ced81af68e92e --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBuffer.h @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "DmlBuffer.h" + +namespace Dml +{ + // Light wrapper around DmlBuffer used with CommandQueue::QueueReference to keep a reference on the buffer until GPU work is completed + class DmlManagedBuffer : public Microsoft::WRL::RuntimeClass, IUnknown> + { + public: + DmlManagedBuffer(DmlBuffer&& buffer) : m_buffer(std::move(buffer)) {} + uint64_t SizeInBytes() const { return m_buffer.SizeInBytes(); } + + private: + DmlBuffer m_buffer; + }; +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h deleted file mode 100644 index de39f0890f998..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "DmlBufferRegion.h" -#include "DmlAllocationInfo.h" - -namespace Dml -{ - class DmlManagedBufferRegion : public Microsoft::WRL::RuntimeClass, IUnknown> - { - public: - DmlManagedBufferRegion(Microsoft::WRL::ComPtr allocation, D3D12BufferRegion&& bufferRegion) - : m_allocation(std::move(allocation)), - m_bufferRegion(std::move(bufferRegion)) - { - } - - const D3D12BufferRegion& GetBufferRegion() const { return m_bufferRegion; } - - private: - Microsoft::WRL::ComPtr m_allocation; - D3D12BufferRegion m_bufferRegion; - }; -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h index 6e2d205f48ebd..e4ef79081ad14 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h @@ -87,6 +87,8 @@ namespace Dml D3D12_COMMAND_LIST_TYPE GetCommandListTypeForQueue() const; + bool Closed() const { return m_closed; } + private: ComPtr 
m_d3dDevice; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 363f5897c98a9..20f8feed12311 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -20,9 +20,9 @@ #include "core/framework/fallback_cpu_capability.h" #include "DmlCommittedResourceWrapper.h" #include "DmlBufferRegion.h" -#include "DmlManagedBufferRegion.h" #include "DmlBfcAllocator.h" #include "DmlGpuAllocator.h" +#include "DmlBuffer.h" #ifdef ERROR #undef ERROR @@ -113,19 +113,9 @@ namespace Dml m_context->GetCurrentCompletionEvent().WaitForSignal(); } - HRESULT __stdcall ExecutionProviderImpl::AllocatePooledResource( - size_t size, - DmlManagedBufferRegion** managedBufferRegion - ) const noexcept + DmlBuffer ExecutionProviderImpl::AllocatePooledResource(size_t size) const { - ORT_TRY - { - void* opaqueData = m_gpuAllocator->Alloc(size); - auto bufferRegion = m_gpuAllocator->CreateManagedBufferRegion(opaqueData, size); - bufferRegion.CopyTo(managedBufferRegion); - return S_OK; - } - ORT_CATCH_RETURN + return m_gpuAllocator->AllocateDefaultBuffer(size); } D3D12BufferRegion ExecutionProviderImpl::GetBufferForTensor(IMLOperatorTensor* tensor) const @@ -860,18 +850,9 @@ namespace Dml m_context->QueueReference(object); } - void ExecutionProviderImpl::GetABIDataInterface(void* data, IUnknown** abiData) const - { - assert(!m_closed); - auto uavResource = m_gpuAllocator->GetAllocationInfo(data)->GetUavResource(); - uavResource->AddRef(); - *abiData = uavResource; - } - - void ExecutionProviderImpl::GetManagedBufferRegion(void* data, uint64_t size, DmlManagedBufferRegion** abiData) const + ID3D12Resource* ExecutionProviderImpl::GetABIDataInterface(void* data) const { - auto managedBufferRegion = m_gpuAllocator->CreateManagedBufferRegion(data, size); - 
ORT_THROW_IF_FAILED(managedBufferRegion.CopyTo(abiData)); + return m_gpuAllocator->GetAllocationInfo(data)->GetUavResource(); } uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(void* data, bool isInternalOperator) @@ -965,7 +946,7 @@ namespace Dml std::shared_ptr ExecutionProviderImpl::GetGpuAllocator() { - return m_bfcAllocator; + return m_gpuAllocator; } std::shared_ptr ExecutionProviderImpl::GetCpuInputAllocator() diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index eec8f08848833..20cb307b1cdb4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -6,6 +6,7 @@ #include "GraphTransformer.h" #include "core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h" #include "DmlBufferRegion.h" +#include "DmlBuffer.h" #include #include @@ -26,7 +27,6 @@ namespace Dml class BucketizedBufferAllocator; class DmlCpuAllocator; class ExecutionProvider; - class DmlManagedBufferRegion; class DmlGpuAllocator; class ExecutionProviderImpl : public WRL::BaseGetBindingProperties().PersistentResourceSize; if (persistentResourceSize > 0) { - ComPtr managedBufferRegion; - ORT_THROW_IF_FAILED(m_provider->AllocatePooledResource( - static_cast(persistentResourceSize), - managedBufferRegion.GetAddressOf())); - - managedBufferRegion.As(&m_persistentResourceAllocatorUnk); - m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState(); - m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding(); + auto buffer = m_provider->AllocatePooledResource(persistentResourceSize); + m_persistentResourceBinding = buffer.GetBufferBinding(); + m_managedPersistentBuffer = wil::MakeOrThrow(std::move(buffer)); + m_winmlProvider->QueueReference(m_managedPersistentBuffer.Get()); } 
ORT_THROW_IF_FAILED(m_provider->InitializeOperator( @@ -81,7 +77,6 @@ namespace Dml // Queue references to objects which must be kept alive until resulting GPU work completes m_winmlProvider->QueueReference(m_compiledExecutionPlanOperator.Get()); - m_winmlProvider->QueueReference(m_persistentResourceAllocatorUnk.Get()); std::for_each( initializeResourceRefs.begin(), @@ -145,7 +140,7 @@ namespace Dml // Queue references to objects which must be kept alive until resulting GPU work completes m_winmlProvider->QueueReference(m_compiledExecutionPlanOperator.Get()); - m_winmlProvider->QueueReference(m_persistentResourceAllocatorUnk.Get()); + m_winmlProvider->QueueReference(m_managedPersistentBuffer.Get()); } else { @@ -359,25 +354,10 @@ namespace Dml if (execBindingProps.TemporaryResourceSize > 0) { - // Allocate temporary data which will automatically be freed when the GPU work - // which is scheduled up to the point that this method returns has completed. - ComPtr tempAlloc; - uint64_t tempAllocId = 0; - ORT_THROW_IF_FAILED(contextWrapper.AllocateTemporaryData(static_cast(execBindingProps.TemporaryResourceSize), tempAlloc.GetAddressOf(), &tempAllocId)); - - ComPtr managedBufferRegion; - m_winmlProvider->GetManagedBufferRegion(tempAlloc.Get(), execBindingProps.TemporaryResourceSize, &managedBufferRegion); - - // Bind the temporary resource. 
- DML_BUFFER_BINDING tempBufferBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding(); + auto buffer = m_provider->AllocatePooledResource(execBindingProps.TemporaryResourceSize); + DML_BUFFER_BINDING tempBufferBinding = buffer.GetBufferBinding(); DML_BINDING_DESC tempBindingDesc = { DML_BINDING_TYPE_BUFFER, &tempBufferBinding }; - - if (!tempAllocId || m_tempBindingAllocId != tempAllocId) - { - m_bindingTable->BindTemporaryResource(&tempBindingDesc); - } - - m_tempBindingAllocId = tempAllocId; + m_bindingTable->BindTemporaryResource(&tempBindingDesc); } // Execute the command list and if it succeeds, update the fence value at which this command may be @@ -401,7 +381,7 @@ namespace Dml m_winmlProvider->QueueReference(WRAP_GRAPHICS_UNKNOWN(m_graphicsCommandList).Get()); m_winmlProvider->QueueReference(WRAP_GRAPHICS_UNKNOWN(m_heap).Get()); m_winmlProvider->QueueReference(m_bindingTable.Get()); - m_winmlProvider->QueueReference(m_persistentResourceAllocatorUnk.Get()); + m_winmlProvider->QueueReference(m_managedPersistentBuffer.Get()); } ComPtr m_compiledExecutionPlanOperator; @@ -418,12 +398,11 @@ namespace Dml ComPtr m_bindingTable; std::optional m_persistentResourceBinding; ComPtr m_persistentResource; - ComPtr m_persistentResourceAllocatorUnk; // Controls when the persistent resource is returned to the allocator + ComPtr m_managedPersistentBuffer; // Bindings from previous executions of a re-used command list mutable std::vector m_inputBindingAllocIds; mutable std::vector m_outputBindingAllocIds; - mutable uint64_t m_tempBindingAllocId = 0; // Fence tracking the status of the command list's last execution, and whether its descriptor heap // can safely be updated. 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h index 4bef0652763a9..61cd34339f04a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h @@ -3,6 +3,7 @@ #pragma once #include "core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h" +#include "DmlBuffer.h" namespace Dml { @@ -67,9 +68,10 @@ namespace Dml STDMETHOD_(void, Flush)() const noexcept = 0; STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept = 0; - STDMETHOD(AllocatePooledResource(size_t size, DmlManagedBufferRegion** pooledResource)) const noexcept = 0; STDMETHOD_(bool, IsMcdmDevice)() const noexcept = 0; STDMETHOD_(bool, MetacommandsEnabled)() const noexcept = 0; + + virtual DmlBuffer AllocatePooledResource(size_t size) const = 0; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index d601a2b3b4025..55a33cc8513b5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -10,6 +10,7 @@ #include "MLOperatorAuthorImpl.h" #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h" using namespace Microsoft::WRL; @@ -102,13 +103,9 @@ namespace Windows::AI::MachineLearning::Adapter // Translate the data object stored in a tensor to the type which will be returned through // the ABI. The translation is determined by the provider and based on options with which the // kernels are registered. 
- void TranslateAllocationDataToAbi( - IWinmlExecutionProvider* winmlProvider, - const ::OrtMemoryInfo& allocInfo, - void* opaqueData, - IUnknown** abiAllocation) + ID3D12Resource* TranslateAllocationDataToAbi(IWinmlExecutionProvider* winmlProvider, void* opaqueData) { - winmlProvider->GetABIDataInterface(opaqueData, abiAllocation); + return winmlProvider->GetABIDataInterface(opaqueData); } // @@ -1281,11 +1278,7 @@ namespace Windows::AI::MachineLearning::Adapter { // Get the actual object to be returned from the ABI, which varies for internal and external // kernels (i.e. ID3D12Resource, versus something that tracks the layout). - TranslateAllocationDataToAbi( - m_winmlExecutionProvider.Get(), - m_impl->Location(), - m_tensorData, - m_abiDataInterface.GetAddressOf()); + m_abiDataInterface = TranslateAllocationDataToAbi(m_winmlExecutionProvider.Get(), m_tensorData); } } } @@ -1377,55 +1370,8 @@ namespace Windows::AI::MachineLearning::Adapter } else { - m_abiDataInterface.CopyTo(dataInterface); - } - } - - void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp) - { - if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) - { - std::vector resourcesToTransition; - resourcesToTransition.reserve(m_inputTensors.size() + m_outputTensors.size() + m_temporaryAllocations.size()); - - for (uint32_t i = 0; i < m_inputTensors.size(); ++i) - { - ComPtr tensor; - ORT_THROW_IF_FAILED(GetInputTensor(i, tensor.GetAddressOf())); - - if (tensor) - { - ComPtr resource; - tensor->GetDataInterface(resource.GetAddressOf()); - if (resource) - { - resourcesToTransition.push_back(resource.Get()); - } - } - } - - for (uint32_t i = 0; i < m_outputTensors.size(); ++i) - { - ComPtr tensor; - ORT_THROW_IF_FAILED(GetOutputTensor(i, tensor.GetAddressOf())); - - ComPtr resource; - tensor->GetDataInterface(resource.GetAddressOf()); - if (resource) - { - resourcesToTransition.push_back(resource.Get()); - } - } - - for (auto& tempAlloc : 
m_temporaryAbiAllocations) - { - resourcesToTransition.push_back(tempAlloc.Get()); - } - - m_winmlProvider->TransitionResourcesForOperator( - isBeforeOp, - gsl::narrow_cast(resourcesToTransition.size()), - resourcesToTransition.data()); + m_abiDataInterface->AddRef(); + *dataInterface = m_abiDataInterface; } } @@ -1457,8 +1403,6 @@ namespace Windows::AI::MachineLearning::Adapter { m_winmlProvider->GetABIExecutionInterfaceAndInvalidateState(isInternalOperator, m_abiExecutionObject.ReleaseAndGetAddressOf()); } - - TransitionResourcesForOperatorIfRequired(true); } } @@ -1471,18 +1415,12 @@ namespace Windows::AI::MachineLearning::Adapter { if (m_winmlProvider) { - m_temporaryAllocations.clear(); - m_temporaryAbiAllocations.clear(); + m_temporaryBuffers.clear(); } } void OpKernelContextWrapper::Close() { - if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) - { - TransitionResourcesForOperatorIfRequired(false); - } - for (auto& tensor : m_inputTensors) { if (tensor) @@ -1610,16 +1548,6 @@ namespace Windows::AI::MachineLearning::Adapter } HRESULT STDMETHODCALLTYPE OpKernelContextWrapper::AllocateTemporaryData(size_t size, IUnknown** abiAllocation) const - { - ORT_TRY - { - uint64_t allocId; - return AllocateTemporaryData(size, abiAllocation, &allocId); - } - ORT_CATCH_RETURN - } - - HRESULT STDMETHODCALLTYPE OpKernelContextWrapper::AllocateTemporaryData(size_t size, IUnknown** abiAllocation, uint64_t* allocId) const { ORT_TRY { @@ -1634,21 +1562,13 @@ namespace Windows::AI::MachineLearning::Adapter return E_FAIL; } - ComPtr allocation; - allocation.Attach(static_cast(alloc->Alloc(size))); - - *allocId = m_winmlProvider->TryGetPooledAllocationId(allocation.Get(), 0); - - TranslateAllocationDataToAbi(m_winmlProvider.Get(), alloc->Info(), allocation.Get(), abiAllocation); - - if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) - { - m_winmlProvider->TransitionResourcesForOperator(true, 1, abiAllocation); - } + 
auto dml_gpu_allocator = static_cast(alloc.get()); + auto buffer = dml_gpu_allocator->AllocateDefaultBuffer(size); + buffer.ResourceInUavState()->AddRef(); + *abiAllocation = buffer.ResourceInUavState(); // Ensure the allocation is freed and transitioned when the context destructs - m_temporaryAllocations.push_back(allocation); - m_temporaryAbiAllocations.push_back(*abiAllocation); + m_temporaryBuffers.push_back(std::move(buffer)); return S_OK; } @@ -1953,14 +1873,16 @@ namespace Windows::AI::MachineLearning::Adapter } } - ComPtr kernelContextWrapper = wil::MakeOrThrow( - context, - Info().GetExecutionProvider(), - m_internalOperator, - m_requiresOutputShapesAtCreation ? &m_inferredOutputShapes : nullptr); + { + ComPtr kernelContextWrapper = wil::MakeOrThrow( + context, + Info().GetExecutionProvider(), + m_internalOperator, + m_requiresOutputShapesAtCreation ? &m_inferredOutputShapes : nullptr); - ORT_THROW_IF_FAILED(m_kernel->Compute(kernelContextWrapper.Get())); - kernelContextWrapper->Close(); + ORT_THROW_IF_FAILED(m_kernel->Compute(kernelContextWrapper.Get())); + kernelContextWrapper->Close(); + } // Ensure that scheduled work, if any, is completed before freeing the kernel if the execution // provider requires this. 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index 7e308989791f8..31f7e3fbeee8b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -9,6 +9,7 @@ #include "core/framework/tensorprotoutils.h" #include #include +#include "core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h" interface IDMLOperator; @@ -285,10 +286,7 @@ class TensorWrapper : public WRL::Base, public Closable void* m_tensorData = nullptr; bool m_isDataInterface = false; - // The returned data may be a converted shadow copy, and the piece of it which - // is returned may vary according to kernel registration options. - ComPtr m_abiDataInterface; - + ID3D12Resource* m_abiDataInterface; }; class OnnxTensorWrapper : public WRL::Base, public Closable @@ -449,9 +447,7 @@ class OpKernelContextWrapper : public WRL::Base, publi HRESULT STDMETHODCALLTYPE GetInputTensor(uint32_t inputIndex, IMLOperatorTensor** tensor) const noexcept override; HRESULT STDMETHODCALLTYPE GetOutputTensor(uint32_t outputIndex, IMLOperatorTensor** tensor) noexcept override; HRESULT STDMETHODCALLTYPE GetOutputTensor(uint32_t outputIndex, uint32_t dimensions, const uint32_t* dimensionSizes, IMLOperatorTensor** tensor) noexcept override; - HRESULT STDMETHODCALLTYPE AllocateTemporaryData(size_t size, IUnknown** data) const; - HRESULT STDMETHODCALLTYPE AllocateTemporaryData(size_t size, IUnknown** data, uint64_t* allocId) const; void STDMETHODCALLTYPE GetExecutionInterface(IUnknown** executionInterface) const noexcept override; @@ -481,8 +477,7 @@ class OpKernelContextWrapper : public WRL::Base, publi // Temporary allocations created by the kernel. These will be freed to the allocator following // Compute being called on the kernel. This list is used to maintain their lifetime. 
- mutable std::vector> m_temporaryAllocations; - mutable std::vector> m_temporaryAbiAllocations; + mutable std::vector m_temporaryBuffers; }; class AbiOpKernel : public onnxruntime::OpKernel diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h index be5b6a1fe9ada..e545f33fdb8d5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h @@ -416,23 +416,6 @@ class GpuDFTOperator : public WRL::Base auto inputResource = loopList[0].Resource.Get(); auto outputResource = loopList[stockhamParams.OutputIndex].Resource.Get(); - // Transition resources from common to UAV state - D3D12_RESOURCE_BARRIER barriers[2]; - - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputResource, - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputResource, - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - commandList->ResourceBarrier(2, barriers); - // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_rootSignature.Get()); commandList->SetPipelineState(m_pipelineState.Get()); @@ -471,21 +454,6 @@ class GpuDFTOperator : public WRL::Base constants.DFTIteration = index + 1; Dispatch(in, out, constants, commandList); } - - // Transition resources to common state - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputResource, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputResource, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - commandList->ResourceBarrier(2, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index 2d99c8a6dd6df..3801ff6c28404 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -3,7 +3,7 @@ #include "precomp.h" #include "DmlOperator.h" -#include "../DmlManagedBufferRegion.h" +#include "../DmlManagedBuffer.h" namespace Dml { @@ -94,14 +94,9 @@ namespace Dml UINT64 persistentResourceSize = m_compiledOperator->GetBindingProperties().PersistentResourceSize; if (persistentResourceSize > 0) { - ComPtr managedBufferRegion; - ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource( - static_cast(persistentResourceSize), - managedBufferRegion.GetAddressOf())); - - managedBufferRegion.As(&m_persistentResourcePoolingUnk); - m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState(); - m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding(); + auto buffer = m_executionProvider->AllocatePooledResource(persistentResourceSize); + m_persistentResourceBinding = buffer.GetBufferBinding(); + m_managedPersistentBuffer = wil::MakeOrThrow(std::move(buffer)); } std::vector initializationInputBindings(m_kernelInputIndices.size()); @@ -194,14 +189,9 @@ namespace Dml UINT64 persistentResourceSize = m_compiledOperator->GetBindingProperties().PersistentResourceSize; if (persistentResourceSize > 0) { - ComPtr managedBufferRegion; - ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource( - static_cast(persistentResourceSize), - managedBufferRegion.GetAddressOf())); - - managedBufferRegion.As(&m_persistentResourcePoolingUnk); - m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState(); - m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding(); + auto buffer = 
m_executionProvider->AllocatePooledResource(persistentResourceSize); + m_persistentResourceBinding = buffer.GetBufferBinding(); + m_managedPersistentBuffer = wil::MakeOrThrow(std::move(buffer)); } std::vector initializationInputBindings(m_kernelInputIndices.size()); @@ -229,18 +219,11 @@ namespace Dml UINT64 persistentResourceSize = m_compiledOperator->GetBindingProperties().PersistentResourceSize; if (persistentResourceSize > 0) { - if (!m_persistentResource || m_persistentResource->GetDesc().Width < persistentResourceSize) + if (!m_managedPersistentBuffer || m_managedPersistentBuffer->SizeInBytes() < persistentResourceSize) { - m_persistentResource = nullptr; - - ComPtr managedBufferRegion; - ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource( - static_cast(persistentResourceSize), - managedBufferRegion.GetAddressOf())); - - managedBufferRegion.As(&m_persistentResourcePoolingUnk); - m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState(); - m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding(); + auto buffer = m_executionProvider->AllocatePooledResource(persistentResourceSize); + m_persistentResourceBinding = buffer.GetBufferBinding(); + m_managedPersistentBuffer = wil::MakeOrThrow(std::move(buffer)); } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index 493cc2e44577a..a5f880dd0ec24 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -4,6 +4,7 @@ #pragma once #include "OperatorUtility.h" +#include "../DmlManagedBuffer.h" namespace Dml { @@ -25,8 +26,7 @@ namespace Dml std::vector m_outputTensorDescs; ComPtr m_compiledOperator; - ComPtr m_persistentResource; - ComPtr m_persistentResourcePoolingUnk; // Controls when the persistent 
resource is returned to the pool + ComPtr m_managedPersistentBuffer; std::optional m_persistentResourceBinding; void Initialize( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h new file mode 100644 index 0000000000000..bff622726219a --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h @@ -0,0 +1,97 @@ +/* Copyright (c) Microsoft Corporation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#pragma once + +#include "dml_common.h" +#include "tfdml/runtime_adapter/macros.h" + +namespace tfdml +{ + +class D3D12HeapAllocator; + +// Represents a region of a D3D12 buffer resource. A buffer region has an +// underlying ID3D12Resource* (of D3D12_RESOURCE_DIMENSION_BUFFER), an offset in +// bytes from the beginning of that buffer, and a size in bytes of the region. +class D3D12BufferRegion +{ + public: + D3D12BufferRegion() = default; + + // References a region of a buffer. The respective ID3D12Resource objects + // must be in the appropriate states. Each resource is optional, but if more + // than one are provided they must map to the same region of memory. 
+ D3D12BufferRegion( + uint64_t offset, + uint64_t size_in_bytes, + ID3D12Resource* resource_uav_state, + ID3D12Resource* resource_copy_src_state, + ID3D12Resource* resource_copy_dst_state); + + // Move-only + D3D12BufferRegion(const D3D12BufferRegion&) = delete; + D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete; + D3D12BufferRegion(D3D12BufferRegion&&); + D3D12BufferRegion& operator=(D3D12BufferRegion&&); + + ID3D12Resource* ResourceInUavState() const; + + // NOTE: may be any state that is valid as a copy source (COPY_SRC, + // GENERIC_READ, or COMMON). + ID3D12Resource* ResourceInCopySrcState() const; + + ID3D12Resource* ResourceInCopyDstState() const; + + uint64_t Offset() const; + uint64_t SizeInBytes() const; + + DML_BUFFER_BINDING GetBufferBinding() const; + + explicit operator bool() const { return first_valid_resource_ != nullptr; } + + // Creates a subregion at an offset from the start of this region. If no + // size is provided the region runs to the end of the current region. + inline D3D12BufferRegion Subregion( + uint64_t offset, + uint64_t size_in_bytes = 0) const + { + // start of subregion must be within current region + CHECK(offset < size_in_bytes_); + size_in_bytes = + size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; + // end of subregion must be within current region + CHECK(size_in_bytes <= size_in_bytes_ - offset); + + return D3D12BufferRegion( + offset_ + offset, + size_in_bytes, + resource_uav_state_, + resource_copy_src_state_, + resource_copy_dst_state_); + } + + private: + ID3D12Resource* resource_uav_state_ = nullptr; + ID3D12Resource* resource_copy_src_state_ = nullptr; + ID3D12Resource* resource_copy_dst_state_ = nullptr; + uint64_t offset_ = 0; + uint64_t size_in_bytes_ = 0; + + // Pointer to the first resource above that isn't null. 
+ ID3D12Resource* first_valid_resource_ = nullptr; +}; + +} // namespace tfdml From b71a5ffa16f4d0eecdccfca007e65744f51961d6 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Fri, 27 Jan 2023 14:16:33 -0800 Subject: [PATCH 18/76] WIP --- .../inc/IWinmlExecutionProvider.h | 8 +- .../src/DmlBufferRegion.h | 4 +- .../src/DmlGraphFusionHelper.cpp | 9 +- .../src/DmlGraphFusionHelper.h | 4 +- .../src/ExecutionProvider.cpp | 4 +- .../src/ExecutionProvider.h | 2 +- .../src/FusedGraphKernel.cpp | 11 +-- .../src/MLOperatorAuthorImpl.cpp | 47 +++++---- .../src/MLOperatorAuthorImpl.h | 5 + .../src/Operators/DmlDFT.h | 58 ++++------- .../src/dml_buffer_region.h | 97 ------------------- 11 files changed, 68 insertions(+), 181 deletions(-) delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index ccde56e5d712d..198d38c348c87 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -9,6 +9,7 @@ #include #include "core/framework/op_kernel.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h" struct AbstractOperatorDesc; interface IMLOperatorTensor; @@ -22,11 +23,6 @@ namespace onnxruntime class Node; } -namespace Dml -{ - class DmlManagedBufferRegion; -} - namespace Windows::AI::MachineLearning::Adapter { interface __declspec(uuid("5b19a18a-5ed5-4df2-a363-21b89380a698")) @@ -39,7 +35,7 @@ namespace Windows::AI::MachineLearning::Adapter // the provider's underlying queues. 
virtual void QueueReference(IUnknown *object) = 0; - virtual ID3D12Resource* GetABIDataInterface(void* data) const = 0; + virtual Dml::D3D12BufferRegion GetBufferRegion(void* data, uint64_t size) const = 0; virtual uint64_t TryGetPooledAllocationId( void* data, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h index 29a6bf6f7c775..dee01a29fe55f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h @@ -24,8 +24,8 @@ namespace Dml ID3D12Resource* resource_copy_dst_state); // Move-only - D3D12BufferRegion(const D3D12BufferRegion&) = delete; - D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete; + D3D12BufferRegion(const D3D12BufferRegion&) = default; + D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default; D3D12BufferRegion(D3D12BufferRegion&&); D3D12BufferRegion& operator=(D3D12BufferRegion&&); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index ffd388f91cace..58dd7314b929f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -1,6 +1,7 @@ #pragma once #include "DmlGraphFusionHelper.h" +#include "DmlBufferRegion.h" namespace Dml @@ -88,18 +89,14 @@ namespace DmlGraphFusionHelper return buffer; } - void UnwrapTensor( + D3D12BufferRegion UnwrapTensor( Windows::AI::MachineLearning::Adapter::IWinmlExecutionProvider* winmlProvider, const onnxruntime::Tensor* tensor, - ID3D12Resource** resource, uint64_t* allocId) { void* opaqueData = const_cast(tensor->DataRaw()); - ID3D12Resource* abiDataInterface = winmlProvider->GetABIDataInterface(opaqueData); - 
abiDataInterface->AddRef(); - *allocId = winmlProvider->TryGetPooledAllocationId(opaqueData, 0); - *resource = abiDataInterface; + return winmlProvider->GetBufferRegion(opaqueData, tensor->SizeInBytes()); } void ProcessInputData( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h index f2533bb37bccb..593bd9b563ab6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h @@ -6,6 +6,7 @@ #include "GraphPartitioner.h" #include "FusedGraphKernel.h" #include "MLOperatorAuthorImpl.h" +#include "DmlBufferRegion.h" namespace Dml @@ -33,10 +34,9 @@ namespace DmlGraphFusionHelper const std::byte* tensorPtr, size_t tensorByteSize); - void UnwrapTensor( + D3D12BufferRegion UnwrapTensor( Windows::AI::MachineLearning::Adapter::IWinmlExecutionProvider* winmlProvider, const onnxruntime::Tensor* tensor, - ID3D12Resource** resource, uint64_t* allocId); std::unordered_map> diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 20f8feed12311..0f4fe3788cbd0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -850,9 +850,9 @@ namespace Dml m_context->QueueReference(object); } - ID3D12Resource* ExecutionProviderImpl::GetABIDataInterface(void* data) const + D3D12BufferRegion ExecutionProviderImpl::GetBufferRegion(void* data, uint64_t size) const { - return m_gpuAllocator->GetAllocationInfo(data)->GetUavResource(); + return m_gpuAllocator->CreateBufferRegion(data, size); } uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(void* data, bool isInternalOperator) diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 20cb307b1cdb4..5ffced302d5c5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -97,7 +97,7 @@ namespace Dml // IWinmlExecutionProvider methods void QueueReference(IUnknown* object) override; - ID3D12Resource* GetABIDataInterface(void* data) const override; + D3D12BufferRegion GetBufferRegion(void* data, uint64_t size) const override; uint64_t TryGetPooledAllocationId(void* data, bool isInternalOperator) override; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index 9ecaae2c50394..5f29bae1b4fdc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -302,10 +302,10 @@ namespace Dml const onnxruntime::Tensor* tensor = kernelContext->Input(i); uint64_t allocId; - DmlGraphFusionHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &inputBindings[i].Buffer, &allocId); + auto bufferRegion = DmlGraphFusionHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &allocId); + + inputBindings[i] = bufferRegion.GetBufferBinding(); inputBindingsChanged = inputBindingsChanged || (!allocId || m_inputBindingAllocIds[i] != allocId); - inputBindings[i].Buffer->Release(); // Avoid holding an additional reference - inputBindings[i].SizeInBytes = DmlGraphFusionHelper::AlignToPow2(tensor->SizeInBytes(), 4); inputBindingDescs[i] = {DML_BINDING_TYPE_BUFFER, &inputBindings[i]}; m_inputBindingAllocIds[i] = allocId; } @@ -339,10 +339,9 @@ namespace Dml ); uint64_t allocId; - DmlGraphFusionHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &outputBindings[i].Buffer, &allocId); + 
auto bufferRegion = DmlGraphFusionHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &allocId); + outputBindings[i] = bufferRegion.GetBufferBinding(); outputBindingsChanged = outputBindingsChanged || (!allocId || m_outputBindingAllocIds[i] != allocId); - outputBindings[i].Buffer->Release(); // Avoid holding an additional reference - outputBindings[i].SizeInBytes = DmlGraphFusionHelper::AlignToPow2(tensor->SizeInBytes(), 4); outputBindingDescs[i] = {DML_BINDING_TYPE_BUFFER, &outputBindings[i]}; m_outputBindingAllocIds[i] = allocId; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 55a33cc8513b5..082b193840e94 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -7,10 +7,11 @@ #include "core/framework/execution_frame.h" #include "core/session/onnxruntime_c_api.h" #include "core/providers/dml/DmlExecutionProvider/inc/MLOperatorAuthor.h" +#include "DmlBufferRegion.h" #include "MLOperatorAuthorImpl.h" #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h" -#include "core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h" +#include "DmlGpuAllocator.h" using namespace Microsoft::WRL; @@ -100,14 +101,6 @@ namespace Windows::AI::MachineLearning::Adapter return strcmp(info.name, onnxruntime::CPU) && !(info.mem_type == ::OrtMemType::OrtMemTypeCPUOutput || info.mem_type == ::OrtMemType::OrtMemTypeCPUInput); } - // Translate the data object stored in a tensor to the type which will be returned through - // the ABI. The translation is determined by the provider and based on options with which the - // kernels are registered. 
- ID3D12Resource* TranslateAllocationDataToAbi(IWinmlExecutionProvider* winmlProvider, void* opaqueData) - { - return winmlProvider->GetABIDataInterface(opaqueData); - } - // // Traits for numeric attribute types // @@ -1271,19 +1264,14 @@ namespace Windows::AI::MachineLearning::Adapter if (impl) { m_tensorData = m_impl->MutableDataRaw(); - - if (isDataInterface) - { - if (m_tensorData) - { - // Get the actual object to be returned from the ABI, which varies for internal and external - // kernels (i.e. ID3D12Resource, versus something that tracks the layout). - m_abiDataInterface = TranslateAllocationDataToAbi(m_winmlExecutionProvider.Get(), m_tensorData); - } - } } } + Dml::D3D12BufferRegion TensorWrapper::GetBufferRegion() const + { + return m_winmlExecutionProvider->GetBufferRegion(m_tensorData, m_impl->SizeInBytes()); + } + uint32_t STDMETHODCALLTYPE TensorWrapper::GetDimensionCount() const noexcept { if (IsClosed()) @@ -1370,8 +1358,9 @@ namespace Windows::AI::MachineLearning::Adapter } else { - m_abiDataInterface->AddRef(); - *dataInterface = m_abiDataInterface; + auto bufferRegion = GetBufferRegion(); + bufferRegion.ResourceInUavState()->AddRef(); + *dataInterface = bufferRegion.ResourceInUavState(); } } @@ -1575,6 +1564,22 @@ namespace Windows::AI::MachineLearning::Adapter ORT_CATCH_RETURN } + const Dml::D3D12BufferRegion& OpKernelContextWrapper::AllocateDefaultBuffer(size_t size) + { + VerifyNotClosed(); + + onnxruntime::AllocatorPtr alloc; + THROW_IF_NOT_OK(m_impl->GetTempSpaceAllocator(&alloc)); + + ORT_THROW_HR_IF(E_FAIL, !IsAllocationInterface(alloc->Info())); + auto dml_gpu_allocator = static_cast(alloc.get()); + auto buffer = dml_gpu_allocator->AllocateDefaultBuffer(size); + + // Ensure the allocation is freed and transitioned when the context destructs + m_temporaryBuffers.push_back(std::move(buffer)); + return m_temporaryBuffers.back().Region(); + } + void STDMETHODCALLTYPE OpKernelContextWrapper::GetExecutionInterface(IUnknown** 
executionInterface) const noexcept { m_abiExecutionObject.CopyTo(executionInterface); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index 31f7e3fbeee8b..c8a11b01defda 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -10,6 +10,8 @@ #include #include #include "core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h" +#include "DmlBufferRegion.h" +#include "DmlBuffer.h" interface IDMLOperator; @@ -265,6 +267,8 @@ class TensorWrapper : public WRL::Base, public Closable MLOperatorTensorDataType STDMETHODCALLTYPE GetTensorDataType() const noexcept override; + Dml::D3D12BufferRegion GetBufferRegion() const; + bool STDMETHODCALLTYPE IsCpuData() const noexcept override; bool STDMETHODCALLTYPE IsDataInterface() const noexcept override; @@ -455,6 +459,7 @@ class OpKernelContextWrapper : public WRL::Base, publi std::vector GetInputTensors(); std::vector GetOutputTensors(const EdgeShapes& outputShapes); + const Dml::D3D12BufferRegion& AllocateDefaultBuffer(uint64_t size); protected: void ClearTempAllocations(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h index e545f33fdb8d5..5aabea1eeedf5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h @@ -2,9 +2,10 @@ #include "../MLOperatorAuthorImpl.h" #include "../../../OperatorAuthorHelper/OperatorHelper.h" - #include "../External/D3DX12/d3dx12.h" +#include "../DmlBufferRegion.h" + // The shader header is produced using "fxc.exe dft_shader.hlsl -E DFT -T cs_5_0 -Zi /Fh" #include "GeneratedShaders/stockham.h" @@ -76,7 +77,7 @@ class GpuDFTOperator : 
public WRL::Base // Allocate temporary buffers if needed struct ResourceDesc { - ComPtr Resource; + Dml::D3D12BufferRegion BufferRegion; std::array Sizes; std::array Strides; }; @@ -266,7 +267,6 @@ class GpuDFTOperator : public WRL::Base ComPtr outputTensor; ORT_THROW_IF_FAILED(context->GetOutputTensor(0, &outputTensor)); auto outputDims = GetTensorDimensions(outputTensor.Get()); - ORT_THROW_HR_IF(E_FAIL, inputDims.size() != outputDims.size()); // Get optional dft_length input @@ -292,16 +292,6 @@ class GpuDFTOperator : public WRL::Base params.Type = DFTType::Stockham; params.StockhamParams = {}; - ComPtr inputUnknown; - ComPtr inputResource; - inputTensor->GetDataInterface(inputUnknown.GetAddressOf()); - inputUnknown.As(&inputResource); - - ComPtr outputUnknown; - ComPtr outputResource; - outputTensor->GetDataInterface(outputUnknown.GetAddressOf()); - outputUnknown.As(&outputResource); - // { before_dft_axis, axis, after_dft_axis, real_or_complex } std::array reshapedInputSize = { 1, 1, 1, inputDims.back() }; std::array reshapedOutputSize = { 1, 1, 1, outputDims.back() }; @@ -349,11 +339,14 @@ class GpuDFTOperator : public WRL::Base // Create the resource loop list // Add the input resource to the loop list + auto inputTensorWrapper = static_cast(inputTensor.Get()); params.StockhamParams.ResourceLoopList.push_back({}); - params.StockhamParams.ResourceLoopList.back().Resource = inputResource; + params.StockhamParams.ResourceLoopList.back().BufferRegion = inputTensorWrapper->GetBufferRegion(); params.StockhamParams.ResourceLoopList.back().Sizes = reshapedInputSize; params.StockhamParams.ResourceLoopList.back().Strides = reshapedInputStrides; + auto kernelContext = static_cast(context); + // If 1 temporary should be placed first, or multiple temporaries, then // Add a temp in the list if (oscillateFirstTemporaryThenOutput || oscillateBetweenTwoTemporaries) @@ -361,9 +354,7 @@ class GpuDFTOperator : public WRL::Base 
params.StockhamParams.ResourceLoopList.push_back({}); params.StockhamParams.ResourceLoopList.back().Sizes = temporarySize; params.StockhamParams.ResourceLoopList.back().Strides = temporaryStrides; - - auto& resource = params.StockhamParams.ResourceLoopList.back().Resource; - ORT_THROW_IF_FAILED(context->AllocateTemporaryData(temporaryBufferByteSize, &resource)); + auto& resource = params.StockhamParams.ResourceLoopList.back().BufferRegion = kernelContext->AllocateDefaultBuffer(temporaryBufferByteSize); } // If 2 temps, add another @@ -372,14 +363,13 @@ class GpuDFTOperator : public WRL::Base params.StockhamParams.ResourceLoopList.push_back({}); params.StockhamParams.ResourceLoopList.back().Sizes = temporarySize; params.StockhamParams.ResourceLoopList.back().Strides = temporaryStrides; - - auto& resource = params.StockhamParams.ResourceLoopList.back().Resource; - ORT_THROW_IF_FAILED(context->AllocateTemporaryData(temporaryBufferByteSize, &resource)); + auto& resource = params.StockhamParams.ResourceLoopList.back().BufferRegion = kernelContext->AllocateDefaultBuffer(temporaryBufferByteSize); } // Add output resource + auto outputTensorWrapper = static_cast(outputTensor.Get()); params.StockhamParams.ResourceLoopList.push_back({}); - params.StockhamParams.ResourceLoopList.back().Resource = outputResource; + params.StockhamParams.ResourceLoopList.back().BufferRegion = outputTensorWrapper->GetBufferRegion(); params.StockhamParams.ResourceLoopList.back().Sizes = reshapedOutputSize; params.StockhamParams.ResourceLoopList.back().Strides = reshapedOutputStrides; params.StockhamParams.OutputIndex = static_cast(params.StockhamParams.ResourceLoopList.size() - 1); @@ -390,9 +380,7 @@ class GpuDFTOperator : public WRL::Base params.StockhamParams.ResourceLoopList.push_back({}); params.StockhamParams.ResourceLoopList.back().Sizes = temporarySize; params.StockhamParams.ResourceLoopList.back().Strides = temporaryStrides; - - auto& resource = 
params.StockhamParams.ResourceLoopList.back().Resource; - ORT_THROW_IF_FAILED(context->AllocateTemporaryData(temporaryBufferByteSize, &resource)); + auto& resource = params.StockhamParams.ResourceLoopList.back().BufferRegion = kernelContext->AllocateDefaultBuffer(temporaryBufferByteSize); } // Define the loop range @@ -413,8 +401,8 @@ class GpuDFTOperator : public WRL::Base const auto& loopList = stockhamParams.ResourceLoopList; // Get input and output resources - auto inputResource = loopList[0].Resource.Get(); - auto outputResource = loopList[stockhamParams.OutputIndex].Resource.Get(); + auto inputResource = loopList[0].BufferRegion.ResourceInUavState(); + auto outputResource = loopList[stockhamParams.OutputIndex].BufferRegion.ResourceInUavState(); // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_rootSignature.Get()); @@ -432,11 +420,11 @@ class GpuDFTOperator : public WRL::Base auto inIdx = stockhamParams.LoopRange.CalculateIndex(index); auto outIdx = stockhamParams.LoopRange.CalculateIndex(index + 1); - auto in = loopList[inIdx].Resource.Get(); + const auto& in = loopList[inIdx].BufferRegion; std::copy(loopList[inIdx].Sizes.begin(), loopList[inIdx].Sizes.end(), constants.InputSizes); std::copy(loopList[inIdx].Strides.begin(), loopList[inIdx].Strides.end(), constants.InputStrides); - auto out = loopList[outIdx].Resource.Get(); + const auto& out = loopList[outIdx].BufferRegion; std::copy(loopList[outIdx].Sizes.begin(), loopList[outIdx].Sizes.end(), constants.OutputSizes); std::copy(loopList[outIdx].Strides.begin(), loopList[outIdx].Strides.end(), constants.OutputStrides); @@ -465,24 +453,20 @@ class GpuDFTOperator : public WRL::Base } void Dispatch( - ID3D12Resource* inputResource, - ID3D12Resource* outputResource, + const Dml::D3D12BufferRegion& inputBufferRegion, + const Dml::D3D12BufferRegion& outputBufferRegion, DFTShaderConstants& constants, ID3D12GraphicsCommandList* commandList) { - D3D12_RESOURCE_BARRIER uav_barriers[2]; 
- uav_barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(inputResource); - uav_barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(outputResource); - commandList->ResourceBarrier(2, uav_barriers); // Set resource views commandList->SetComputeRootUnorderedAccessView( 0, // root parameter index - inputResource->GetGPUVirtualAddress() + inputBufferRegion.ResourceInUavState()->GetGPUVirtualAddress() + inputBufferRegion.Offset() ); commandList->SetComputeRootUnorderedAccessView( 1, // root parameter index - outputResource->GetGPUVirtualAddress() + outputBufferRegion.ResourceInUavState()->GetGPUVirtualAddress() + outputBufferRegion.Offset() ); auto pendingElementCount = constants.ElementCount; @@ -512,8 +496,6 @@ class GpuDFTOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - - commandList->ResourceBarrier(2, uav_barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h deleted file mode 100644 index bff622726219a..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) Microsoft Corporation. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#pragma once - -#include "dml_common.h" -#include "tfdml/runtime_adapter/macros.h" - -namespace tfdml -{ - -class D3D12HeapAllocator; - -// Represents a region of a D3D12 buffer resource. A buffer region has an -// underlying ID3D12Resource* (of D3D12_RESOURCE_DIMENSION_BUFFER), an offset in -// bytes from the beginning of that buffer, and a size in bytes of the region. -class D3D12BufferRegion -{ - public: - D3D12BufferRegion() = default; - - // References a region of a buffer. The respective ID3D12Resource objects - // must be in the appropriate states. Each resource is optional, but if more - // than one are provided they must map to the same region of memory. - D3D12BufferRegion( - uint64_t offset, - uint64_t size_in_bytes, - ID3D12Resource* resource_uav_state, - ID3D12Resource* resource_copy_src_state, - ID3D12Resource* resource_copy_dst_state); - - // Move-only - D3D12BufferRegion(const D3D12BufferRegion&) = delete; - D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete; - D3D12BufferRegion(D3D12BufferRegion&&); - D3D12BufferRegion& operator=(D3D12BufferRegion&&); - - ID3D12Resource* ResourceInUavState() const; - - // NOTE: may be any state that is valid as a copy source (COPY_SRC, - // GENERIC_READ, or COMMON). - ID3D12Resource* ResourceInCopySrcState() const; - - ID3D12Resource* ResourceInCopyDstState() const; - - uint64_t Offset() const; - uint64_t SizeInBytes() const; - - DML_BUFFER_BINDING GetBufferBinding() const; - - explicit operator bool() const { return first_valid_resource_ != nullptr; } - - // Creates a subregion at an offset from the start of this region. If no - // size is provided the region runs to the end of the current region. 
- inline D3D12BufferRegion Subregion( - uint64_t offset, - uint64_t size_in_bytes = 0) const - { - // start of subregion must be within current region - CHECK(offset < size_in_bytes_); - size_in_bytes = - size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; - // end of subregion must be within current region - CHECK(size_in_bytes <= size_in_bytes_ - offset); - - return D3D12BufferRegion( - offset_ + offset, - size_in_bytes, - resource_uav_state_, - resource_copy_src_state_, - resource_copy_dst_state_); - } - - private: - ID3D12Resource* resource_uav_state_ = nullptr; - ID3D12Resource* resource_copy_src_state_ = nullptr; - ID3D12Resource* resource_copy_dst_state_ = nullptr; - uint64_t offset_ = 0; - uint64_t size_in_bytes_ = 0; - - // Pointer to the first resource above that isn't null. - ID3D12Resource* first_valid_resource_ = nullptr; -}; - -} // namespace tfdml From 06caff8a2a9767b4a1e3130c35a80982c8425ce4 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Fri, 27 Jan 2023 16:13:28 -0800 Subject: [PATCH 19/76] WIP --- .../dml/DmlExecutionProvider/src/Operators/DmlDFT.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h index 5aabea1eeedf5..403e660b0e08c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h @@ -458,6 +458,11 @@ class GpuDFTOperator : public WRL::Base DFTShaderConstants& constants, ID3D12GraphicsCommandList* commandList) { + D3D12_RESOURCE_BARRIER uav_barriers[2]; + uav_barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(inputBufferRegion.ResourceInUavState()); + uav_barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(outputBufferRegion.ResourceInUavState()); + commandList->ResourceBarrier(2, uav_barriers); + // Set resource views commandList->SetComputeRootUnorderedAccessView( 0, // 
root parameter index @@ -496,6 +501,8 @@ class GpuDFTOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } + + commandList->ResourceBarrier(2, uav_barriers); } }; From e7667f1852c210dd04060780bc62f46b10181fa9 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Fri, 27 Jan 2023 16:31:33 -0800 Subject: [PATCH 20/76] WIP --- .../dml/DmlExecutionProvider/src/Operators/DmlDFT.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h index 403e660b0e08c..aead38c872e45 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h @@ -458,10 +458,11 @@ class GpuDFTOperator : public WRL::Base DFTShaderConstants& constants, ID3D12GraphicsCommandList* commandList) { - D3D12_RESOURCE_BARRIER uav_barriers[2]; + D3D12_RESOURCE_BARRIER uav_barriers[3]; uav_barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(inputBufferRegion.ResourceInUavState()); uav_barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(outputBufferRegion.ResourceInUavState()); - commandList->ResourceBarrier(2, uav_barriers); + uav_barriers[2] = CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr); + commandList->ResourceBarrier(3, uav_barriers); // Set resource views commandList->SetComputeRootUnorderedAccessView( @@ -502,7 +503,7 @@ class GpuDFTOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - commandList->ResourceBarrier(2, uav_barriers); + commandList->ResourceBarrier(3, uav_barriers); } }; From a95d434117af203bb41abdcf1b9f4c64b989f834 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Fri, 27 Jan 2023 23:44:28 -0800 Subject: [PATCH 21/76] WIP --- .../onnxruntime/core/framework/ortdevice.h | 1 + onnxruntime/core/framework/allocator.cc | 2 +- .../inc/DmlExecutionProvider.h | 2 +- 
.../inc/IWinmlExecutionProvider.h | 9 +++++++-- .../src/BucketizedBufferAllocator.cpp | 18 +++++------------- .../src/BucketizedBufferAllocator.h | 5 +++-- .../src/DmlBfcAllocator.h | 4 ++-- .../DmlExecutionProvider/src/DmlBuffer.cpp | 3 ++- .../src/DmlGpuAllocator.cpp | 9 +++++---- .../src/DmlGpuAllocator.h | 5 +++-- .../src/DmlGraphFusionHelper.cpp | 16 ++++++++++++++-- .../src/ExecutionProvider.cpp | 19 +++++++++---------- .../src/ExecutionProvider.h | 5 +++-- .../src/MLOperatorAuthorImpl.cpp | 11 ++++++++++- .../providers/dml/dml_provider_factory.cc | 5 +++-- 15 files changed, 69 insertions(+), 45 deletions(-) diff --git a/include/onnxruntime/core/framework/ortdevice.h b/include/onnxruntime/core/framework/ortdevice.h index 77f7c3e1743f0..962445182e6ee 100644 --- a/include/onnxruntime/core/framework/ortdevice.h +++ b/include/onnxruntime/core/framework/ortdevice.h @@ -23,6 +23,7 @@ struct OrtDevice { static const MemoryType CUDA_PINNED = 1; static const MemoryType HIP_PINNED = 2; static const MemoryType CANN_PINNED = 3; + static const MemoryType DML_EXTERNAL = 4; }; constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_) diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc index a0d20974624be..08e1221cb7977 100644 --- a/onnxruntime/core/framework/allocator.cc +++ b/onnxruntime/core/framework/allocator.cc @@ -152,7 +152,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA id1, mem_type1); } else if (strcmp(name1, onnxruntime::DML) == 0) { *out = new OrtMemoryInfo( - onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(id1)), + onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, static_cast(id1)), id1, mem_type1); } else { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported."); diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index fe07ccf08899e..c062ff81d1330 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -30,7 +30,7 @@ namespace Dml ID3D12CommandQueue* commandQueue, bool enableMetacommands = true); - ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr); + ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer); void FlushContext(onnxruntime::IExecutionProvider* provider); void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode); void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index 198d38c348c87..a3acff6b2f4ae 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -23,6 +23,11 @@ namespace onnxruntime class Node; } +namespace Dml +{ + struct TaggedPointer; +} + namespace Windows::AI::MachineLearning::Adapter { interface __declspec(uuid("5b19a18a-5ed5-4df2-a363-21b89380a698")) @@ -35,10 +40,10 @@ namespace Windows::AI::MachineLearning::Adapter // the provider's underlying queues. 
virtual void QueueReference(IUnknown *object) = 0; - virtual Dml::D3D12BufferRegion GetBufferRegion(void* data, uint64_t size) const = 0; + virtual Dml::D3D12BufferRegion GetBufferRegion(const Dml::TaggedPointer& taggedPointer, uint64_t size) const = 0; virtual uint64_t TryGetPooledAllocationId( - void* data, + const Dml::TaggedPointer& taggedPointer, bool isInternalOperator) = 0; virtual void GetABIExecutionInterfaceAndInvalidateState( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index b0ddd29e6bf46..e2ae8f23a3744 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -380,18 +380,14 @@ namespace Dml } D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion( - const void* ptr, + const TaggedPointer& taggedPointer, uint64_t size_in_bytes) { - ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr); - - TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); - // We need to access (mutable) state after this point, so we need to lock std::unique_lock lock(mutex_); // Find the allocation corresponding to this pointer - auto it = allocations_by_id_.find(tagged_ptr.allocation_id); + auto it = allocations_by_id_.find(taggedPointer.allocation_id); ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end()); // Make sure that we are aligned to 4 bytes to satisfy DML's requirements @@ -400,24 +396,20 @@ namespace Dml (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; return D3D12BufferRegion( - tagged_ptr.offset, + taggedPointer.offset, size_in_bytes, it->second->GetUavResource(), it->second->GetCopySrcResource(), it->second->GetCopyDstResource()); } - AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(const void* ptr) + AllocationInfo* 
BucketizedBufferAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) { - ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr); - - TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); - // We need to access (mutable) state after this point, so we need to lock std::unique_lock lock(mutex_); // Find the allocation corresponding to this pointer - auto it = allocations_by_id_.find(tagged_ptr.allocation_id); + auto it = allocations_by_id_.find(taggedPointer.allocation_id); return it->second.Get(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index f2c09dfa0cfc4..370a7a6ff1e8d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -11,6 +11,7 @@ namespace Dml { class BucketizedBufferAllocator; class BucketizedBufferAllocator; + struct TaggedPointer; // An allocator that makes logically contiguous allocations backed by D3D heaps. // @@ -60,10 +61,10 @@ namespace Dml // than a call to ID3D12Device::CreatePlacedResource or // CreateReservedResource. 
D3D12BufferRegion CreateBufferRegion( - const void* ptr, + const TaggedPointer& taggedPointer, uint64_t size_in_bytes); - AllocationInfo* GetAllocationInfo(const void* ptr); + AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer); void* Alloc(size_t size_in_bytes); void Free(void* ptr); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h index f43aa769af0a9..c00b820434592 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h @@ -21,8 +21,8 @@ namespace Dml ), m_subAllocator(std::move(subAllocator)) {} - void* Alloc(size_t size_in_bytes) { return m_subAllocator->Alloc(size_in_bytes); } - void Free(void* ptr) { m_subAllocator->Free(ptr); } + void* Alloc(size_t size_in_bytes) final { return m_subAllocator->Alloc(size_in_bytes); } + void Free(void* ptr) final { m_subAllocator->Free(ptr); } private: std::shared_ptr m_subAllocator; }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp index 6f587261553e6..c5fa576d24a0f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp @@ -4,6 +4,7 @@ #include "precomp.h" #include "DmlBuffer.h" #include "DmlGpuAllocator.h" +#include "DmlTaggedPointer.h" namespace Dml { @@ -14,7 +15,7 @@ namespace Dml m_opaqueData = allocator_->Alloc(size_in_bytes); ORT_THROW_HR_IF(E_OUTOFMEMORY, m_opaqueData == nullptr); - buffer_region_ = allocator_->CreateBufferRegion(m_opaqueData, size_in_bytes); + buffer_region_ = allocator_->CreateBufferRegion(TaggedPointer::Unpack(m_opaqueData), size_in_bytes); } DmlBuffer::~DmlBuffer() diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp index 13e0d8dfe96f7..5370515afffd1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp @@ -7,6 +7,7 @@ #include "DmlGpuAllocator.h" #include "core/framework/allocator.h" #include "BucketizedBufferAllocator.h" +#include "DmlTaggedPointer.h" namespace Dml { @@ -31,14 +32,14 @@ namespace Dml m_bfcAllocator->Free(ptr); } - D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(const void* ptr, uint64_t size_in_bytes) + D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes) { - return m_subAllocator->CreateBufferRegion(ptr, size_in_bytes); + return m_subAllocator->CreateBufferRegion(taggedPointer, size_in_bytes); } - AllocationInfo* DmlGpuAllocator::GetAllocationInfo(const void* ptr) + AllocationInfo* DmlGpuAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) { - return m_subAllocator->GetAllocationInfo(ptr); + return m_subAllocator->GetAllocationInfo(taggedPointer); } void DmlGpuAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h index 5ef9ea855753f..3bc8127598460 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h @@ -11,6 +11,7 @@ namespace Dml { class BucketizedBufferAllocator; class AllocationInfo; + struct TaggedPointer; class DmlGpuAllocator : public onnxruntime::IAllocator { @@ -19,8 +20,8 @@ namespace Dml void* Alloc(size_t size_in_bytes) final; void Free(void* ptr) final; - D3D12BufferRegion CreateBufferRegion(const void* ptr, 
uint64_t size_in_bytes); - AllocationInfo* GetAllocationInfo(const void* ptr); + D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes); + AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer); void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode); DmlBuffer AllocateDefaultBuffer(uint64_t num_bytes); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index 58dd7314b929f..52e0f287e6594 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -2,6 +2,8 @@ #include "DmlGraphFusionHelper.h" #include "DmlBufferRegion.h" +#include "DmlTaggedPointer.h" +#include "DmlAllocationInfo.h" namespace Dml @@ -95,8 +97,18 @@ namespace DmlGraphFusionHelper uint64_t* allocId) { void* opaqueData = const_cast(tensor->DataRaw()); - *allocId = winmlProvider->TryGetPooledAllocationId(opaqueData, 0); - return winmlProvider->GetBufferRegion(opaqueData, tensor->SizeInBytes()); + + if (tensor->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL) + { + // The allocation is not pooled + auto allocInfo = static_cast(opaqueData); + *allocId = allocInfo->GetPooledResourceId(); + return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); + } + + auto taggedPointer = TaggedPointer::Unpack(opaqueData); + *allocId = winmlProvider->TryGetPooledAllocationId(taggedPointer, 0); + return winmlProvider->GetBufferRegion(taggedPointer, tensor->SizeInBytes()); } void ProcessInputData( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 0f4fe3788cbd0..0a6859f7b615f 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -23,6 +23,7 @@ #include "DmlBfcAllocator.h" #include "DmlGpuAllocator.h" #include "DmlBuffer.h" +#include "DmlTaggedPointer.h" #ifdef ERROR #undef ERROR @@ -120,10 +121,8 @@ namespace Dml D3D12BufferRegion ExecutionProviderImpl::GetBufferForTensor(IMLOperatorTensor* tensor) const { - MLOperatorTensor mlOperatorTensor(tensor); - void* data = mlOperatorTensor.GetByteData(); - auto sizeInBytes = mlOperatorTensor.GetUnalignedTensorByteSize(); - return m_gpuAllocator->CreateBufferRegion(data, sizeInBytes); + auto tensorWrapper = static_cast(tensor); + return tensorWrapper->GetBufferRegion(); } ID3D12Resource* __stdcall ExecutionProviderImpl::DecodeResource(IMLOperatorTensor* tensor) const noexcept @@ -850,15 +849,15 @@ namespace Dml m_context->QueueReference(object); } - D3D12BufferRegion ExecutionProviderImpl::GetBufferRegion(void* data, uint64_t size) const + D3D12BufferRegion ExecutionProviderImpl::GetBufferRegion(const TaggedPointer& taggedPointer, uint64_t size) const { - return m_gpuAllocator->CreateBufferRegion(data, size); + return m_gpuAllocator->CreateBufferRegion(taggedPointer, size); } - uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(void* data, bool isInternalOperator) + uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(const TaggedPointer& taggedPointer, bool isInternalOperator) { assert(!isInternalOperator); - return m_gpuAllocator->GetAllocationInfo(data)->GetPooledResourceId(); + return m_gpuAllocator->GetAllocationInfo(taggedPointer)->GetPooledResourceId(); } void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState( @@ -981,10 +980,10 @@ namespace Dml return std::make_unique(dmlDevice, commandQueue, enableMetacommands); } - ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr) + ID3D12Resource* 
GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer) { Dml::DmlGpuAllocator* pAllocationInfo = static_cast(allocator); - return pAllocationInfo->GetAllocationInfo(ptr)->GetUavResource(); + return pAllocationInfo->GetAllocationInfo(taggedPointer)->GetUavResource(); } void FlushContext(onnxruntime::IExecutionProvider* provider) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 5ffced302d5c5..b9787335b6ea0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -28,6 +28,7 @@ namespace Dml class DmlCpuAllocator; class ExecutionProvider; class DmlGpuAllocator; + struct TaggedPointer; class ExecutionProviderImpl : public WRL::Base @@ -97,9 +98,9 @@ namespace Dml // IWinmlExecutionProvider methods void QueueReference(IUnknown* object) override; - D3D12BufferRegion GetBufferRegion(void* data, uint64_t size) const override; + D3D12BufferRegion GetBufferRegion(const TaggedPointer& taggedPointer, uint64_t size) const override; - uint64_t TryGetPooledAllocationId(void* data, bool isInternalOperator) override; + uint64_t TryGetPooledAllocationId(const TaggedPointer& taggedPointer, bool isInternalOperator) override; void GetABIExecutionInterfaceAndInvalidateState( bool isInternalOperator, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 082b193840e94..d0d5d3a8b403e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -12,6 +12,8 @@ #include "MLOperatorAuthorImpl.h" #include 
"core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h" #include "DmlGpuAllocator.h" +#include "DmlAllocationInfo.h" +#include "DmlTaggedPointer.h" using namespace Microsoft::WRL; @@ -1269,7 +1271,14 @@ namespace Windows::AI::MachineLearning::Adapter Dml::D3D12BufferRegion TensorWrapper::GetBufferRegion() const { - return m_winmlExecutionProvider->GetBufferRegion(m_tensorData, m_impl->SizeInBytes()); + if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL) + { + auto allocInfo = static_cast(m_tensorData); + return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); + } + + auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData); + return m_winmlExecutionProvider->GetBufferRegion(taggedPointer, m_impl->SizeInBytes()); } uint32_t STDMETHODCALLTYPE TensorWrapper::GetDimensionCount() const noexcept diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index 9f589a0d3ad41..ebcbae5e799de 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -20,6 +20,7 @@ using Microsoft::WRL::ComPtr; #include "core/framework/error_code_helper.h" #include "DmlExecutionProvider/src/ErrorHandling.h" #include "DmlExecutionProvider/src/GraphicsUnknownHelper.h" +#include "DmlExecutionProvider/src/DmlTaggedPointer.h" #include "DmlExecutionProvider/inc/DmlExecutionProvider.h" #include "core/platform/env.h" @@ -100,7 +101,7 @@ bool IsSoftwareAdapter(IDXGIAdapter1* adapter) { auto isBasicRenderDriverVendorId = desc.VendorId == 0x1414; auto isBasicRenderDriverDeviceId = desc.DeviceId == 0x8c; auto isSoftwareAdapter = desc.Flags == DXGI_ADAPTER_FLAG_SOFTWARE; - + return isSoftwareAdapter || (isBasicRenderDriverVendorId && isBasicRenderDriverDeviceId); } @@ -217,7 +218,7 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* 
ort_alloc if (!allocator) { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available"); } - *d3d_resource = Dml::GetD3D12ResourceFromAllocation(allocator.get(), allocation); + *d3d_resource = Dml::GetD3D12ResourceFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation)); (*d3d_resource)->AddRef(); #else *d3d_resource = nullptr; From 0729ea294f289a03bf8cefa0ae3bf0532ea766c6 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 30 Jan 2023 14:17:19 -0800 Subject: [PATCH 22/76] Fix --- winml/lib/Api.Ort/OnnxruntimeEngine.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp index ee45ceac9493a..905a3e6866f02 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp @@ -632,11 +632,6 @@ HRESULT OnnxruntimeEngine::CreateTensorValueFromExternalD3DResource(ID3D12Resour RETURN_HR_IF_NOT_OK_MSG(ort_api->CreateMemoryInfo("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info), ort_api); - OrtAllocator* ort_allocator; - RETURN_HR_IF_NOT_OK_MSG(ort_api->CreateAllocator(session_.get(), ort_memory_info, &ort_allocator), - ort_api); - auto allocator = UniqueOrtAllocator(ort_allocator, ort_api->ReleaseAllocator); - void* dml_allocator_resource; RETURN_HR_IF_NOT_OK_MSG(ort_dml_api->CreateGPUAllocationFromD3DResource(d3d_resource, &dml_allocator_resource), engine_factory_->UseOrtApi()); From 544637f29588e8f83d2015cef57888b167d50a63 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 30 Jan 2023 15:16:45 -0800 Subject: [PATCH 23/76] Fix --- .../src/FusedGraphKernel.cpp | 39 +++++++++++-------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index 5f29bae1b4fdc..9dfc1672708c0 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -109,7 +109,7 @@ namespace Dml // Get input resources for execution, excluding those which were specified as owned by DML and provided // at initialization instead. std::vector> inputTensors(kernelContext->InputCount()); - std::vector inputPtrs(kernelContext->InputCount()); + std::vector inputBufferRegions(kernelContext->InputCount()); for (int i = 0; i < kernelContext->InputCount(); ++i) { @@ -120,12 +120,18 @@ namespace Dml if (m_nonOwnedGraphInputsFromInitializers[i]) { - inputPtrs[i] = m_nonOwnedGraphInputsFromInitializers[i].Get(); + inputBufferRegions[i] = D3D12BufferRegion( + 0, + m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width, + m_nonOwnedGraphInputsFromInitializers[i].Get(), + nullptr, + nullptr); } else if (!m_isInputsUploadedByDmlEP[i]) { ORT_THROW_IF_FAILED(contextWrapper.GetInputTensor(i, inputTensors[i].GetAddressOf())); - inputPtrs[i] = m_provider->DecodeResource(inputTensors[i].Get()); + auto tensorWrapper = static_cast(inputTensors[i].Get()); + inputBufferRegions[i] = tensorWrapper->GetBufferRegion(); } } @@ -133,7 +139,7 @@ namespace Dml ExecuteOperator( m_compiledExecutionPlanOperator.Get(), m_persistentResourceBinding ? 
&*m_persistentResourceBinding : nullptr, - inputPtrs, + inputBufferRegions, aux); ORT_THROW_IF_FAILED(m_provider->AddUAVBarrier()); @@ -153,7 +159,7 @@ namespace Dml void ExecuteOperator( IDMLCompiledOperator* op, _In_opt_ const DML_BUFFER_BINDING* persistentResourceBinding, - gsl::span inputTensors, + gsl::span inputBufferRegions, gsl::span outputTensors) const { auto FillBindingsFromTensors = [this](auto& bufferBindings, auto& bindingDescs, gsl::span& tensors) @@ -162,10 +168,10 @@ namespace Dml { if (tensor) { + auto tensorWrapper = static_cast(tensor); + assert(tensor->IsDataInterface()); - ID3D12Resource* resource = m_provider->DecodeResource(tensor); - D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc(); - bufferBindings.push_back({ resource, 0, resourceDesc.Width }); + bufferBindings.push_back(tensorWrapper->GetBufferRegion().GetBufferBinding()); bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() }); } else @@ -176,29 +182,28 @@ namespace Dml } }; - auto FillBindingsFromBuffers = [](auto& bufferBindings, auto& bindingDescs, gsl::span& resources) + auto FillBindingsFromBufferRegions = [](auto& bufferBindings, auto& bindingDescs, gsl::span& bufferRegions) { - for (ID3D12Resource* resource : resources) + for (const D3D12BufferRegion& bufferRegion : bufferRegions) { - if (resource) + bufferBindings.push_back(bufferRegion.GetBufferBinding()); + + if (bufferRegion.ResourceInUavState() != nullptr) { - D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc(); - bufferBindings.push_back({ resource, 0, resourceDesc.Width }); bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() }); } else { - bufferBindings.push_back({ nullptr, 0, 0 }); bindingDescs.push_back({ DML_BINDING_TYPE_NONE, nullptr }); } } }; std::vector inputBufferBindings; - inputBufferBindings.reserve(inputTensors.size()); + inputBufferBindings.reserve(inputBufferRegions.size()); std::vector inputBindings; - inputBindings.reserve(inputTensors.size()); - 
FillBindingsFromBuffers(inputBufferBindings, inputBindings, inputTensors); + inputBindings.reserve(inputBufferRegions.size()); + FillBindingsFromBufferRegions(inputBufferBindings, inputBindings, inputBufferRegions); std::vector outputBufferBindings; outputBufferBindings.reserve(outputTensors.size()); From ea268552105499fed0c954acb9d3cad45d2f0289 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 31 Jan 2023 10:12:31 -0800 Subject: [PATCH 24/76] WIP --- .../src/Operators/DmlOperatorNonZero.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp index 62374963fff1b..61623dfe2b4dd 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp @@ -65,10 +65,6 @@ class DmlOperatorNonZero: public DmlOperator nonzeroCoordinatesDesc.OutputCountTensor = &intermediateDescs[0]; nonzeroCoordinatesDesc.OutputCoordinatesTensor = &intermediateDescs[1]; - // TODO: Remove this hack when DML supports native int64 for NonZero - // We use the int64/uint32 stride hack here, so zero out the data before writing to it - m_zeroOperator = InitializeZeroInt64Tensor(m_intermediateTensorDescs[1].GetBufferSizeInBytes()); - DML_OPERATOR_DESC opDesc = { DML_OPERATOR_NONZERO_COORDINATES, &nonzeroCoordinatesDesc }; SetDmlOperatorDesc(opDesc, kernelCreationContext); } @@ -126,7 +122,11 @@ class DmlOperatorNonZero: public DmlOperator if (!m_emptyInput && nonzeroElementCount > 0) { + std::vector outputCoordinatesStrides = {nonzeroElementCount * 2, 2}; + TensorDesc stridedOutputTensorDesc(DML_TENSOR_DATA_TYPE_UINT32, outputSizes, outputCoordinatesStrides); + // TODO: Remove this hack when DML supports native int64 for NonZero + m_zeroOperator = 
InitializeZeroInt64Tensor(stridedOutputTensorDesc.GetBufferSizeInBytes()); ExecuteZeroInt64Tensor(m_zeroOperator.Get(), outputTensor.GetInterface().Get()); ComPtr sliceOperator = InitializeSlice(m_intermediateTensorDescs[1], nonzeroElementCount); From 61dce2e96d9dfb05d0db70795ea4ea86c06ab313 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 31 Jan 2023 14:18:13 -0800 Subject: [PATCH 25/76] WIP --- .../core/providers/dml/dml_provider_factory.h | 16 ++++----- onnxruntime/core/framework/allocator.cc | 4 ++- .../inc/DmlExecutionProvider.h | 1 - .../src/DmlBfcAllocator.h | 2 +- .../src/DmlExternalGpuAllocator.cpp | 33 +++++++++++++++++++ .../src/DmlExternalGpuAllocator.h | 22 +++++++++++++ .../src/DmlGpuAllocator.cpp | 5 +-- .../src/ExecutionProvider.cpp | 8 ++--- .../providers/dml/dml_provider_factory.cc | 5 ++- winml/lib/Api.Ort/OnnxruntimeEngine.cpp | 3 +- winml/test/common/SqueezeNetValidator.cpp | 4 +-- 11 files changed, 79 insertions(+), 24 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h diff --git a/include/onnxruntime/core/providers/dml/dml_provider_factory.h b/include/onnxruntime/core/providers/dml/dml_provider_factory.h index 6af7dac956560..a8f460b6d54d5 100644 --- a/include/onnxruntime/core/providers/dml/dml_provider_factory.h +++ b/include/onnxruntime/core/providers/dml/dml_provider_factory.h @@ -36,8 +36,8 @@ extern "C" { * The OrtSessionOptionsAppendExecutionProvider_DML export on the OrtDmlApi should be used instead. * * Creates a DirectML Execution Provider which executes on the hardware adapter with the given device_id, also known as - * the adapter index. The device ID corresponds to the enumeration order of hardware adapters as given by - * IDXGIFactory::EnumAdapters. A device_id of 0 always corresponds to the default adapter, which is typically the + * the adapter index. 
The device ID corresponds to the enumeration order of hardware adapters as given by + * IDXGIFactory::EnumAdapters. A device_id of 0 always corresponds to the default adapter, which is typically the * primary display GPU installed on the system. A negative device_id is invalid. */ ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_DML, _In_ OrtSessionOptions* options, int device_id); @@ -49,8 +49,8 @@ ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_DML, _In_ OrtSessionOpti * * Creates a DirectML Execution Provider using the given DirectML device, and which executes work on the supplied D3D12 * command queue. The DirectML device and D3D12 command queue must have the same parent ID3D12Device, or an error will - * be returned. The D3D12 command queue must be of type DIRECT or COMPUTE (see D3D12_COMMAND_LIST_TYPE). If this - * function succeeds, the inference session maintains a strong reference on both the dml_device and the command_queue + * be returned. The D3D12 command queue must be of type DIRECT or COMPUTE (see D3D12_COMMAND_LIST_TYPE). If this + * function succeeds, the inference session maintains a strong reference on both the dml_device and the command_queue * objects. * See also: DMLCreateDevice * See also: ID3D12Device::CreateCommandQueue @@ -65,8 +65,8 @@ typedef struct OrtDmlApi OrtDmlApi; struct OrtDmlApi { /** * Creates a DirectML Execution Provider which executes on the hardware adapter with the given device_id, also known as - * the adapter index. The device ID corresponds to the enumeration order of hardware adapters as given by - * IDXGIFactory::EnumAdapters. A device_id of 0 always corresponds to the default adapter, which is typically the + * the adapter index. The device ID corresponds to the enumeration order of hardware adapters as given by + * IDXGIFactory::EnumAdapters. A device_id of 0 always corresponds to the default adapter, which is typically the * primary display GPU installed on the system. A negative device_id is invalid. 
*/ ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_DML, _In_ OrtSessionOptions* options, int device_id); @@ -74,8 +74,8 @@ struct OrtDmlApi { /** * Creates a DirectML Execution Provider using the given DirectML device, and which executes work on the supplied D3D12 * command queue. The DirectML device and D3D12 command queue must have the same parent ID3D12Device, or an error will - * be returned. The D3D12 command queue must be of type DIRECT or COMPUTE (see D3D12_COMMAND_LIST_TYPE). If this - * function succeeds, the inference session maintains a strong reference on both the dml_device and the command_queue + * be returned. The D3D12 command queue must be of type DIRECT or COMPUTE (see D3D12_COMMAND_LIST_TYPE). If this + * function succeeds, the inference session maintains a strong reference on both the dml_device and the command_queue * objects. * See also: DMLCreateDevice * See also: ID3D12Device::CreateCommandQueue diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc index 08e1221cb7977..7613bb456f6f9 100644 --- a/onnxruntime/core/framework/allocator.cc +++ b/onnxruntime/core/framework/allocator.cc @@ -151,9 +151,11 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA onnxruntime::OpenVINO_GPU, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(id1)), id1, mem_type1); } else if (strcmp(name1, onnxruntime::DML) == 0) { + // Since EPs cannot have 2 allocators with the same OrtMemType and Memory ID, + // we use -1 as the memory ID to represent external allocations that don't have any allocator. 
*out = new OrtMemoryInfo( onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, static_cast(id1)), - id1, mem_type1); + -1, mem_type1); } else { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported."); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index c062ff81d1330..e31f59681b63f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -30,7 +30,6 @@ namespace Dml ID3D12CommandQueue* commandQueue, bool enableMetacommands = true); - ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer); void FlushContext(onnxruntime::IExecutionProvider* provider); void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode); void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h index c00b820434592..17ba37146bdc5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h @@ -14,7 +14,7 @@ namespace Dml DmlBfcAllocator(std::shared_ptr subAllocator) : onnxruntime::IAllocator( OrtMemoryInfo( - "DML", + onnxruntime::DML, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0) ) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp new file mode 100644 index 0000000000000..0ebe2c3d00e5e --- /dev/null +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "precomp.h" +#include "DmlExternalGpuAllocator.h" + +namespace Dml +{ + DmlExternalGpuAllocator::DmlExternalGpuAllocator() + : onnxruntime::IAllocator( + OrtMemoryInfo( + onnxruntime::DML, + OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, 0), + -1 + ) + ) {} + + void* DmlExternalGpuAllocator::Alloc(size_t size_in_bytes) + { + // This allocator should never be used to allocate memory; it should only be use to decode the opaque data pointer + THROW_HR(E_INVALIDARG); + } + + void DmlExternalGpuAllocator::Free(void* ptr) + { + // This allocator should never be used to free memory; it should only be use to decode the opaque data pointer + THROW_HR(E_INVALIDARG); + } + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h new file mode 100644 index 0000000000000..6c5ee8cd29c6e --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/framework/allocator.h" + +namespace Dml +{ + class BucketizedBufferAllocator; + class AllocationInfo; + struct TaggedPointer; + + class DmlExternalGpuAllocator : public onnxruntime::IAllocator + { + public: + DmlExternalGpuAllocator(); + + void* Alloc(size_t size_in_bytes) final; + void Free(void* ptr) final; + }; +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp index 5370515afffd1..5bee9ee34ec4d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp @@ -14,9 +14,10 @@ namespace Dml DmlGpuAllocator::DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator) : onnxruntime::IAllocator( OrtMemoryInfo( - "DML", + onnxruntime::DML, OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0) + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0), + 0 ) ), m_bfcAllocator(bfcAllocator), diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 0a6859f7b615f..3c0874ba7e528 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -24,6 +24,7 @@ #include "DmlGpuAllocator.h" #include "DmlBuffer.h" #include "DmlTaggedPointer.h" +#include "DmlExternalGpuAllocator.h" #ifdef ERROR #undef ERROR @@ -89,6 +90,7 @@ namespace Dml InsertAllocator(m_impl->GetGpuAllocator()); InsertAllocator(m_impl->GetCpuInputAllocator()); InsertAllocator(m_impl->GetCpuOutputAllocator()); + InsertAllocator(std::make_shared()); } std::vector> @@ -980,12 +982,6 @@ namespace Dml return std::make_unique(dmlDevice, 
commandQueue, enableMetacommands); } - ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer) - { - Dml::DmlGpuAllocator* pAllocationInfo = static_cast(allocator); - return pAllocationInfo->GetAllocationInfo(taggedPointer)->GetUavResource(); - } - void FlushContext(onnxruntime::IExecutionProvider* provider) { ExecutionProvider* dmlexecutionprovider = static_cast(provider); diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index ebcbae5e799de..784e0101197d2 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -20,6 +20,7 @@ using Microsoft::WRL::ComPtr; #include "core/framework/error_code_helper.h" #include "DmlExecutionProvider/src/ErrorHandling.h" #include "DmlExecutionProvider/src/GraphicsUnknownHelper.h" +#include "DmlExecutionProvider/src/DmlAllocationInfo.h" #include "DmlExecutionProvider/src/DmlTaggedPointer.h" #include "DmlExecutionProvider/inc/DmlExecutionProvider.h" #include "core/platform/env.h" @@ -218,8 +219,10 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc if (!allocator) { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available"); } - *d3d_resource = Dml::GetD3D12ResourceFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation)); + + *d3d_resource = static_cast(allocation)->GetUavResource(); (*d3d_resource)->AddRef(); + #else *d3d_resource = nullptr; #endif // USE_DML diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp index 905a3e6866f02..e294c91afc079 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp @@ -188,8 +188,7 @@ HRESULT OnnxruntimeValue::GetResource(_winml::Resource& out) { auto allocator = UniqueOrtAllocator(ort_allocator, ort_api->ReleaseAllocator); 
winrt::com_ptr resource; - RETURN_HR_IF_NOT_OK_MSG(ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), mutable_data, - resource.put()), + RETURN_HR_IF_NOT_OK_MSG(ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), mutable_data, resource.put()), ort_api); out = _winml::Resource(resource.get(), [](void*) { /*do nothing, as this pointer is actually a com pointer! */ }); } else { diff --git a/winml/test/common/SqueezeNetValidator.cpp b/winml/test/common/SqueezeNetValidator.cpp index 3c85ef2c351dc..182e1dc806432 100644 --- a/winml/test/common/SqueezeNetValidator.cpp +++ b/winml/test/common/SqueezeNetValidator.cpp @@ -211,11 +211,11 @@ void ModelValidator::SqueezeNet( auto modulePath = FileHelpers::GetModulePath(); auto fullModelPath = modulePath + modelFileName; auto outputFileName = modulePath + outputDataFileName; - + // WinML model creation LearningModel model = nullptr; model = LearningModel::LoadFromFilePath(fullModelPath); - + LearningModelSession modelSession = nullptr; modelSession = LearningModelSession(model, LearningModelDevice(deviceKind)); From b9b3fb8e2836266e1d82d7d9dd0c5f39fda0ab5a Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 31 Jan 2023 14:34:44 -0800 Subject: [PATCH 26/76] WIP --- .../dml/DmlExecutionProvider/inc/DmlExecutionProvider.h | 1 + .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp | 6 ++++++ onnxruntime/core/providers/dml/dml_provider_factory.cc | 8 +++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index e31f59681b63f..c062ff81d1330 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -30,6 +30,7 @@ namespace Dml ID3D12CommandQueue* commandQueue, bool enableMetacommands = true); + ID3D12Resource* 
GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer); void FlushContext(onnxruntime::IExecutionProvider* provider); void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode); void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 3c0874ba7e528..d5c74e8499f89 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -982,6 +982,12 @@ namespace Dml return std::make_unique(dmlDevice, commandQueue, enableMetacommands); } + ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer) + { + Dml::DmlGpuAllocator* pAllocationInfo = static_cast(allocator); + return pAllocationInfo->GetAllocationInfo(taggedPointer)->GetUavResource(); + } + void FlushContext(onnxruntime::IExecutionProvider* provider) { ExecutionProvider* dmlexecutionprovider = static_cast(provider); diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index 784e0101197d2..9a3cc3f739356 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -220,7 +220,13 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available"); } - *d3d_resource = static_cast(allocation)->GetUavResource(); + if (wrapping_allocator->Info()->device.MemType() == OrtDevice::MemType::DML_EXTERNAL) { + *d3d_resource = static_cast(allocation)->GetUavResource(); + } else { + ORT_THROW_HR_IF(E_INVALIDARG, 
wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT); + *d3d_resource = Dml::GetD3D12ResourceFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation)); + } + (*d3d_resource)->AddRef(); #else From 385480786825745c06f7e72ba0d481139e5893b1 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 31 Jan 2023 20:42:02 -0800 Subject: [PATCH 27/76] WIP --- .../core/providers/dml/dml_provider_factory.h | 12 ++++++ .../inc/DmlExecutionProvider.h | 3 +- .../src/ExecutionProvider.cpp | 4 +- .../providers/dml/dml_provider_factory.cc | 37 ++++++++++++++++++- winml/adapter/winml_adapter_dml.cpp | 2 +- .../Api.Image/TensorToVideoFrameConverter.cpp | 29 +++++++++------ .../Api.Image/VideoFrameToTensorConverter.cpp | 19 +++++----- .../inc/TensorToVideoFrameConverter.h | 7 +++- .../inc/VideoFrameToTensorConverter.h | 5 ++- winml/lib/Api.Ort/OnnxruntimeEngine.cpp | 9 +++-- winml/lib/Api.Ort/OnnxruntimeEngine.h | 2 +- winml/lib/Api/ImageFeatureValue.cpp | 34 ++++++++++------- winml/lib/Api/impl/TensorBase.h | 14 +++---- winml/lib/Common/inc/iengine.h | 2 +- 14 files changed, 123 insertions(+), 56 deletions(-) diff --git a/include/onnxruntime/core/providers/dml/dml_provider_factory.h b/include/onnxruntime/core/providers/dml/dml_provider_factory.h index a8f460b6d54d5..47b6e53bed0a0 100644 --- a/include/onnxruntime/core/providers/dml/dml_provider_factory.h +++ b/include/onnxruntime/core/providers/dml/dml_provider_factory.h @@ -100,6 +100,18 @@ struct OrtDmlApi { * This API gets the D3D12 resource when an OrtValue has been allocated by the DML EP. */ ORT_API2_STATUS(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* provider, _In_ void* dml_resource, _Out_ ID3D12Resource** d3d_resource); + + /** + * GetD3D12ResourceRegionFromAllocation + * This API gets the region of a D3D12 resource at a given offset when an OrtValue has been allocated by the DML EP. 
+ * Note: Only the subregion of the resource delimited by `offset` and `offset + size_in_bytes` should be accessed + */ + ORT_API2_STATUS(GetD3D12ResourceRegionFromAllocation, + _In_ OrtAllocator* provider, + _In_ void* dml_resource, + _In_ uint64_t size_in_bytes, + _Out_ ID3D12Resource** d3d_resource, + _Out_ uint64_t* offset); }; #ifdef __cplusplus diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index c062ff81d1330..7e1d9f80038d1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -7,6 +7,7 @@ interface IMLOperatorRegistry; #include "core/common/status.h" #include "core/framework/data_transfer.h" #include "IWinmlExecutionProvider.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h" namespace onnxruntime { @@ -30,7 +31,7 @@ namespace Dml ID3D12CommandQueue* commandQueue, bool enableMetacommands = true); - ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer); + D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer, uint64_t size_in_bytes); void FlushContext(onnxruntime::IExecutionProvider* provider); void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode); void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index d5c74e8499f89..d5f905819d0b9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ 
-982,10 +982,10 @@ namespace Dml return std::make_unique(dmlDevice, commandQueue, enableMetacommands); } - ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer) + D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer, uint64_t sizeInBytes) { Dml::DmlGpuAllocator* pAllocationInfo = static_cast(allocator); - return pAllocationInfo->GetAllocationInfo(taggedPointer)->GetUavResource(); + return pAllocationInfo->CreateBufferRegion(taggedPointer, sizeInBytes); } void FlushContext(onnxruntime::IExecutionProvider* provider) diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index 9a3cc3f739356..2545c85d5fb14 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -220,17 +220,49 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available"); } + // This should never happen since external users of the ORT API should only be able to create DML_EXTERNAL memory + if (wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DML_EXTERNAL) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "The resource has been allocated with "); + } + + *d3d_resource = static_cast(allocation)->GetUavResource(); + (*d3d_resource)->AddRef(); + +#else + *d3d_resource = nullptr; +#endif // USE_DML + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(GetD3D12ResourceRegionFromAllocation, + _In_ OrtAllocator* ort_allocator, + _In_ void* allocation, + _In_ uint64_t size_in_bytes, + _Out_ ID3D12Resource** d3d_resource, + _Out_ uint64_t* offset) { + API_IMPL_BEGIN +#ifdef USE_DML + auto wrapping_allocator = static_cast(ort_allocator); + auto allocator = wrapping_allocator->GetWrappedIAllocator(); 
+ if (!allocator) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available"); + } + if (wrapping_allocator->Info()->device.MemType() == OrtDevice::MemType::DML_EXTERNAL) { *d3d_resource = static_cast(allocation)->GetUavResource(); + *offset = 0; } else { ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT); - *d3d_resource = Dml::GetD3D12ResourceFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation)); + auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation), size_in_bytes); + *d3d_resource = bufferRegion.ResourceInUavState(); } (*d3d_resource)->AddRef(); #else *d3d_resource = nullptr; + *offset = 0; #endif // USE_DML return nullptr; API_IMPL_END @@ -241,7 +273,8 @@ static constexpr OrtDmlApi ort_dml_api_10_to_x = { &OrtSessionOptionsAppendExecutionProviderEx_DML, &CreateGPUAllocationFromD3DResource, &FreeGPUAllocation, - &GetD3D12ResourceFromAllocation + &GetD3D12ResourceFromAllocation, + &GetD3D12ResourceRegionFromAllocation, }; const OrtDmlApi* GetOrtDmlApi(_In_ uint32_t /*version*/) NO_EXCEPTION { diff --git a/winml/adapter/winml_adapter_dml.cpp b/winml/adapter/winml_adapter_dml.cpp index acda0d332b1cb..f3ffda496530f 100644 --- a/winml/adapter/winml_adapter_dml.cpp +++ b/winml/adapter/winml_adapter_dml.cpp @@ -89,7 +89,7 @@ ORT_API_STATUS_IMPL(winmla::OrtSessionOptionsAppendExecutionProviderEx_DML, _In_ // lifetime and can be large, so shouldn't be rounded. // So we create the provider with rounding disabled, and expect the caller to enable it after. 
onnxruntime::DmlConfigureProviderFactoryDefaultRoundingMode(factory, AllocatorRoundingMode::Disabled); - + onnxruntime::DmlConfigureProviderFactoryMetacommandsEnabled(factory, metacommands_enabled); #endif // USE_DML return nullptr; diff --git a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp index b5d0becf638d0..f0a7c601f665e 100644 --- a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp +++ b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp @@ -122,6 +122,7 @@ class ConvertCPUTensorToVideoFrameWithSoftwareBitmapTelemetryEvent { }; void TensorToVideoFrameConverter::DX12TensorToVideoFrame( + _In_ uint64_t inputTensorOffset, _In_ UINT32 batchIdx, _In_ winml::LearningModelSession& session, _In_ ID3D12Resource* pInputTensor, @@ -136,7 +137,7 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( wgi::SoftwareBitmap softwareBitmap = destVideoFrame.SoftwareBitmap(); if (softwareBitmap) { - ConvertGPUTensorToSoftwareBitmap(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, softwareBitmap); + ConvertGPUTensorToSoftwareBitmap(inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, softwareBitmap); } else if (spDestDirect3DSurface) { bool isUAVSupportedFormat = _winmli::FormatSupportedForUAV( pDeviceCache->GetD3D12Device(), @@ -144,7 +145,7 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( // UAV support for formats is device dependent if (!isUAVSupportedFormat) { - ConvertDX12TensorToUnsupportedVideoFrameFormat(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, destVideoFrame); + ConvertDX12TensorToUnsupportedVideoFrameFormat(inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, destVideoFrame); } else { ComPtr spVideoFrameTexture = _winmli::GetTextureFromDirect3DSurface(destVideoFrame.Direct3DSurface()); @@ -168,7 +169,7 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( } // Detensorize - ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, 
*pDeviceCache, tensorDesc, output_resource_.Get()); + ConvertGPUTensorToDX12Texture(inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get()); // Make sure that detensorization is done SyncD3D12ToD3D11(*pDeviceCache, D3D11_cached_texture_.Get()); @@ -196,7 +197,7 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( } // Detensorize - ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get()); + ConvertGPUTensorToDX12Texture(inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get()); // Make sure that detensorization is done SyncD3D12ToD3D11(*pDeviceCache, spSharedD3D11Texture.Get()); @@ -241,6 +242,7 @@ ComPtr TensorToVideoFrameConverter::CreateShareableD3D12Texture( } void TensorToVideoFrameConverter::ConvertDX12TensorToUnsupportedVideoFrameFormat( + _In_ uint64_t input_tensor_offset, _In_ UINT32 batchIdx, _In_ ID3D12Resource* pInputTensor, _In_ _winml::D3DDeviceCache& device_cache, @@ -288,7 +290,7 @@ void TensorToVideoFrameConverter::ConvertDX12TensorToUnsupportedVideoFrameFormat converted_video_frame_ = wm::VideoFrame::CreateWithDirect3D11Surface(surface); // Detensorize - ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, device_cache, tensorDesc, output_resource_.Get()); + ConvertGPUTensorToDX12Texture(input_tensor_offset, batchIdx, pInputTensor, device_cache, tensorDesc, output_resource_.Get()); // Wait for the D3D12 work to complete before using the resource SyncD3D12ToD3D11(device_cache, spSharedD3D11Texture.Get()); @@ -387,6 +389,7 @@ void TensorToVideoFrameConverter::SoftwareTensorToVideoFrame( } void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( + _In_ uint64_t inputTensorOffset, _In_ UINT32 batchIdx, _In_ ID3D12Resource* pInputResource, _In_ _winml::D3DDeviceCache& device_cache, @@ -460,7 +463,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( // Create SRV and UAV for input and output 
respectively { - D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = CreateSRVDescriptor(batchIdx, inputDesc, tensorDesc); + D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = CreateSRVDescriptor(inputTensorOffset, batchIdx, inputDesc, tensorDesc); CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), SrvBufferIdx, srvUavDescriptorSize); spDx12Device->CreateShaderResourceView(pInputResource, &srvDesc, srvHandle); @@ -545,6 +548,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( } void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap( + _In_ uint64_t inputTensorOffset, _In_ UINT32 batchIdx, _In_ ID3D12Resource* pInputTensor, _In_ _winml::D3DDeviceCache& device_cache, @@ -579,7 +583,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap( auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pInputTensor, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE); command_list_->ResourceBarrier(1, &barrier); - command_list_->CopyBufferRegion(readback_heap_.Get(), 0, pInputTensor, singleVideoFramebufferSize * batchIdx, singleVideoFramebufferSize); + command_list_->CopyBufferRegion(readback_heap_.Get(), 0, pInputTensor, inputTensorOffset + singleVideoFramebufferSize * batchIdx, singleVideoFramebufferSize); WINML_THROW_IF_FAILED(command_list_->Close()); ID3D12CommandList* ppCommandLists[] = {command_list_.Get()}; @@ -645,6 +649,7 @@ void TensorToVideoFrameConverter::ConvertBatchedDX12TensorToBuffers( } D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor( + uint64_t offset, const UINT32 batchIdx, const D3D12_RESOURCE_DESC& resourceDesc, const _winml::ImageTensorDescription& desc) { @@ -655,7 +660,7 @@ D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; srvDesc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER; UINT singleImageSize = static_cast(desc.sizes[1] * 
desc.sizes[2] * desc.sizes[3]); - srvDesc.Buffer.FirstElement = batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3]; + srvDesc.Buffer.FirstElement = offset + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3]; srvDesc.Buffer.NumElements = singleImageSize; srvDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_NONE; @@ -736,10 +741,10 @@ void TensorToVideoFrameConverter::ConvertCPUTensorToSoftwareBitmap( if (tensorDesc.dataType == kImageTensorDataTypeFloat32) { WINML_THROW_IF_FAILED(CpuDetensorizer::Detensorize( tensorDesc.channelType, - targetChannelType, - tensorDesc.pixelRange, + targetChannelType, + tensorDesc.pixelRange, static_cast(pCPUTensor), - bufferWidth, + bufferWidth, height, width, pData)); @@ -754,4 +759,4 @@ void TensorToVideoFrameConverter::ConvertCPUTensorToSoftwareBitmap( width, pData)); } -} \ No newline at end of file +} diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp index 1215548d212c5..2ed8f04bbcb3b 100644 --- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp +++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp @@ -188,6 +188,7 @@ ComPtr VideoFrameToTensorConverter::ShareD3D11Texture(ID3D11Text } void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( + _In_ uint64_t outputTensorOffset, _In_ const UINT32 batchIdx, _In_ winml::LearningModelSession& session, _In_ const wm::IVideoFrame& inputVideoFrame, @@ -206,7 +207,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( wgdx::Direct3D11::IDirect3DSurface spDirect3DSurface = inputVideoFrame.Direct3DSurface(); if (inputVideoFrame.SoftwareBitmap()) { - ConvertSoftwareBitmapToGPUTensor(batchIdx, inputVideoFrame, *pDeviceCache, inputBounds, tensorDesc, pOutputTensor); + ConvertSoftwareBitmapToGPUTensor(batchIdx, inputVideoFrame, *pDeviceCache, inputBounds, tensorDesc, outputTensorOffset, pOutputTensor); } else if (spDirect3DSurface) { ComPtr spVideoFrameTexture; wgi::BitmapBounds scaledBounds = inputBounds; @@ 
-278,7 +279,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( // We cropped the texture, shared it and converted it to a known color format, so it's time to tensorize // TODO: merge all videoframes to a single DX12Texture Resource before call ConvertDX12TextureToGPUTensor. - ConvertDX12TextureToGPUTensor(batchIdx, input_D3D12_resource_.Get(), *pDeviceCache, tensorDesc, pOutputTensor); + ConvertDX12TextureToGPUTensor(outputTensorOffset, batchIdx, input_D3D12_resource_.Get(), *pDeviceCache, tensorDesc, pOutputTensor); } else { // Invalid video frame WINML_THROW_IF_FAILED(E_INVALIDARG); @@ -286,6 +287,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( } void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( + _In_ uint64_t output_resource_offset, _In_ UINT32 batchIdx, _In_ ID3D12Resource* pInputResource, _In_ _winml::D3DDeviceCache& device_cache, @@ -339,7 +341,7 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( UINT64 ullTensorSize = 0; WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, uiTensorElementSize, &ullTensorSize)); - if (outputDesc.Width < ullTensorSize || + if (outputDesc.Width < output_resource_offset + ullTensorSize || outputDesc.Height != 1 || outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)) { @@ -381,7 +383,7 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), SrvBufferIdx, srvUavDescriptorSize); spDx12Device->CreateShaderResourceView(pInputResource, &srvDesc, srvHandle); - D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = CreateUAVDescription(batchIdx, outputDesc, tensorDesc); + D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = CreateUAVDescription(output_resource_offset, batchIdx, tensorDesc); CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), UavBufferIdx, 
srvUavDescriptorSize); spDx12Device->CreateUnorderedAccessView(pOutputResource, nullptr, &uavDesc, uavHandle); } @@ -458,6 +460,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( _In_ _winml::D3DDeviceCache& device_cache, _In_ const wgi::BitmapBounds& inputBounds, _In_ const ImageTensorDescription& tensorDesc, + _In_ uint64_t outputResourceOffset, _Inout_ ID3D12Resource* pOutputResource) { assert(pOutputResource != nullptr); assert(videoFrame.SoftwareBitmap() != nullptr); @@ -495,8 +498,6 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( assert(convertedSoftwareBitmap != nullptr); - D3D12_RESOURCE_DESC outputDesc = pOutputResource->GetDesc(); - uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2; uint32_t bufferSize = static_cast(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize); @@ -526,7 +527,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST); command_list_->ResourceBarrier(1, &barrier); - command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx, upload_heap_.Get(), 0, bufferSize); + command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx, upload_heap_.Get(), outputResourceOffset, bufferSize); WINML_THROW_IF_FAILED(command_list_->Close()); ID3D12CommandList* ppCommandLists[] = {command_list_.Get()}; @@ -578,8 +579,8 @@ void VideoFrameToTensorConverter::ConvertBuffersToBatchedGPUTensor( } D3D12_UNORDERED_ACCESS_VIEW_DESC VideoFrameToTensorConverter::CreateUAVDescription( + uint64_t offset, const UINT32 batchIdx, - const D3D12_RESOURCE_DESC& resourceDesc, const _winml::ImageTensorDescription& desc) { UINT uiTensorElementSize = desc.dataType == kImageTensorDataTypeFloat32 ? 
sizeof(UINT) : sizeof(uint16_t); @@ -587,7 +588,7 @@ D3D12_UNORDERED_ACCESS_VIEW_DESC VideoFrameToTensorConverter::CreateUAVDescripti D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {}; uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER; UINT singleImageSize = static_cast(desc.sizes[1] * desc.sizes[2] * desc.sizes[3]); - uavDesc.Buffer.FirstElement = batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3]; + uavDesc.Buffer.FirstElement = offset + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3]; uavDesc.Buffer.NumElements = singleImageSize; uavDesc.Buffer.CounterOffsetInBytes = 0; uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE; diff --git a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h index b7b8333313054..8dac4cd9bd458 100644 --- a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h +++ b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h @@ -15,6 +15,7 @@ class TensorToVideoFrameConverter : public ImageConverter { // Function takes in a tensor DX12 Resource all compute ops should be completed // converts it to a VideoFrame backed by either a SoftwareBitmap or D3DSurface void DX12TensorToVideoFrame( + _In_ uint64_t inputTensorOffset, _In_ UINT32 batch_index, _In_ winml::LearningModelSession& session, _In_ ID3D12Resource* input_tensor, @@ -47,6 +48,7 @@ class TensorToVideoFrameConverter : public ImageConverter { Microsoft::WRL::ComPtr ShareD3D12Texture(ID3D12Resource* pResource, ID3D11Device* pDevice); void ConvertGPUTensorToSoftwareBitmap( + _In_ uint64_t inputTensorOffset, _In_ UINT32 batch_index, _In_ ID3D12Resource* input_tensor, _In_ _winml::D3DDeviceCache& device_cache, @@ -54,6 +56,7 @@ class TensorToVideoFrameConverter : public ImageConverter { _Inout_ wgi::SoftwareBitmap& software_bitmap); void ConvertGPUTensorToDX12Texture( + _In_ uint64_t inputTensorOffset, _In_ UINT32 batch_index, _In_ ID3D12Resource* input_resource, _In_ _winml::D3DDeviceCache& device_cache, @@ -61,6 +64,7 @@ 
class TensorToVideoFrameConverter : public ImageConverter { _Inout_ ID3D12Resource* output_resource); void ConvertDX12TensorToUnsupportedVideoFrameFormat( + _In_ uint64_t input_tensor_offset, _In_ UINT32 batch_index, _In_ ID3D12Resource* input_tensor, _In_ _winml::D3DDeviceCache& device_cache, @@ -68,6 +72,7 @@ class TensorToVideoFrameConverter : public ImageConverter { _Inout_ wm::VideoFrame& unsupported_video_frame); static D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor( + uint64_t offset, const UINT32 batch_index, const D3D12_RESOURCE_DESC& resource_description, const ImageTensorDescription& description); @@ -81,4 +86,4 @@ class TensorToVideoFrameConverter : public ImageConverter { const D3D11_TEXTURE2D_DESC& d3d11Desc, ID3D12Device* d3d12Device); }; -} // namespace _winml \ No newline at end of file +} // namespace _winml diff --git a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h index 4f0a010cc367f..e69f929936f2e 100644 --- a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h +++ b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h @@ -21,6 +21,7 @@ class VideoFrameToTensorConverter : public ImageConverter { // {upperleft X, upperleft Y, width, height} to be turned into a tensor. // If the region of interest is the entire VideoFrame, the input BitmapBounds should describe the entire image. 
void VideoFrameToDX12Tensor( + _In_ uint64_t output_tensor_offset, _In_ const UINT32 batch_index, _In_ winml::LearningModelSession& session, _In_ const wm::IVideoFrame& input_video_frame, @@ -61,9 +62,11 @@ class VideoFrameToTensorConverter : public ImageConverter { _In_ _winml::D3DDeviceCache& device_cache, _In_ const wgi::BitmapBounds& input_bounds, _In_ const ImageTensorDescription& tensor_description, + _In_ uint64_t outputResourceOffset, _Inout_ ID3D12Resource* pOutputResource); void ConvertDX12TextureToGPUTensor( + _In_ uint64_t output_resource_offset, _In_ const UINT32 batch_index, _In_ ID3D12Resource* pInputResource, _In_ _winml::D3DDeviceCache& device_cache, @@ -71,8 +74,8 @@ class VideoFrameToTensorConverter : public ImageConverter { _Inout_ ID3D12Resource* output_resource); static D3D12_UNORDERED_ACCESS_VIEW_DESC CreateUAVDescription( + uint64_t offset, const UINT32 batch_index, - const D3D12_RESOURCE_DESC& resource_description, const ImageTensorDescription& description); static void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor( diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp index e294c91afc079..c285ca3646b1d 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp @@ -165,7 +165,7 @@ static auto GetStrings(const OrtApi* ort_api, const OrtValue* ort_value, return std::make_shared>(std::move(strings), std::move(buffer)); } -HRESULT OnnxruntimeValue::GetResource(_winml::Resource& out) { +HRESULT OnnxruntimeValue::GetResource(uint64_t size_in_bytes, _winml::Resource& out, uint64_t& offset) { auto ort_api = engine_->GetEngineFactory()->UseOrtApi(); void* mutable_data = nullptr; @@ -188,7 +188,7 @@ HRESULT OnnxruntimeValue::GetResource(_winml::Resource& out) { auto allocator = UniqueOrtAllocator(ort_allocator, ort_api->ReleaseAllocator); winrt::com_ptr resource; - RETURN_HR_IF_NOT_OK_MSG(ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), 
mutable_data, resource.put()), + RETURN_HR_IF_NOT_OK_MSG(ort_dml_api->GetD3D12ResourceRegionFromAllocation(allocator.get(), mutable_data, size_in_bytes, resource.put(), &offset), ort_api); out = _winml::Resource(resource.get(), [](void*) { /*do nothing, as this pointer is actually a com pointer! */ }); } else { @@ -1296,10 +1296,11 @@ HRESULT OnnxruntimeEngine::FillFromMapValue(IInspectable* map, winml::TensorKind std::vector keys_shape; keys_value->GetTensorShape(keys_shape); + uint64_t offset = 0; _winml::Resource keys_data; - RETURN_IF_FAILED(keys_value->GetResource(keys_data)); + RETURN_IF_FAILED(keys_value->GetResource(0, keys_data, offset)); _winml::Resource values_data; - RETURN_IF_FAILED(values_value->GetResource(values_data)); + RETURN_IF_FAILED(values_value->GetResource(0, values_data, offset)); auto num_elements = static_cast(ShapeSize(keys_shape.data(), keys_shape.size())); GetAbiMapFiller(key_kind, value_kind)(map, num_elements, keys_data.get(), values_data.get()); diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.h b/winml/lib/Api.Ort/OnnxruntimeEngine.h index 097941e78f1a5..7c53886da9821 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngine.h +++ b/winml/lib/Api.Ort/OnnxruntimeEngine.h @@ -32,7 +32,7 @@ class OnnxruntimeValue : public Microsoft::WRL::RuntimeClass< STDMETHOD(IsCpu) (bool* out) override; STDMETHOD(GetResource) - (_winml::Resource& resource) override; + (uint64_t size_in_bytes, _winml::Resource& resource, uint64_t& offset) override; STDMETHOD(IsTensor) (bool* out) override; STDMETHOD(IsOfTensorType) diff --git a/winml/lib/Api/ImageFeatureValue.cpp b/winml/lib/Api/ImageFeatureValue.cpp index 1b8be103c0e5a..6622893b88a06 100644 --- a/winml/lib/Api/ImageFeatureValue.cpp +++ b/winml/lib/Api/ImageFeatureValue.cpp @@ -214,12 +214,12 @@ static _winml::ImageTensorDescription CreateImageTensorDescriptor(winml::TensorK THROW_HR(E_NOTIMPL); } - if (pixelRange != winml::LearningModelPixelRange::ZeroTo255 && + if (pixelRange != 
winml::LearningModelPixelRange::ZeroTo255 && pixelRange != winml::LearningModelPixelRange::ZeroToOne && pixelRange != winml::LearningModelPixelRange::MinusOneToOne) { THROW_HR(E_NOTIMPL); } - + tensorDescription.pixelRange = pixelRange; tensorDescription.sizes[2] = height; tensorDescription.sizes[3] = width; @@ -275,6 +275,7 @@ static void GPUTensorize( _winml::ImageTensorDescription tensorDescriptor, com_ptr spSession, ID3D12Resource* d3dResource, + uint64_t resourceOffset, _winml::BindingContext& context) { auto spDevice = spSession->Device().as(); @@ -291,6 +292,7 @@ static void GPUTensorize( // Apply tensorization auto session = spSession.as(); pooledConverter->Get()->Tensorizer->VideoFrameToDX12Tensor( + resourceOffset, batchIdx, session, videoFrames.GetAt(batchIdx), @@ -417,7 +419,7 @@ std::optional ImageFeatureValue::GetIn } else { THROW_HR(WINML_ERR_INVALID_BINDING); } - + //NCHW layout auto imageTensorDescriptor = CreateImageTensorDescriptor(tensorKind, pixelFormat.value(), pixelRange.value(), m_batchSize, descriptorWidth, descriptorHeight); @@ -447,21 +449,23 @@ HRESULT ImageFeatureValue::GetValue(_winml::BindingContext& context, _winml::IVa winml::TensorKind::Float : winml::TensorKind::Float16, value.put())); + auto bufferSize = std::accumulate(std::begin(resourceMetadata.TensorDescriptor.sizes), std::end(resourceMetadata.TensorDescriptor.sizes), static_cast(1), std::multiplies()); + auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize; + // Get the tensor raw data _winml::Resource void_resource; - RETURN_IF_FAILED(value->GetResource(void_resource)); + uint64_t offset = 0; + RETURN_IF_FAILED(value->GetResource(bufferByteSize, void_resource, offset)); if (context.type == _winml::BindingType::kInput) { // Only tensorize inputs - auto bufferSize = std::accumulate(std::begin(resourceMetadata.TensorDescriptor.sizes), std::end(resourceMetadata.TensorDescriptor.sizes), static_cast(1), std::multiplies()); - 
auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize; auto singleFrameBufferSize = bufferByteSize / m_batchSize; if (spDevice->IsCpuDevice()) { auto resource = reinterpret_cast(void_resource.get()); CPUTensorize(m_videoFrames, resourceMetadata.Bounds, resourceMetadata.TensorDescriptor, spSession, resource, static_cast(singleFrameBufferSize)); } else { auto resource = reinterpret_cast(void_resource.get()); - GPUTensorize(m_videoFrames, resourceMetadata.Bounds, resourceMetadata.TensorDescriptor, spSession, resource, context); + GPUTensorize(m_videoFrames, resourceMetadata.Bounds, resourceMetadata.TensorDescriptor, spSession, resource, offset, context); } } @@ -481,14 +485,18 @@ HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& cont auto spSession = context.session.as(); auto spDevice = spSession->Device().as(); - // Get the output tensor raw data - _winml::Resource void_resource; - RETURN_IF_FAILED(value->GetResource(void_resource)); - // Get the run context auto metadata = GetInputMetadata(context); ImageResourceMetadata resourceMetadata = metadata.value(); + auto bufferSize = std::accumulate(std::begin(resourceMetadata.TensorDescriptor.sizes), std::end(resourceMetadata.TensorDescriptor.sizes), static_cast(1), std::multiplies()); + auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize / m_batchSize; + + // Get the output tensor raw data + _winml::Resource void_resource; + uint64_t offset = 0; + RETURN_IF_FAILED(value->GetResource(bufferByteSize, void_resource, offset)); + _winml::ConverterResourceDescription descriptor = {}; descriptor.width = static_cast(resourceMetadata.TensorDescriptor.sizes[3]); descriptor.height = static_cast(resourceMetadata.TensorDescriptor.sizes[2]); @@ -500,9 +508,6 @@ HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& cont auto pooledConverter = 
_winml::PoolObjectWrapper::Create(spDevice->DetensorizerStore()->Fetch(descriptor)); - auto bufferSize = std::accumulate(std::begin(resourceMetadata.TensorDescriptor.sizes), std::end(resourceMetadata.TensorDescriptor.sizes), static_cast(1), std::multiplies()); - auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize / m_batchSize; - BYTE* resource = reinterpret_cast(void_resource.get()); for (uint32_t batchIdx = 0; batchIdx < m_batchSize; ++batchIdx) { // Convert Software Tensor to VideoFrame one by one based on the buffer size. @@ -521,6 +526,7 @@ HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& cont for (uint32_t batchIdx = 0; batchIdx < m_batchSize; ++batchIdx) { auto videoFrame = m_videoFrames.GetAt(batchIdx); pooledConverter->Get()->Detensorizer->DX12TensorToVideoFrame( + offset, batchIdx, context.session, d3dResource, diff --git a/winml/lib/Api/impl/TensorBase.h b/winml/lib/Api/impl/TensorBase.h index 0d210eb2d7694..02c837c494a7e 100644 --- a/winml/lib/Api/impl/TensorBase.h +++ b/winml/lib/Api/impl/TensorBase.h @@ -148,7 +148,7 @@ struct TensorBase : TBase { // If there is no matching gpu resource, then fallback to a cpu resource if (CpuTensor() != nullptr) { - auto num_backing_buffers = CpuTensor()->num_buffers(); + auto num_backing_buffers = CpuTensor()->num_buffers(); if (num_backing_buffers == 1) { // If we have a single backing cpu buffer, there is no need to create GPU resources. // The engine will use the buffer provided, and perform the needed copies into the GPU context as needed. 
@@ -360,11 +360,13 @@ struct TensorBase : TBase { resources_, "The tensor has been closed and its resources have been detached during evaluation!"); - _winml::Resource updated_resource; - RETURN_IF_FAILED(value->GetResource(updated_resource)); - // get the shape RETURN_IF_FAILED_MSG(value->GetTensorShape(shape_), "Failed to get the tensor shape from resource!"); + auto buffer_size_in_bytes = static_cast(ShapeSize(shape_)) * sizeof(T); + + _winml::Resource updated_resource; + uint64_t offset = 0; + RETURN_IF_FAILED(value->GetResource(buffer_size_in_bytes, updated_resource, offset)); bool is_cpu; bool isCpuOutput = SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu; @@ -406,8 +408,6 @@ struct TensorBase : TBase { "Failed to prepare buffer for copy back from device resource."); RETURN_IF_FAILED(engine->CopyValueAcrossDevices(value, dest.get())); } else { - auto buffer_size_in_bytes = static_cast(ShapeSize(shape_)) * sizeof(T); - _winml::ConverterResourceDescription descriptor = {}; descriptor.pixel_format = static_cast(wgdx::DirectXPixelFormat::Unknown); descriptor.luid = device->GetD3DDevice()->GetAdapterLuid(); // Converted image on GPU @@ -526,7 +526,7 @@ struct TensorBase : TBase { } WINML_CATCH_ALL - + // ITensor::CreateFromBatchedBuffersInternal static typename TBase::class_type CreateFromBatchedBuffersInternal( std::vector shape, diff --git a/winml/lib/Common/inc/iengine.h b/winml/lib/Common/inc/iengine.h index 1db8bc4568aac..a686b585841c3 100644 --- a/winml/lib/Common/inc/iengine.h +++ b/winml/lib/Common/inc/iengine.h @@ -19,7 +19,7 @@ IValue : IUnknown { (bool* out) PURE; STDMETHOD(GetResource) - (_winml::Resource & resource) PURE; + (uint64_t size_in_bytes, _winml::Resource & resource, uint64_t& offset) PURE; STDMETHOD(IsTensor) (bool* out) PURE; From 93d931b5cb4e16ac36bf7457c40698cc3ffe4696 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 1 Feb 2023 15:29:10 -0800 Subject: [PATCH 28/76] WIP --- 
.../providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index 9dfc1672708c0..502bb187bb0db 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -65,6 +65,7 @@ namespace Dml if (persistentResourceSize > 0) { auto buffer = m_provider->AllocatePooledResource(persistentResourceSize); + m_persistentResource = buffer.ResourceInUavState(); m_persistentResourceBinding = buffer.GetBufferBinding(); m_managedPersistentBuffer = wil::MakeOrThrow(std::move(buffer)); m_winmlProvider->QueueReference(m_managedPersistentBuffer.Get()); From 9c03955a2bd2d80f26afe1eddc6639623110680b Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 26 Apr 2023 16:23:42 -0700 Subject: [PATCH 29/76] Add hack to work around OOM errors with upload heaps --- .../src/PooledUploadHeap.cpp | 44 +++++-------------- 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp index db5fd301cfdd0..442b3e7ddf746 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp @@ -99,36 +99,27 @@ namespace Dml auto heap = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD); auto buffer = CD3DX12_RESOURCE_DESC::Buffer(sizeInBytes); - ORT_THROW_IF_FAILED(device->CreateCommittedResource( + HRESULT hr = device->CreateCommittedResource( &heap, D3D12_HEAP_FLAG_NONE, &buffer, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, - IID_GRAPHICS_PPV_ARGS(uploadBuffer.ReleaseAndGetAddressOf()))); + 
IID_GRAPHICS_PPV_ARGS(uploadBuffer.ReleaseAndGetAddressOf())); + + if (hr == DXGI_ERROR_DEVICE_REMOVED) + { + ORT_THROW_IF_FAILED(device->GetDeviceRemovedReason()); + } + ORT_THROW_IF_FAILED(hr); return Chunk{ sizeInBytes, std::move(uploadBuffer) }; } std::pair PooledUploadHeap::Reserve(size_t sizeInBytes) { - // Try to find a chunk with enough free space to accommodate the requested allocation size - for (Chunk& chunk : m_chunks) - { - std::optional offsetForAllocation = FindOffsetForAllocation(chunk, sizeInBytes); - if (offsetForAllocation) - { - // There's enough space in this chunk - return - return std::make_pair(&chunk, *offsetForAllocation); - } - } - - // No chunks were able to accommodate the allocation - create a new chunk and return that instead - // At least double the capacity of the pool - const size_t newChunkSize = std::max({ m_totalCapacity, c_minChunkSize, sizeInBytes }); - m_chunks.push_back(CreateChunk(m_device.Get(), newChunkSize)); - m_totalCapacity += newChunkSize; + m_chunks.push_back(CreateChunk(m_device.Get(), sizeInBytes)); // Allocate from the beginning of the new chunk return std::make_pair(&m_chunks.back(), 0); @@ -206,13 +197,6 @@ namespace Dml return c.allocations.empty(); }); m_chunks.erase(it, m_chunks.end()); - - // Re-calculate total capacity - m_totalCapacity = 0; - for (const auto& chunk : m_chunks) - { - m_totalCapacity += chunk.capacityInBytes; - } } void PooledUploadHeap::AssertInvariants() @@ -224,7 +208,7 @@ namespace Dml }; // Chunks should be sorted by ascending capacity - assert(std::is_sorted(m_chunks.begin(), m_chunks.end(), chunkCapacityComparer)); + // assert(std::is_sorted(m_chunks.begin(), m_chunks.end(), chunkCapacityComparer)); // Allocations in a chunk should be sorted by ascending fence value for (const auto& chunk : m_chunks) @@ -270,14 +254,6 @@ namespace Dml } } - // Validate total capacity of pool - size_t calculatedCapacity = 0; - for (const auto& chunk : m_chunks) - { - calculatedCapacity += 
chunk.capacityInBytes; - } - assert(calculatedCapacity == m_totalCapacity); - #endif // #ifdef _DEBUG } } // namespace Dml From e34abafb9015f3c55c32e951ad96001658a02f81 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 5 Jul 2023 17:00:43 -0700 Subject: [PATCH 30/76] Fix DFT --- .../src/Operators/DmlDFT.h | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h index c69fcdf035e21..33fef8c4c0d04 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h @@ -402,16 +402,6 @@ class GpuDFTOperator : public WRL::Base auto outputDims = GetTensorDimensions(outputTensor.Get()); ORT_THROW_HR_IF(E_FAIL, inputDims.size() != outputDims.size()); - ComPtr inputUnknown; - ComPtr inputResource; - inputTensor->GetDataInterface(inputUnknown.GetAddressOf()); - ORT_THROW_IF_FAILED(inputUnknown.As(&inputResource)); - - ComPtr outputUnknown; - ComPtr outputResource; - outputTensor->GetDataInterface(outputUnknown.GetAddressOf()); - ORT_THROW_IF_FAILED(outputUnknown.As(&outputResource)); - // Get optional dft_length input uint32_t dftLength = inputDims[onnxruntime::narrow(m_axis)]; ComPtr dftLengthTensor; @@ -685,8 +675,10 @@ class GpuDFTOperator : public WRL::Base // Padding should be handled by the shader. PrepareStockhamFFTParams( context, - inputBufferRegion, inputDims, - zChirpBufferRegion, params.BluesteinZChirpParams.AFFT.Sizes, + inputBufferRegion, + inputDims, + aFFTBufferRegion, + params.BluesteinZChirpParams.AFFT.Sizes, M, m_axis, 1, @@ -698,8 +690,10 @@ class GpuDFTOperator : public WRL::Base // Therefore the window function logic shold hangle complex multiplication, and B_FTT should be used like a window function. 
PrepareStockhamFFTParams( context, - zChirpBufferRegion, params.BluesteinZChirpParams.AFFT.Sizes, - outputBufferRegion, outputDims, + aFFTBufferRegion, + params.BluesteinZChirpParams.AFFT.Sizes, + outputBufferRegion, + outputDims, M, 1, m_axis, @@ -715,8 +709,10 @@ class GpuDFTOperator : public WRL::Base // The BFFT call takes input B, and produces output B_FFT. PrepareStockhamFFTParams( context, - bBufferRegion, params.BluesteinZChirpParams.B.Sizes, - bFFTBufferRegion, params.BluesteinZChirpParams.BFFT.Sizes, + bBufferRegion, + params.BluesteinZChirpParams.B.Sizes, + bFFTBufferRegion, + params.BluesteinZChirpParams.BFFT.Sizes, M, 2, 2, From 00708a6a34cf6e02a1906d053093ea11ed9065f3 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 5 Jul 2023 18:56:24 -0700 Subject: [PATCH 31/76] Register external allocator --- .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp | 3 ++- .../providers/dml/DmlExecutionProvider/src/ExecutionProvider.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 871551597f468..33c2a99d19e79 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -212,9 +212,10 @@ namespace Dml m_context->SetAllocator(m_gpuAllocator); // CPU Allocator used to create buffers for the MemcpyFromHost, Shape and Size operators. 
m_cpuInputAllocator = std::make_shared(OrtMemType::OrtMemTypeCPUInput); + m_externalGpuAllocator = std::make_shared(); } - return std::vector{m_gpuAllocator, m_cpuInputAllocator,}; + return std::vector{m_gpuAllocator, m_externalGpuAllocator, m_cpuInputAllocator}; } HRESULT __stdcall ExecutionProviderImpl::GetD3DDevice(_COM_Outptr_ ID3D12Device** d3dDevice) const noexcept diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 1ddf1ca5dfe30..a959042dab32c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -28,6 +28,7 @@ namespace Dml class DmlCpuAllocator; class ExecutionProvider; class DmlGpuAllocator; + class DmlExternalGpuAllocator; struct TaggedPointer; class ExecutionProviderImpl : public WRL::Base m_subAllocator; std::shared_ptr m_bfcAllocator; std::shared_ptr m_gpuAllocator; + std::shared_ptr m_externalGpuAllocator; std::shared_ptr m_cpuInputAllocator; std::shared_ptr m_kernelRegistry; std::shared_ptr m_internalRegInfoMap; From 9927336dd4526b3a346b6be8b98b9ee59eda6b70 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 6 Jul 2023 08:54:54 -0700 Subject: [PATCH 32/76] Fix DFT and STFT --- .../src/Operators/DmlDFT.h | 14 ++--- .../src/Operators/DmlSTFT.h | 59 ++++++++----------- 2 files changed, 30 insertions(+), 43 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h index 33fef8c4c0d04..ed1a6ebe49171 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h @@ -838,15 +838,6 @@ class GpuDFTOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { - 
D3D12_RESOURCE_BARRIER uav_barriers[TSize+1]; - - std::transform( - bufferRegions.begin(), bufferRegions.end(), - uav_barriers, - [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); - uav_barriers[TSize] = CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr); - commandList->ResourceBarrier(TSize, uav_barriers); - for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -895,7 +886,10 @@ class GpuDFTOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - commandList->ResourceBarrier(3, uav_barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h index fc7684242b290..cd1f78e2a23a6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h @@ -192,9 +192,7 @@ class DmlSTFTOperator : public WRL::Base ComPtr descriptorHeap; ComPtr bindingTable; ComPtr commandRecorder; - ComPtr persistentResource; - ComPtr persistentResourcePoolingUnk; - std::optional persistentResourceBinding; + std::optional persistentBufferRegion; bool hasWindowTensor = false; uint64_t signalBufferSizeInBytes = 0; uint64_t windowBufferSizeInBytes = 0; @@ -315,20 +313,29 @@ class DmlSTFTOperator : public WRL::Base // Initialize { + std::vector initializationInputBindings(params.hasWindowTensor ? 
2 : 1); + uint64_t persistentResourceSize = m_framingOperator.op->GetBindingProperties().PersistentResourceSize; if (persistentResourceSize > 0) { - auto buffer = m_dmlProvider->AllocatePooledResource(persistentResourceSize); - m_framingOperator.persistentResource = buffer.ResourceInUavState(); - m_framingOperator.persistentResourceBinding = buffer.GetBufferBinding(); + m_framingOperator.persistentBufferRegion = m_dmlProvider->AllocatePooledResource(persistentResourceSize); + auto binding = m_framingOperator.persistentBufferRegion->GetBufferBinding(); + ORT_THROW_IF_FAILED(m_dmlProvider->InitializeOperator( + m_framingOperator.op.Get(), + &binding, + gsl::make_span(initializationInputBindings) + )); + } + else + { + ORT_THROW_IF_FAILED(m_dmlProvider->InitializeOperator( + m_framingOperator.op.Get(), + nullptr, + gsl::make_span(initializationInputBindings) + )); } - std::vector initializationInputBindings(params.hasWindowTensor ? 2 : 1); - ORT_THROW_IF_FAILED(m_dmlProvider->InitializeOperator( - m_framingOperator.op.Get(), - m_framingOperator.persistentResourceBinding ? &*m_framingOperator.persistentResourceBinding : nullptr, - gsl::make_span(initializationInputBindings) - )); + } auto execBindingProps = m_framingOperator.op->GetBindingProperties(); @@ -398,11 +405,6 @@ class DmlSTFTOperator : public WRL::Base std::array inputBindings; uint32_t inputBindingsCount = 1; - // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking - // barrierCount is outside the valid range. 
- D3D12_RESOURCE_BARRIER barriers[3]; - uint32_t barrierCount = 0; - Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal); inputBuffers[0] = signalBufferRegion.GetBufferBinding(); inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] }; @@ -435,31 +437,22 @@ class DmlSTFTOperator : public WRL::Base auto persistentBufferSize = bindingProps.PersistentResourceSize; if (persistentBufferSize > 0) { - DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &*m_framingOperator.persistentResourceBinding }; + assert(m_framingOperator.persistentBufferRegion.has_value()); + auto persistentResourceBinding = m_framingOperator.persistentBufferRegion->GetBufferBinding(); + DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &persistentResourceBinding }; m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc); } - // Transition resources COMMON -> UAV - D3D12_RESOURCE_BARRIER uav_barriers[4]; - uav_barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(signalBufferRegion.ResourceInUavState()); - uav_barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(windowBufferRegion.ResourceInUavState()); - uav_barriers[2] = CD3DX12_RESOURCE_BARRIER::UAV(outputBufferRegion.ResourceInUavState()); - uav_barriers[3] = CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr); - commandList->ResourceBarrier(barrierCount, barriers); - m_framingOperator.commandRecorder->RecordDispatch( commandList, m_framingOperator.op.Get(), m_framingOperator.bindingTable.Get() ); - // Transition resources UAV -> COMMON - for (uint32_t barrierIndex = 0; barrierIndex < barrierCount; barrierIndex++) - { - std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter); - } - - commandList->ResourceBarrier(barrierCount, barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + 
commandList->ResourceBarrier(2, barriers); } }; From c20690fee00160e5a8a39374605e8574cbf983a9 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 6 Jul 2023 09:47:49 -0700 Subject: [PATCH 33/76] Grid sample --- .../src/Operators/DmlGridSample.h | 125 ++++-------------- 1 file changed, 29 insertions(+), 96 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h index c63863853fb4e..8863bd5362d27 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h @@ -4,6 +4,7 @@ #include "../MLOperatorAuthorImpl.h" #include "../External/D3DX12/d3dx12.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h" #include // NOTE: When this operator's implementation is moved into DML, the associated FP16 fallback @@ -329,15 +330,6 @@ class DmlGridSampleOperator : public WRL::Base ComPtr m_gridSamplePipelineState; DmlGridSampleParameters m_params = {}; - - // Allocate temporary buffers if needed - struct ResourceDesc - { - ComPtr Resource; - std::array Sizes; - std::array Strides; - }; - struct GridSampleShaderConstants { uint32_t StartIndex; @@ -623,29 +615,18 @@ class DmlGridSampleOperator : public WRL::Base auto gridDims = GetTensorDimensions(gridTensor.Get()); auto outputDims = GetTensorDimensions(outputTensor.Get()); - ComPtr inputUnknown; - ComPtr inputResource; - inputTensor->GetDataInterface(inputUnknown.GetAddressOf()); - ORT_THROW_IF_FAILED(inputUnknown.As(&inputResource)); - - ComPtr gridUnknown; - ComPtr gridResource; - gridTensor->GetDataInterface(gridUnknown.GetAddressOf()); - ORT_THROW_IF_FAILED(gridUnknown.As(&gridResource)); - - ComPtr outputUnknown; - ComPtr outputResource; - outputTensor->GetDataInterface(outputUnknown.GetAddressOf()); - 
ORT_THROW_IF_FAILED(outputUnknown.As(&outputResource)); + auto inputTensorWrapper = static_cast(inputTensor.Get()); + auto gridTensorWrapper = static_cast(gridTensor.Get()); + auto outputTensorWrapper = static_cast(outputTensor.Get()); return Compute( commandList.Get(), context, - inputResource.Get(), + inputTensorWrapper->GetBufferRegion(), inputDims, - gridResource.Get(), + gridTensorWrapper->GetBufferRegion(), gridDims, - outputResource.Get(), + outputTensorWrapper->GetBufferRegion(), outputDims ); } @@ -660,21 +641,21 @@ class DmlGridSampleOperator : public WRL::Base HRESULT Compute( ID3D12GraphicsCommandList* commandList, IMLOperatorKernelContext* context, - ID3D12Resource* inputResource, + const Dml::D3D12BufferRegion& inputBufferRegion, gsl::span inputDims, - ID3D12Resource* gridResource, + const Dml::D3D12BufferRegion& gridBufferRegion, gsl::span gridDims, - ID3D12Resource* outputResource, + const Dml::D3D12BufferRegion& outputBufferRegion, gsl::span outputDims) { try { GridSample( - inputResource, + inputBufferRegion, inputDims, - gridResource, + gridBufferRegion, gridDims, - outputResource, + outputBufferRegion, outputDims, commandList); } @@ -687,11 +668,11 @@ class DmlGridSampleOperator : public WRL::Base } void GridSample( - ID3D12Resource* inputResource, + const Dml::D3D12BufferRegion& inputBufferRegion, gsl::span inputDims, - ID3D12Resource* gridResource, + const Dml::D3D12BufferRegion& gridBufferRegion, gsl::span gridDims, - ID3D12Resource* outputResource, + const Dml::D3D12BufferRegion& outputBufferRegion, gsl::span outputDims, ID3D12GraphicsCommandList* commandList) { @@ -702,33 +683,6 @@ class DmlGridSampleOperator : public WRL::Base Dml::GetDescendingPackedStrides(gridDims, gridStrides); Dml::GetDescendingPackedStrides(outputDims, outputStrides); - // Transition resources from common to UAV state - D3D12_RESOURCE_BARRIER barriers[3]; - - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputResource, - D3D12_RESOURCE_STATE_COMMON, - 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - gridResource, - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( - outputResource, - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - inputResource->SetName(L"InputResource"); - outputResource->SetName(L"OutputResource"); - gridResource->SetName(L"GridResource"); - - commandList->ResourceBarrier(3, barriers); - // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get()); commandList->SetPipelineState(m_gridSamplePipelineState.Get()); @@ -747,29 +701,13 @@ class DmlGridSampleOperator : public WRL::Base std::copy(outputStrides.begin(), outputStrides.end(), constants.OutputStrides); constants.ElementCount = ComputeElementCountFromDimensions(constants.OutputSizes); - std::array uav_resources = { inputResource, gridResource, outputResource }; - Dispatch(uav_resources, constants, commandList); - - // Transition resources to common state - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputResource, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - gridResource, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); + std::array uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion }; + Dispatch(uavBufferRegions, constants, commandList); - barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( - outputResource, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - commandList->ResourceBarrier(3, barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ 
-782,25 +720,17 @@ class DmlGridSampleOperator : public WRL::Base template void Dispatch( - std::array& resources, + std::array& bufferRegions, TConstants& constants, ID3D12GraphicsCommandList* commandList) { - D3D12_RESOURCE_BARRIER uav_barriers[TSize]; - - std::transform( - resources.begin(), resources.end(), - uav_barriers, - [](auto& resource) { return CD3DX12_RESOURCE_BARRIER::UAV(resource); } ); - commandList->ResourceBarrier(TSize, uav_barriers); - for (uint32_t i = 0; i < TSize; i++) { // Set resource views - if (resources[i]) { + if (bufferRegions[i]) { commandList->SetComputeRootUnorderedAccessView( i, // root parameter index - resources[i]->GetGPUVirtualAddress() + bufferRegions[i].ResourceInUavState()->GetGPUVirtualAddress() + bufferRegions[i].Offset() ); } else @@ -842,7 +772,10 @@ class DmlGridSampleOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - commandList->ResourceBarrier(2, uav_barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } }; From 0bb51245fcb75467e151daf30dc4c5193e9707f8 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 6 Jul 2023 18:31:34 -0700 Subject: [PATCH 34/76] Fix WinML API --- onnxruntime/core/providers/dml/dml_provider_factory.cc | 1 + winml/lib/Api.Image/VideoFrameToTensorConverter.cpp | 2 +- winml/lib/Api/LearningModelBinding.cpp | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index 2545c85d5fb14..91279be185ba9 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -255,6 +255,7 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceRegionFromAllocation, } else { ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != 
OrtDevice::MemType::DEFAULT); auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation), size_in_bytes); + *offset = bufferRegion.Offset(); *d3d_resource = bufferRegion.ResourceInUavState(); } diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp index 00aff0b740dd4..c223f44f1282b 100644 --- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp +++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp @@ -527,7 +527,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST); command_list_->ResourceBarrier(1, &barrier); - command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx, upload_heap_.Get(), outputResourceOffset, bufferSize); + command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx + outputResourceOffset, upload_heap_.Get(), 0, bufferSize); WINML_THROW_IF_FAILED(command_list_->Close()); ID3D12CommandList* ppCommandLists[] = {command_list_.Get()}; diff --git a/winml/lib/Api/LearningModelBinding.cpp b/winml/lib/Api/LearningModelBinding.cpp index 0c79117a345dd..dda127298530b 100644 --- a/winml/lib/Api/LearningModelBinding.cpp +++ b/winml/lib/Api/LearningModelBinding.cpp @@ -138,7 +138,7 @@ std::tuple, _winml::BindingType> Lea // Hold onto the input output providers so that our memory doesnt get destroyed! auto providerInfo = ProviderInfo{inspectable, spLotusValueProvider, context}; CacheProvider(name, providerInfo); - + return std::make_tuple(name, value, bindingType); } @@ -480,7 +480,7 @@ STDMETHODIMP LearningModelBinding::Bind( auto session = m_session.as(); auto device = m_session.Device().as(); CWinMLAutoLock lock(!device->IsCpuDevice() ? 
session->GetDMLEPLock() : nullptr); - + _winmlt::TelemetryEvent binding_event(_winmlt::EventCategory::kBinding); _winml::BindingType binding_type; std::string binding_name; @@ -613,4 +613,4 @@ void LearningModelBinding::BindUnboundOutputs() { } } -} // namespace WINMLP \ No newline at end of file +} // namespace WINMLP From 6bc50491a28d7838a90c06355a7ca0235923f4e8 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Fri, 7 Jul 2023 09:04:29 -0700 Subject: [PATCH 35/76] Fix ImageTests.SynchronizeGPUWorkloads test failure --- .../dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp | 3 +++ winml/lib/Api.Image/VideoFrameToTensorConverter.cpp | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index e2ae8f23a3744..9b39fe9758876 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -395,6 +395,9 @@ namespace Dml size_in_bytes = (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; + // Make sure the region we're trying to create fits entirely in the resource + assert(it->second->GetUavResource()->GetDesc().Width >= taggedPointer.offset + size_in_bytes); + return D3D12BufferRegion( taggedPointer.offset, size_in_bytes, diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp index c223f44f1282b..5536f7df203b7 100644 --- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp +++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp @@ -588,7 +588,7 @@ D3D12_UNORDERED_ACCESS_VIEW_DESC VideoFrameToTensorConverter::CreateUAVDescripti D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {}; uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER; UINT singleImageSize = static_cast(desc.sizes[1] * 
desc.sizes[2] * desc.sizes[3]); - uavDesc.Buffer.FirstElement = offset + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3]; + uavDesc.Buffer.FirstElement = offset / uiTensorElementSize + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3]; uavDesc.Buffer.NumElements = singleImageSize; uavDesc.Buffer.CounterOffsetInBytes = 0; uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE; From 14d1c9658095898cc6cace897915b71601c79d81 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Fri, 7 Jul 2023 19:21:07 -0700 Subject: [PATCH 36/76] Fix ConcurrencyTests.MultiThreadSingleSessionGpu --- .../src/BucketizedBufferAllocator.cpp | 3 +-- .../DmlExecutionProvider/src/BucketizedBufferAllocator.h | 2 +- .../dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp | 2 +- .../dml/DmlExecutionProvider/src/DmlAllocationInfo.h | 8 -------- .../dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp | 2 +- .../dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp | 6 ++++++ .../dml/DmlExecutionProvider/src/DmlTaggedPointer.h | 1 + .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp | 6 ++---- 8 files changed, 13 insertions(+), 17 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 9b39fe9758876..5c488e4376733 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -289,7 +289,6 @@ namespace Dml ComPtr allocInfo = wil::MakeOrThrow( this, ++m_currentAllocationId, - ++m_currentResourceId, resourceWrapper.Get(), size_in_bytes ); @@ -327,7 +326,7 @@ namespace Dml allocations_by_id_.erase(it); } - void BucketizedBufferAllocator::FreeResource(AllocationInfo* allocInfo, uint64_t pooledResourceId) + void BucketizedBufferAllocator::FreeResource(AllocationInfo* allocInfo) { // Since this allocator is warapped 
by ORT's BFC allocator, it's possible that the context is already // close at this point if the application is winding down. diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 5af97b1cb53c7..73e7a0a317984 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -66,7 +66,7 @@ namespace Dml AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer); - void FreeResource(AllocationInfo* allocInfo, uint64_t pooledResourceId); + void FreeResource(AllocationInfo* allocInfo); uint64_t ComputeRequiredSize(size_t size); bool TilingEnabled() const { return tiling_enabled_; }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp index 044e9e854d700..a9560c0bd3c9a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp @@ -12,7 +12,7 @@ namespace Dml { if (m_owner) { - m_owner->FreeResource(this, m_pooledResourceId); + m_owner->FreeResource(this); } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h index 977de7c4887e2..492f87c77f1d0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h @@ -16,12 +16,10 @@ namespace Dml AllocationInfo( BucketizedBufferAllocator* owner, size_t id, - uint64_t pooledResourceId, DmlResourceWrapper* resourceWrapper, size_t requestedSize) : m_owner(owner) , m_allocationId(id) - , 
m_pooledResourceId(pooledResourceId) , m_resourceWrapper(resourceWrapper) , m_requestedSize(requestedSize) {} @@ -63,15 +61,9 @@ namespace Dml return m_allocationId; } - uint64_t GetPooledResourceId() const - { - return m_pooledResourceId; - } - private: BucketizedBufferAllocator* m_owner; size_t m_allocationId; // For debugging purposes - uint64_t m_pooledResourceId = 0; Microsoft::WRL::ComPtr m_resourceWrapper; // The size requested during Alloc(), which may be smaller than the physical resource size diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index e7d5b9d13fd78..cd6b241e70d48 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -102,7 +102,7 @@ namespace DmlGraphFusionHelper { // The allocation is not pooled auto allocInfo = static_cast(opaqueData); - *allocId = allocInfo->GetPooledResourceId(); + *allocId = 0; return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp index da5ed6df2ff4c..8f503566768a1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp @@ -38,4 +38,10 @@ namespace Dml return reinterpret_cast(ptr); } + +uint64_t TaggedPointer::GetUniqueId() const +{ + return reinterpret_cast(TaggedPointer::Pack(device_id, allocation_id, offset)); +} + } // namespace tfdml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h index 
96b0eb318ad48..ee58e23a6396f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h @@ -29,6 +29,7 @@ struct TaggedPointer uint32_t allocation_id, uint64_t offset); static TaggedPointer Unpack(const void* ptr); + uint64_t GetUniqueId() const; }; static_assert( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 33c2a99d19e79..831266cdd5bff 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -1040,7 +1040,7 @@ namespace Dml uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(const TaggedPointer& taggedPointer, bool isInternalOperator) { assert(!isInternalOperator); - return m_gpuAllocator->GetAllocationInfo(taggedPointer)->GetPooledResourceId(); + return taggedPointer.GetUniqueId(); } void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState( @@ -1193,12 +1193,10 @@ namespace Dml void* CreateGPUAllocationFromD3DResource(ID3D12Resource* pResource) { - uint64_t pooledResourceId = 0; // Not a pooled resource - ComPtr resourceWrapper; wil::MakeOrThrow(pResource).As(&resourceWrapper); - ComPtr allocInfo = wil::MakeOrThrow(nullptr, 0, pooledResourceId, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width); + ComPtr allocInfo = wil::MakeOrThrow(nullptr, 0, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width); return allocInfo.Detach(); } void FreeGPUAllocation(void* ptr) From a2809af25df6f81618a112af20597056e3d31205 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 11 Jul 2023 06:51:00 -0700 Subject: [PATCH 37/76] Add print statements for CopyBufferRegion --- .../dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index af625334b7720..ab4d4f29abed9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -173,6 +173,7 @@ void DmlCommandRecorder::CopyBufferRegion( uint64_t srcOffset, uint64_t byteCount) { + printf("*****************DmlCommandRecorder::CopyBufferRegion\n"); m_currentCommandList->CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount); m_operationsRecordedInCurrentCommandList = true; } From 2f8bff801822ff0d21263f29d691f79099ecfadd Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 11 Jul 2023 06:52:48 -0700 Subject: [PATCH 38/76] Add print statements for CopyBufferRegion --- .../dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp | 1 - .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index ab4d4f29abed9..af625334b7720 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -173,7 +173,6 @@ void DmlCommandRecorder::CopyBufferRegion( uint64_t srcOffset, uint64_t byteCount) { - printf("*****************DmlCommandRecorder::CopyBufferRegion\n"); m_currentCommandList->CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount); m_operationsRecordedInCurrentCommandList = true; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 831266cdd5bff..afe90d9a25f1a 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -476,6 +476,8 @@ namespace Dml } else if (!src->IsCpuData() && !dst->IsCpuData()) { + printf("*****************DmlCommandRecorder::CopyBufferRegion\n"); + // // GPU -> GPU copy // From c024d0ad0e37d819bc248e40b6b50762ae73b5ff Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 11 Jul 2023 10:38:13 -0700 Subject: [PATCH 39/76] Use Identity for the copy operator --- .../src/Operators/DmlOperatorCopy.cpp | 28 ++++--------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp index 4ca51633d23e7..f8ef496b74d9b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp @@ -29,30 +29,14 @@ class DmlOperatorCopy : public DmlOperator ComPtr contextPrivate; ORT_THROW_IF_FAILED(kernelInfo.GetInterface()->QueryInterface(contextPrivate.GetAddressOf())); - if (contextPrivate->IsDmlGraphNode()) - { - std::vector inputDescs = GetDmlInputDescs(); - std::vector outputDescs = GetDmlOutputDescs(); + std::vector inputDescs = GetDmlInputDescs(); + std::vector outputDescs = GetDmlOutputDescs(); - DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {}; - opDesc.InputTensor = inputDescs.data(); - opDesc.OutputTensor = outputDescs.data(); + DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {}; + opDesc.InputTensor = inputDescs.data(); + opDesc.OutputTensor = outputDescs.data(); - SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo); - } - } - - void Compute(const MLOperatorKernelContext& kernelContext) - { - MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0); - - // 
Reshape the output tensor. - MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0); - - // Copy elements from input tensor to output tensor. - ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor( - outputTensor.GetInterface().Get(), - inputTensor.GetInterface().Get())); + SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo); } }; From fef7df27290799bf33a480c054123e3ff25cdd82 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 11 Jul 2023 14:37:45 -0700 Subject: [PATCH 40/76] Add intermediate buffer for copying --- .../core/providers/dml/dml_provider_factory.h | 18 ++--- .../DmlExecutionProvider/src/CommandQueue.cpp | 14 ++-- .../src/DmlAllocationInfo.cpp | 2 +- .../src/DmlAllocationInfo.h | 8 +- .../src/DmlBfcAllocator.h | 6 +- .../src/DmlCommandRecorder.h | 2 +- .../src/DmlExternalGpuAllocator.h | 2 +- .../src/DmlGpuAllocator.cpp | 4 +- .../src/DmlGpuAllocator.h | 6 +- ...pp => DmlReservedResourceSubAllocator.cpp} | 36 ++++----- ...or.h => DmlReservedResourceSubAllocator.h} | 18 ++--- .../src/ExecutionContext.cpp | 73 +++++++++++++++++++ .../src/ExecutionProvider.cpp | 4 +- .../src/ExecutionProvider.h | 4 +- 14 files changed, 135 insertions(+), 62 deletions(-) rename onnxruntime/core/providers/dml/DmlExecutionProvider/src/{BucketizedBufferAllocator.cpp => DmlReservedResourceSubAllocator.cpp} (90%) rename onnxruntime/core/providers/dml/DmlExecutionProvider/src/{BucketizedBufferAllocator.h => DmlReservedResourceSubAllocator.h} (92%) diff --git a/include/onnxruntime/core/providers/dml/dml_provider_factory.h b/include/onnxruntime/core/providers/dml/dml_provider_factory.h index cbb6c6d2c9198..2ec3a10b08aed 100644 --- a/include/onnxruntime/core/providers/dml/dml_provider_factory.h +++ b/include/onnxruntime/core/providers/dml/dml_provider_factory.h @@ -101,16 +101,16 @@ struct OrtDmlApi { ORT_API2_STATUS(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* provider, _In_ void* dml_resource, _Out_ ID3D12Resource** d3d_resource); 
/** - * GetD3D12ResourceRegionFromAllocation - * This API gets the region of a D3D12 resource at a given offset when an OrtValue has been allocated by the DML EP. - * Note: Only the subregion of the resource delimited by `offset` and `offset + size_in_bytes` should be accessed - */ + * GetD3D12ResourceRegionFromAllocation + * This API gets the region of a D3D12 resource at a given offset when an OrtValue has been allocated by the DML EP. + * Note: Only the subregion of the resource delimited by `offset` and `offset + size_in_bytes` should be accessed + */ ORT_API2_STATUS(GetD3D12ResourceRegionFromAllocation, - _In_ OrtAllocator* provider, - _In_ void* dml_resource, - _In_ uint64_t size_in_bytes, - _Out_ ID3D12Resource** d3d_resource, - _Out_ uint64_t* offset); + _In_ OrtAllocator* provider, + _In_ void* dml_resource, + _In_ uint64_t size_in_bytes, + _Out_ ID3D12Resource** d3d_resource, + _Out_ uint64_t* offset); }; #ifdef __cplusplus diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp index 5516fc62cdda0..e5084772d4063 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp @@ -46,12 +46,12 @@ namespace Dml return GpuEvent{ m_lastFenceValue + 1, m_fence }; } - void CommandQueue::QueueReference(IUnknown* object, bool waitForUnsubmittedWork) + void CommandQueue::QueueReference(IUnknown* object, bool waitForUnsubmittedWork) { - // If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK - // to queue additional references at this time, since those references would be leaked. 
This - // affects any objects in m_queuedReferences whose destructors indirectly call QueueReference; - // for example, an allocation from BucketizedBufferAllocator attempts to queue a reference + // If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK + // to queue additional references at this time, since those references would be leaked. This + // affects any objects in m_queuedReferences whose destructors indirectly call QueueReference; + // for example, an allocation from DmlReservedResourceSubAllocator attempts to queue a reference // to its underlying D3D resource when freed. Furthermore, these references are unnecessary // since Close() already blocks for scheduled GPU work before clearing m_queuedReferences. if (!m_closing) @@ -68,7 +68,7 @@ namespace Dml m_queuedReferences.push_back(queuedReference); } } - + void CommandQueue::Close() { // Wait for flushed work: @@ -79,7 +79,7 @@ namespace Dml m_queuedReferences.clear(); m_closing = false; } - + void CommandQueue::ReleaseCompletedReferences() { uint64_t completedValue = GetFence()->GetCompletedValue(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp index a9560c0bd3c9a..52944d2c8b96a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp @@ -3,7 +3,7 @@ #include "precomp.h" #include "DmlAllocationInfo.h" -#include "BucketizedBufferAllocator.h" +#include "DmlReservedResourceSubAllocator.h" namespace Dml { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h index 492f87c77f1d0..546a42342a2a0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h @@ -7,14 +7,14 @@ namespace Dml { - class BucketizedBufferAllocator; + class DmlReservedResourceSubAllocator; class AllocationInfo : public Microsoft::WRL::RuntimeClass< Microsoft::WRL::RuntimeClassFlags, IUnknown> { public: AllocationInfo( - BucketizedBufferAllocator* owner, + DmlReservedResourceSubAllocator* owner, size_t id, DmlResourceWrapper* resourceWrapper, size_t requestedSize) @@ -26,7 +26,7 @@ namespace Dml ~AllocationInfo(); - BucketizedBufferAllocator* GetOwner() const + DmlReservedResourceSubAllocator* GetOwner() const { return m_owner; } @@ -62,7 +62,7 @@ namespace Dml } private: - BucketizedBufferAllocator* m_owner; + DmlReservedResourceSubAllocator* m_owner; size_t m_allocationId; // For debugging purposes Microsoft::WRL::ComPtr m_resourceWrapper; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h index 17ba37146bdc5..43e093538fcb6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h @@ -4,14 +4,14 @@ #pragma once #include "core/framework/allocator.h" -#include "BucketizedBufferAllocator.h" +#include "DmlReservedResourceSubAllocator.h" namespace Dml { class DmlBfcAllocator : public onnxruntime::IAllocator { public: - DmlBfcAllocator(std::shared_ptr subAllocator) + DmlBfcAllocator(std::shared_ptr subAllocator) : onnxruntime::IAllocator( OrtMemoryInfo( onnxruntime::DML, @@ -24,6 +24,6 @@ namespace Dml void* Alloc(size_t size_in_bytes) final { return m_subAllocator->Alloc(size_in_bytes); } void Free(void* ptr) final { m_subAllocator->Free(ptr); } private: - std::shared_ptr m_subAllocator; + std::shared_ptr m_subAllocator; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h index e442df1f1df6c..090166f296ffd 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h @@ -11,7 +11,7 @@ namespace Dml { class CommandQueue; - class BucketizedBufferAllocator; + class DmlReservedResourceSubAllocator; class DmlCommandRecorder : public ICommandRecorder { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h index 6c5ee8cd29c6e..1c4d4b36628eb 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h @@ -7,7 +7,7 @@ namespace Dml { - class BucketizedBufferAllocator; + class DmlReservedResourceSubAllocator; class AllocationInfo; struct TaggedPointer; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp index 5bee9ee34ec4d..f2b62f2d41e64 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp @@ -6,12 +6,12 @@ #include "precomp.h" #include "DmlGpuAllocator.h" #include "core/framework/allocator.h" -#include "BucketizedBufferAllocator.h" +#include "DmlReservedResourceSubAllocator.h" #include "DmlTaggedPointer.h" namespace Dml { - DmlGpuAllocator::DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator) + DmlGpuAllocator::DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator) : onnxruntime::IAllocator( OrtMemoryInfo( onnxruntime::DML, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h index 3bc8127598460..39311055503d2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h @@ -9,14 +9,14 @@ namespace Dml { - class BucketizedBufferAllocator; + class DmlReservedResourceSubAllocator; class AllocationInfo; struct TaggedPointer; class DmlGpuAllocator : public onnxruntime::IAllocator { public: - DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator); + DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator); void* Alloc(size_t size_in_bytes) final; void Free(void* ptr) final; @@ -32,6 +32,6 @@ namespace Dml // This allocator is specific to DML and is used to decode the opaque data returned by the BFC // allocator into objects that DML understands - std::shared_ptr m_subAllocator; + std::shared_ptr m_subAllocator; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp similarity index 90% rename from onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp rename to onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index 5c488e4376733..c82e0a4f5d722 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -4,18 +4,18 @@ #include "precomp.h" #include "core/session/onnxruntime_c_api.h" -#include "BucketizedBufferAllocator.h" +#include "DmlReservedResourceSubAllocator.h" #include "DmlReservedResourceWrapper.h" #include "DmlBufferRegion.h" namespace Dml { - BucketizedBufferAllocator::~BucketizedBufferAllocator() + 
DmlReservedResourceSubAllocator::~DmlReservedResourceSubAllocator() { #ifdef PRINT_OUTSTANDING_ALLOCATIONS if (!m_outstandingAllocationsById.empty()) { - printf("BucketizedBufferAllocator outstanding allocation indices:\n"); + printf("DmlReservedResourceSubAllocator outstanding allocation indices:\n"); for (auto& entry : m_outstandingAllocationsById) { printf("%u\n", static_cast(entry.first)); @@ -25,7 +25,7 @@ namespace Dml #endif } - /*static*/ gsl::index BucketizedBufferAllocator::GetBucketIndexFromSize(uint64_t size) + /*static*/ gsl::index DmlReservedResourceSubAllocator::GetBucketIndexFromSize(uint64_t size) { assert(size != 0); @@ -40,12 +40,12 @@ namespace Dml return index; } - /*static*/ uint64_t BucketizedBufferAllocator::GetBucketSizeFromIndex(gsl::index index) + /*static*/ uint64_t DmlReservedResourceSubAllocator::GetBucketSizeFromIndex(gsl::index index) { return (1ull << (index + c_minResourceSizeExponent)); } - void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) + void DmlReservedResourceSubAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { m_defaultRoundingMode = roundingMode; } @@ -66,10 +66,10 @@ namespace Dml static uint64_t GetMaxHeapSizeInTiles() { - return BucketizedBufferAllocator::kDefaultMaxHeapSizeInTiles; + return DmlReservedResourceSubAllocator::kDefaultMaxHeapSizeInTiles; } - BucketizedBufferAllocator::BucketizedBufferAllocator( + DmlReservedResourceSubAllocator::DmlReservedResourceSubAllocator( ID3D12Device* device, std::shared_ptr context, ID3D12CommandQueue* queue, @@ -89,7 +89,7 @@ namespace Dml { } - absl::optional BucketizedBufferAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes) + absl::optional DmlReservedResourceSubAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes) { DmlHeapAllocation allocation = {}; @@ -207,7 +207,7 @@ namespace Dml return allocation; } - absl::optional BucketizedBufferAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes) + 
absl::optional DmlReservedResourceSubAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes) { DmlHeapAllocation allocation = {}; @@ -256,7 +256,7 @@ namespace Dml return allocation; } - uint64_t BucketizedBufferAllocator::ComputeRequiredSize(size_t size) + uint64_t DmlReservedResourceSubAllocator::ComputeRequiredSize(size_t size) { const uint64_t resource_size_in_tiles = 1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; @@ -266,7 +266,7 @@ namespace Dml return resource_size_in_bytes; } - void* BucketizedBufferAllocator::Alloc(size_t size_in_bytes) + void* DmlReservedResourceSubAllocator::Alloc(size_t size_in_bytes) { // For some reason lotus likes requesting 0 bytes of memory size_in_bytes = std::max(1, size_in_bytes); @@ -307,7 +307,7 @@ namespace Dml return TaggedPointer::Pack(device_id, *allocationId, offset); } - void BucketizedBufferAllocator::Free(void* ptr) + void DmlReservedResourceSubAllocator::Free(void* ptr) { ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr); @@ -326,7 +326,7 @@ namespace Dml allocations_by_id_.erase(it); } - void BucketizedBufferAllocator::FreeResource(AllocationInfo* allocInfo) + void DmlReservedResourceSubAllocator::FreeResource(AllocationInfo* allocInfo) { // Since this allocator is warapped by ORT's BFC allocator, it's possible that the context is already // close at this point if the application is winding down. 
@@ -344,7 +344,7 @@ namespace Dml } } - absl::optional BucketizedBufferAllocator::TryReserveAllocationID() + absl::optional DmlReservedResourceSubAllocator::TryReserveAllocationID() { // The mutex must already be held assert(!mutex_.try_lock()); @@ -369,7 +369,7 @@ namespace Dml return current_allocation_id_; } - void BucketizedBufferAllocator::ReleaseAllocationID(uint32_t id) + void DmlReservedResourceSubAllocator::ReleaseAllocationID(uint32_t id) { // The mutex must already be held assert(!mutex_.try_lock()); @@ -378,7 +378,7 @@ namespace Dml free_allocation_ids_.push_back(id); } - D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion( + D3D12BufferRegion DmlReservedResourceSubAllocator::CreateBufferRegion( const TaggedPointer& taggedPointer, uint64_t size_in_bytes) { @@ -405,7 +405,7 @@ namespace Dml it->second->GetCopyDstResource()); } - AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) + AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) { // We need to access (mutable) state after this point, so we need to lock std::unique_lock lock(mutex_); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h similarity index 92% rename from onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h rename to onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index 73e7a0a317984..1d7c8704ab7da 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -9,8 +9,8 @@ namespace Dml { - class BucketizedBufferAllocator; - class BucketizedBufferAllocator; + class DmlReservedResourceSubAllocator; + class DmlReservedResourceSubAllocator; struct 
TaggedPointer; // An allocator that makes logically contiguous allocations backed by D3D heaps. @@ -36,7 +36,7 @@ namespace Dml // this case it is better make more but smaller allocations (resulting in // smaller heaps); this fallback path is only retained as a last resort for // older hardware. - class BucketizedBufferAllocator + class DmlReservedResourceSubAllocator { public: // Maximum size of a heap (in tiles) when allocations are tiled. Each tile @@ -44,7 +44,7 @@ namespace Dml // local video memory fragmentation without requiring lots of heaps. static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512; - BucketizedBufferAllocator( + DmlReservedResourceSubAllocator( ID3D12Device* device, std::shared_ptr context, ID3D12CommandQueue* queue, @@ -70,14 +70,14 @@ namespace Dml uint64_t ComputeRequiredSize(size_t size); bool TilingEnabled() const { return tiling_enabled_; }; - ~BucketizedBufferAllocator(); + ~DmlReservedResourceSubAllocator(); - // Constructs a BucketizedBufferAllocator which allocates D3D12 committed resources with the specified heap properties, + // Constructs a DmlReservedResourceSubAllocator which allocates D3D12 committed resources with the specified heap properties, // resource flags, and initial resource state. 
- BucketizedBufferAllocator( + DmlReservedResourceSubAllocator( ID3D12Device* device, std::shared_ptr context, - std::unique_ptr&& subAllocator); + std::unique_ptr&& subAllocator); void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode); void* Alloc(size_t size); @@ -109,7 +109,7 @@ namespace Dml size_t m_currentAllocationId = 0; uint64_t m_currentResourceId = 0; AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled; - std::unique_ptr m_subAllocator; + std::unique_ptr m_subAllocator; #if _DEBUG // Useful for debugging; keeps track of all allocations that haven't been freed yet diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index 6a30e6cd1ad56..d6a46e354c769 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -37,6 +37,78 @@ namespace Dml SetCommandRecorder(&m_dmlRecorder); + // This type of copy is not common and is only used in rare circumstances. Because a resource + // cannot be both in a source and destination state at the same time (without aliasing), we copy + // the source resource to an intermediate one, and then copy the intermediate resource to the + // destination resource. 
+ // TODO (pavignol): Only do the intermediate copy when both resources at the same + + D3D12_HEAP_PROPERTIES heapProperties = { + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + + D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER, + 0, + byteCount, + 1, + 1, + 1, + DXGI_FORMAT_UNKNOWN, + {1, 0}, + D3D12_TEXTURE_LAYOUT_ROW_MAJOR, + D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS}; + + ComPtr intermediateBuffer; + ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommittedResource( + &heapProperties, + D3D12_HEAP_FLAG_NONE, + &resourceDesc, + D3D12_RESOURCE_STATE_COPY_DEST, + nullptr, + IID_GRAPHICS_PPV_ARGS(intermediateBuffer.GetAddressOf()))); + + std::vector barriers; + + if (!(srcState & D3D12_RESOURCE_STATE_COPY_SOURCE)) + { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(srcBuffer, srcState, D3D12_RESOURCE_STATE_COPY_SOURCE)); + m_dmlRecorder.ResourceBarrier(barriers); + } + + m_dmlRecorder.CopyBufferRegion(intermediateBuffer.Get(), 0, srcBuffer, srcOffset, byteCount); + + // Reset src barrier state + for (auto& barrier : barriers) + { + std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter); + } + + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(intermediateBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE)); + + if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST)) + { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, dstState, D3D12_RESOURCE_STATE_COPY_DEST)); + } + + m_dmlRecorder.ResourceBarrier(barriers); + m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, intermediateBuffer.Get(), 0, byteCount); + + barriers.clear(); + + // Reset dst barrier state + if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST)) + { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState)); + } + + // Since this copy may write to GPU memory, we also need to perform an aliasing barrier + 
barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); + m_dmlRecorder.ResourceBarrier(barriers); + + // Keep the intermediate buffer alive until we're done with it + QueueReference(intermediateBuffer.Get()); + + +/* std::vector barriers; if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST)) @@ -65,6 +137,7 @@ namespace Dml // aliasing barrier barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); +*/ } void ExecutionContext::FillBufferWithPattern( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index afe90d9a25f1a..f72035f5e5fda 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -8,7 +8,7 @@ #include "PooledUploadHeap.h" #include "ReadbackHeap.h" #include "ExecutionContext.h" -#include "BucketizedBufferAllocator.h" +#include "DmlReservedResourceSubAllocator.h" #include "DmlCpuAllocator.h" #include "MLOperatorAuthorImpl.h" #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h" @@ -190,7 +190,7 @@ namespace Dml std::vector ExecutionProviderImpl::CreatePreferredAllocators() { if (!m_gpuAllocator) { - auto subAllocator = std::make_shared( + auto subAllocator = std::make_shared( m_d3d12Device.Get(), m_context, // TODO(leca): REVIEW: Will it cause memory issue when m_context is released in EP while alloc is released in sessionState? 
m_queue.Get(), diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index a959042dab32c..ad208ea830ae5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -24,7 +24,7 @@ namespace Dml class PooledUploadHeap; class ReadbackHeap; class ExecutionContext; - class BucketizedBufferAllocator; + class DmlReservedResourceSubAllocator; class DmlCpuAllocator; class ExecutionProvider; class DmlGpuAllocator; @@ -181,7 +181,7 @@ namespace Dml std::shared_ptr m_context; std::unique_ptr m_uploadHeap; std::unique_ptr m_readbackHeap; - std::shared_ptr m_subAllocator; + std::shared_ptr m_subAllocator; std::shared_ptr m_bfcAllocator; std::shared_ptr m_gpuAllocator; std::shared_ptr m_externalGpuAllocator; From e0569c593e11ff577c27f8b1d620ac0b5ab919b3 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 11 Jul 2023 17:58:29 -0700 Subject: [PATCH 41/76] Remove aliasing --- .../src/DmlAllocationInfo.h | 10 --- .../DmlExecutionProvider/src/DmlBuffer.cpp | 10 --- .../dml/DmlExecutionProvider/src/DmlBuffer.h | 3 +- .../src/DmlBufferRegion.cpp | 76 +++-------------- .../src/DmlBufferRegion.h | 34 ++------ .../src/DmlCommandRecorder.cpp | 15 +--- .../src/DmlCommittedResourceWrapper.h | 6 -- .../src/DmlGraphFusionHelper.cpp | 2 +- .../src/DmlReservedResourceSubAllocator.cpp | 4 +- .../src/DmlReservedResourceSubAllocator.h | 4 +- .../src/DmlReservedResourceWrapper.h | 6 -- .../src/DmlResourceWrapper.h | 6 +- .../src/ExecutionContext.cpp | 3 - .../src/ExecutionProvider.cpp | 78 +++++------------- .../src/FusedGraphKernel.cpp | 4 +- .../src/MLOperatorAuthorImpl.cpp | 12 +-- .../src/MLOperatorAuthorImpl.h | 4 +- .../src/Operators/DmlDFT.h | 81 ++++++++++++++++++- .../src/Operators/DmlGridSample.h | 60 ++++++++++++-- .../src/Operators/DmlSTFT.h | 22 ++++- 
.../DmlExecutionProvider/src/ReadbackHeap.cpp | 4 +- .../DmlExecutionProvider/src/ReadbackHeap.h | 2 +- 22 files changed, 201 insertions(+), 245 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h index 546a42342a2a0..7c11358bb106d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h @@ -36,16 +36,6 @@ namespace Dml return m_resourceWrapper->GetUavResource(); } - ID3D12Resource* GetCopySrcResource() const - { - return m_resourceWrapper->GetCopySrcResource(); - } - - ID3D12Resource* GetCopyDstResource() const - { - return m_resourceWrapper->GetCopyDstResource(); - } - ComPtr DetachResourceWrapper() const { return std::move(m_resourceWrapper); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp index c5fa576d24a0f..464ce26c16f54 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp @@ -48,16 +48,6 @@ ID3D12Resource* DmlBuffer::ResourceInUavState() const return buffer_region_.ResourceInUavState(); } -ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const -{ - return buffer_region_.ResourceInCopySrcState(); -} - -ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const -{ - return buffer_region_.ResourceInCopyDstState(); -} - uint64_t DmlBuffer::Offset() const { return buffer_region_ ? 
buffer_region_.Offset() : 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h index b98ae727e1a65..4b0dd58ce4467 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h @@ -27,9 +27,8 @@ class DmlBuffer DmlBuffer(DmlBuffer&&); DmlBuffer& operator=(DmlBuffer&&); + // TODO (pavignol): Rename to Resource() ID3D12Resource* ResourceInUavState() const; - ID3D12Resource* ResourceInCopySrcState() const; - ID3D12Resource* ResourceInCopyDstState() const; uint64_t Offset() const; uint64_t SizeInBytes() const; const D3D12BufferRegion& Region() const { return buffer_region_; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp index 3240042b5b6a6..c33cc5491c7f0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp @@ -7,114 +7,64 @@ namespace Dml { - D3D12BufferRegion::D3D12BufferRegion( - uint64_t offset, - uint64_t size_in_bytes, - ID3D12Resource* resource_uav_state, - ID3D12Resource* resource_copy_src_state, - ID3D12Resource* resource_copy_dst_state) - : resource_uav_state_(resource_uav_state), - resource_copy_src_state_(resource_copy_src_state), - resource_copy_dst_state_(resource_copy_dst_state), + D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource) + : m_resource(resource), offset_(offset), size_in_bytes_(size_in_bytes) { - // Get a raw pointer to the first non-null resource passed in. At least one - // resource must be provided. 
- first_valid_resource_ = resource_uav_state_; - if (!first_valid_resource_) - { - first_valid_resource_ = resource_copy_src_state_; - } - if (!first_valid_resource_) - { - first_valid_resource_ = resource_copy_dst_state_; - } - ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr); + ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr); // Regions cannot be empty. ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0); // Regions cannot extend beyond the size of the resource. - uint64_t buffer_size = first_valid_resource_->GetDesc().Width; + uint64_t buffer_size = m_resource->GetDesc().Width; ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size); ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset); // All three resources, if provided, must be identical aside from state. - assert( - first_valid_resource_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER); - assert( - !resource_uav_state || - (resource_uav_state->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_uav_state->GetDesc().Width == buffer_size)); - assert( - !resource_copy_src_state_ || - (resource_copy_src_state_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_copy_src_state_->GetDesc().Width == buffer_size)); - assert( - !resource_copy_dst_state_ || - (resource_copy_dst_state_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_copy_dst_state_->GetDesc().Width == buffer_size)); + assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER); + assert(m_resource->GetDesc().Width == buffer_size); } D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) { - std::swap(this->resource_uav_state_, that.resource_uav_state_); - std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); - std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); + std::swap(this->m_resource, that.m_resource); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, 
that.size_in_bytes_); - std::swap(this->first_valid_resource_, that.first_valid_resource_); } D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) { - std::swap(this->resource_uav_state_, that.resource_uav_state_); - std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); - std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); + std::swap(this->m_resource, that.m_resource); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, that.size_in_bytes_); - std::swap(this->first_valid_resource_, that.first_valid_resource_); return *this; } ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const { - return resource_uav_state_; - } - - ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const - { - return resource_copy_src_state_; - } - - ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const - { - return resource_copy_dst_state_; + return m_resource; } uint64_t D3D12BufferRegion::Offset() const { - return first_valid_resource_ ? offset_ : 0; + return m_resource ? offset_ : 0; } uint64_t D3D12BufferRegion::SizeInBytes() const { - return first_valid_resource_ ? size_in_bytes_ : 0; + return m_resource ? 
size_in_bytes_ : 0; } DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const { - if (!resource_uav_state_) + if (!m_resource) { return DML_BUFFER_BINDING{}; } - return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_}; + return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_}; } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h index dee01a29fe55f..6c5cb37297caa 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h @@ -19,61 +19,39 @@ namespace Dml D3D12BufferRegion( uint64_t offset, uint64_t size_in_bytes, - ID3D12Resource* resource_uav_state, - ID3D12Resource* resource_copy_src_state, - ID3D12Resource* resource_copy_dst_state); + ID3D12Resource* resource); // Move-only D3D12BufferRegion(const D3D12BufferRegion&) = default; D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default; D3D12BufferRegion(D3D12BufferRegion&&); D3D12BufferRegion& operator=(D3D12BufferRegion&&); - ID3D12Resource* ResourceInUavState() const; - // NOTE: may be any state that is valid as a copy source (COPY_SRC, - // GENERIC_READ, or COMMON). - ID3D12Resource* ResourceInCopySrcState() const; - - ID3D12Resource* ResourceInCopyDstState() const; - uint64_t Offset() const; uint64_t SizeInBytes() const; DML_BUFFER_BINDING GetBufferBinding() const; - explicit operator bool() const { return first_valid_resource_ != nullptr; } + explicit operator bool() const { return m_resource != nullptr; } // Creates a subregion at an offset from the start of this region. If no // size is provided the region runs to the end of the current region. 
- inline D3D12BufferRegion Subregion( - uint64_t offset, - uint64_t size_in_bytes = 0) const + inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const { // start of subregion must be within current region ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_); - size_in_bytes = - size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; + size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; // end of subregion must be within current region ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset); - return D3D12BufferRegion( - offset_ + offset, - size_in_bytes, - resource_uav_state_, - resource_copy_src_state_, - resource_copy_dst_state_); + return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource); } private: - ID3D12Resource* resource_uav_state_ = nullptr; - ID3D12Resource* resource_copy_src_state_ = nullptr; - ID3D12Resource* resource_copy_dst_state_ = nullptr; + ID3D12Resource* m_resource = nullptr; uint64_t offset_ = 0; uint64_t size_in_bytes_ = 0; - - // Pointer to the first resource above that isn't null. 
- ID3D12Resource* first_valid_resource_ = nullptr; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index af625334b7720..862884c22b08c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -94,10 +94,8 @@ void DmlCommandRecorder::InitializeOperator( if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) || (temporaryResourceSize > 0)) { - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); + auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); + m_currentCommandList->ResourceBarrier(1, &uav); } } @@ -156,13 +154,8 @@ void DmlCommandRecorder::ExecuteOperator( // Barrier all outputs. #pragma warning(push) #pragma warning(disable: 6387) - - // Barrier all outputs. 
- D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); - + auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); + m_currentCommandList->ResourceBarrier(1, &uav); #pragma warning(pop) } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h index f786cca837f06..4b9c167dfe671 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h @@ -12,12 +12,6 @@ namespace Dml // Committed resources use the same resource for all states and use barriers to transition between states ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); } - ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); } - ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); } - - D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } - D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } - D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } private: ComPtr m_d3d12Resource; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index cd6b241e70d48..dcf6b8607f319 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -103,7 +103,7 @@ namespace DmlGraphFusionHelper // The allocation is not pooled auto allocInfo 
= static_cast(opaqueData); *allocId = 0; - return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); + return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); } auto taggedPointer = TaggedPointer::Unpack(opaqueData); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index c82e0a4f5d722..0dc07384ea905 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -400,9 +400,7 @@ namespace Dml return D3D12BufferRegion( taggedPointer.offset, size_in_bytes, - it->second->GetUavResource(), - it->second->GetCopySrcResource(), - it->second->GetCopyDstResource()); + it->second->GetUavResource()); } AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index 1d7c8704ab7da..8049848c8671e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -60,9 +60,7 @@ namespace Dml // the ID3D12Resource is cached, so this call typically has a lower cost // than a call to ID3D12Device::CreatePlacedResource or // CreateReservedResource. 
- D3D12BufferRegion CreateBufferRegion( - const TaggedPointer& taggedPointer, - uint64_t size_in_bytes); + D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes); AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h index 68feab568ca45..22f8cbbdc394b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h @@ -18,12 +18,6 @@ namespace Dml } ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); } - ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); } - ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); } - - D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } - D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; } - D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; } private: DmlHeapAllocation m_allocation; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h index 03e9f762b7eb4..2b1a8e5c726dc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h @@ -11,12 +11,8 @@ namespace Dml DmlResourceWrapper : public IUnknown { public: + // TODO (pavignol): Rename to GetResource() virtual ID3D12Resource* GetUavResource() const = 0; - virtual ID3D12Resource* GetCopySrcResource() const = 0; 
- virtual ID3D12Resource* GetCopyDstResource() const = 0; - virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0; - virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0; - virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0; virtual ~DmlResourceWrapper(){} }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index d6a46e354c769..b5492a1a86ea3 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -100,8 +100,6 @@ namespace Dml barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState)); } - // Since this copy may write to GPU memory, we also need to perform an aliasing barrier - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); // Keep the intermediate buffer alive until we're done with it @@ -135,7 +133,6 @@ namespace Dml // Since this copy may write to GPU memory, we also need to perform an // aliasing barrier - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); */ } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index f72035f5e5fda..b06d23adf5886 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -443,15 +443,8 @@ namespace Dml // CPU -> GPU copy (upload) // auto dstBufferRegion = GetBufferForTensor(dst); - - ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? 
dstBufferRegion.ResourceInUavState() - : dstBufferRegion.ResourceInCopyDstState(); - - const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_DEST; - + ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); + const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t dstOffset = dstBufferRegion.Offset(); m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes)); FlushUploadsIfReady(); @@ -462,47 +455,26 @@ namespace Dml // GPU -> CPU copy (readback) // auto srcBufferRegion = GetBufferForTensor(src); - - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; - + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t srcOffset = srcBufferRegion.Offset(); m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState); } else if (!src->IsCpuData() && !dst->IsCpuData()) { - printf("*****************DmlCommandRecorder::CopyBufferRegion\n"); - // // GPU -> GPU copy // auto srcBufferRegion = GetBufferForTensor(src); - - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + const uint64_t srcOffset = srcBufferRegion.Offset(); auto dstBufferRegion = GetBufferForTensor(dst); - - ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? dstBufferRegion.ResourceInUavState() - : dstBufferRegion.ResourceInCopyDstState(); - - const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_DEST; - - const uint64_t srcOffset = srcBufferRegion.Offset(); + ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); + const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t dstOffset = dstBufferRegion.Offset(); + m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes); } else @@ -524,7 +496,6 @@ namespace Dml // Source and destination for batched GPU -> CPU copies std::vector srcDatas; - std::vector srcStates; std::vector srcOffsets; std::vector dstDatas; std::vector dataSizesInBytes; @@ -557,21 +528,16 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(src[i]); - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; srcDatas.push_back(srcData); - srcStates.push_back(srcState); srcOffsets.push_back(srcBufferRegion.Offset()); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); return S_OK; } @@ -941,9 +907,6 @@ namespace Dml std::vector srcDatas; srcDatas.reserve(src_dst_pairs.size()); - std::vector srcStates; - srcStates.reserve(src_dst_pairs.size()); - std::vector srcOffsets; srcOffsets.reserve(src_dst_pairs.size()); @@ -993,21 +956,16 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(&srcWrapper); - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; srcDatas.push_back(srcData); - srcStates.push_back(srcState); srcOffsets.push_back(srcBufferRegion.Offset()); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); return onnxruntime::common::Status::OK(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index 35955b113b2c1..b00b8f8e19f52 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -124,9 +124,7 @@ namespace Dml inputBufferRegions[i] = D3D12BufferRegion( 0, m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width, - m_nonOwnedGraphInputsFromInitializers[i].Get(), - nullptr, - nullptr); + m_nonOwnedGraphInputsFromInitializers[i].Get()); } else if (!m_isInputsUploadedByDmlEP[i]) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 1e7ebdc234c22..dde290f0bce0f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1617,7 +1617,7 @@ namespace Windows::AI::MachineLearning::Adapter if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL) { auto allocInfo = static_cast(m_tensorData); - return 
Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); + return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); } auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData); @@ -1716,8 +1716,6 @@ namespace Windows::AI::MachineLearning::Adapter } } - // TODO (pavignol): Fix once we go back to a single resource - /* void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp) { if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) @@ -1769,9 +1767,9 @@ namespace Windows::AI::MachineLearning::Adapter } } - for (auto& tempAlloc : m_temporaryAbiAllocations) + for (auto& tempBuffer : m_temporaryBuffers) { - resourcesToTransition.push_back(tempAlloc.Get()); + resourcesToTransition.push_back(tempBuffer.ResourceInUavState()); } m_winmlProvider->TransitionResourcesForOperator( @@ -1780,7 +1778,6 @@ namespace Windows::AI::MachineLearning::Adapter resourcesToTransition.data()); } } - */ OpKernelContextWrapper::OpKernelContextWrapper( onnxruntime::OpKernelContext* context, @@ -1828,13 +1825,10 @@ namespace Windows::AI::MachineLearning::Adapter void OpKernelContextWrapper::Close() { - // TODO (pavignol): Fix once we go back to a single resource - /* if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) { TransitionResourcesForOperatorIfRequired(false); } - */ for (auto& tensors : m_inputTensors) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index 647e0a17d26df..85b6b197fe511 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -512,9 +512,7 @@ class OpKernelContextWrapper : public WRL::Base const auto& bluesteinZChirpParams = 
dftParams.BluesteinZChirpParams; // Get resources + auto inputBufferRegion = bluesteinZChirpParams.AFFTParams.ResourceLoopList.front().BufferRegion; + auto outputBufferRegion = bluesteinZChirpParams.AFFTInverseParams.ResourceLoopList[bluesteinZChirpParams.AFFTInverseParams.OutputIndex].BufferRegion; auto zChirpBufferRegion = bluesteinZChirpParams.ZChirp.BufferRegion; auto bBufferRegion = bluesteinZChirpParams.B.BufferRegion; + // Transition resources from common to UAV state + D3D12_RESOURCE_BARRIER barriers[2]; + + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + commandList->ResourceBarrier(2, barriers); + // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_bluesteinChirpRootSignature.Get()); commandList->SetPipelineState(m_bluesteinChirpPipelineState.Get()); @@ -764,6 +783,21 @@ class GpuDFTOperator : public WRL::Base chirpLength *= (m_isInverse ? 1 : -1); float scale = isInverse ? 
1.f / dftParams.DFTLength : 1.f; StockhamFFT(fft_params, true, chirpLength, scale, commandList); + + // Transition resources to common state + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + commandList->ResourceBarrier(2, barriers); } void StockhamFFT( @@ -779,8 +813,27 @@ class GpuDFTOperator : public WRL::Base const auto& loopList = stockhamParams.ResourceLoopList; // Get input and output resources + auto inputBufferRegion = loopList[0].BufferRegion; + auto outputBufferRegion = loopList[stockhamParams.OutputIndex].BufferRegion; auto windowBufferRegion = dftParams.StockhamParams.Window.BufferRegion; + // Transition resources from common to UAV state + D3D12_RESOURCE_BARRIER barriers[2]; + + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + commandList->ResourceBarrier(2, barriers); + // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_stockhamFFTRootSignature.Get()); commandList->SetPipelineState(m_stockhamFFTPipelineState.Get()); @@ -822,6 +875,21 @@ class GpuDFTOperator : public WRL::Base std::array uav_resources = { in, out, window }; Dispatch(uav_resources, constants, commandList); } + + // Transition resources to common state + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[1] = 
CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + commandList->ResourceBarrier(2, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ -838,6 +906,14 @@ class GpuDFTOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { + D3D12_RESOURCE_BARRIER uav_barriers[TSize]; + + std::transform( + bufferRegions.begin(), bufferRegions.end(), + uav_barriers, + [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); + commandList->ResourceBarrier(TSize, uav_barriers); + for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -886,10 +962,7 @@ class GpuDFTOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + commandList->ResourceBarrier(TSize, uav_barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h index 8863bd5362d27..0611c4b7bf7f7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h @@ -683,6 +683,29 @@ class DmlGridSampleOperator : public WRL::Base Dml::GetDescendingPackedStrides(gridDims, gridStrides); Dml::GetDescendingPackedStrides(outputDims, outputStrides); + // Transition resources from common to UAV state + D3D12_RESOURCE_BARRIER barriers[3]; + + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + 
gridBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + commandList->ResourceBarrier(3, barriers); + // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get()); commandList->SetPipelineState(m_gridSamplePipelineState.Get()); @@ -704,10 +727,26 @@ class DmlGridSampleOperator : public WRL::Base std::array uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion }; Dispatch(uavBufferRegions, constants, commandList); - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + // Transition resources to common state + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + gridBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + commandList->ResourceBarrier(3, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ -724,6 +763,14 @@ class DmlGridSampleOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { + D3D12_RESOURCE_BARRIER uav_barriers[TSize]; + + std::transform( + bufferRegions.begin(), bufferRegions.end(), + uav_barriers, + [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); + commandList->ResourceBarrier(TSize, 
uav_barriers); + for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -772,10 +819,7 @@ class DmlGridSampleOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + commandList->ResourceBarrier(TSize, uav_barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h index cd1f78e2a23a6..945b58965cf2f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h @@ -405,9 +405,15 @@ class DmlSTFTOperator : public WRL::Base std::array inputBindings; uint32_t inputBindingsCount = 1; + // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking + // barrierCount is outside the valid range. 
+ D3D12_RESOURCE_BARRIER barriers[3]; + uint32_t barrierCount = 0; + Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal); inputBuffers[0] = signalBufferRegion.GetBufferBinding(); inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] }; + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); Dml::D3D12BufferRegion windowBufferRegion; if (m_framingOperator.hasWindowTensor) @@ -415,6 +421,7 @@ class DmlSTFTOperator : public WRL::Base windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window); inputBuffers[1] = windowBufferRegion.GetBufferBinding(); inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] }; + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); inputBindingsCount++; } @@ -422,6 +429,7 @@ class DmlSTFTOperator : public WRL::Base DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding(); DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer }; + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); m_framingOperator.bindingTable->BindOutputs(1, &outputBinding); @@ -443,16 +451,22 @@ class DmlSTFTOperator : public WRL::Base m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc); } + // Transition resources COMMON -> UAV + commandList->ResourceBarrier(barrierCount, barriers); + m_framingOperator.commandRecorder->RecordDispatch( commandList, m_framingOperator.op.Get(), m_framingOperator.bindingTable.Get() ); - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - 
CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + // Transition resources UAV -> COMMON + for (uint32_t barrierIndex = 0; barrierIndex < barrierCount; barrierIndex++) + { + std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter); + } + + commandList->ResourceBarrier(barrierCount, barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp index a91886c3b5863..5bb04ba4d30b5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp @@ -106,7 +106,7 @@ namespace Dml gsl::span dstSizes, gsl::span src, gsl::span srcOffsets, - gsl::span srcStates) + D3D12_RESOURCE_STATES srcState) { assert(dst.size() == src.size()); assert(dstSizes.size() == src.size()); @@ -134,7 +134,7 @@ namespace Dml D3D12_RESOURCE_STATE_COPY_DEST, src[i], srcOffsets[i], - srcStates[i], + srcState, dstSizes[i]); offset += dstSizes[i]; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h index f888f0a55ac48..4a65ce899d791 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h @@ -28,7 +28,7 @@ namespace Dml gsl::span dstSizes, gsl::span src, gsl::span srcOffsets, - gsl::span srcStates); + D3D12_RESOURCE_STATES srcState); private: void EnsureReadbackHeap(size_t size); From 568e55039044980bffc4b3776ad73446dc322792 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 11 Jul 2023 22:07:30 -0700 Subject: [PATCH 42/76] Revert "Remove aliasing" This reverts commit e0569c593e11ff577c27f8b1d620ac0b5ab919b3. 
--- .../src/DmlAllocationInfo.h | 10 +++ .../DmlExecutionProvider/src/DmlBuffer.cpp | 10 +++ .../dml/DmlExecutionProvider/src/DmlBuffer.h | 3 +- .../src/DmlBufferRegion.cpp | 76 ++++++++++++++--- .../src/DmlBufferRegion.h | 34 ++++++-- .../src/DmlCommandRecorder.cpp | 15 +++- .../src/DmlCommittedResourceWrapper.h | 6 ++ .../src/DmlGraphFusionHelper.cpp | 2 +- .../src/DmlReservedResourceSubAllocator.cpp | 4 +- .../src/DmlReservedResourceSubAllocator.h | 4 +- .../src/DmlReservedResourceWrapper.h | 6 ++ .../src/DmlResourceWrapper.h | 6 +- .../src/ExecutionContext.cpp | 3 + .../src/ExecutionProvider.cpp | 78 +++++++++++++----- .../src/FusedGraphKernel.cpp | 4 +- .../src/MLOperatorAuthorImpl.cpp | 12 ++- .../src/MLOperatorAuthorImpl.h | 4 +- .../src/Operators/DmlDFT.h | 81 +------------------ .../src/Operators/DmlGridSample.h | 60 ++------------ .../src/Operators/DmlSTFT.h | 22 +---- .../DmlExecutionProvider/src/ReadbackHeap.cpp | 4 +- .../DmlExecutionProvider/src/ReadbackHeap.h | 2 +- 22 files changed, 245 insertions(+), 201 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h index 7c11358bb106d..546a42342a2a0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h @@ -36,6 +36,16 @@ namespace Dml return m_resourceWrapper->GetUavResource(); } + ID3D12Resource* GetCopySrcResource() const + { + return m_resourceWrapper->GetCopySrcResource(); + } + + ID3D12Resource* GetCopyDstResource() const + { + return m_resourceWrapper->GetCopyDstResource(); + } + ComPtr DetachResourceWrapper() const { return std::move(m_resourceWrapper); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp index 464ce26c16f54..c5fa576d24a0f 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp @@ -48,6 +48,16 @@ ID3D12Resource* DmlBuffer::ResourceInUavState() const return buffer_region_.ResourceInUavState(); } +ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const +{ + return buffer_region_.ResourceInCopySrcState(); +} + +ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const +{ + return buffer_region_.ResourceInCopyDstState(); +} + uint64_t DmlBuffer::Offset() const { return buffer_region_ ? buffer_region_.Offset() : 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h index 4b0dd58ce4467..b98ae727e1a65 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h @@ -27,8 +27,9 @@ class DmlBuffer DmlBuffer(DmlBuffer&&); DmlBuffer& operator=(DmlBuffer&&); - // TODO (pavignol): Rename to Resource() ID3D12Resource* ResourceInUavState() const; + ID3D12Resource* ResourceInCopySrcState() const; + ID3D12Resource* ResourceInCopyDstState() const; uint64_t Offset() const; uint64_t SizeInBytes() const; const D3D12BufferRegion& Region() const { return buffer_region_; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp index c33cc5491c7f0..3240042b5b6a6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp @@ -7,64 +7,114 @@ namespace Dml { - D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource) - : m_resource(resource), + D3D12BufferRegion::D3D12BufferRegion( + uint64_t offset, + uint64_t size_in_bytes, + ID3D12Resource* resource_uav_state, + 
ID3D12Resource* resource_copy_src_state, + ID3D12Resource* resource_copy_dst_state) + : resource_uav_state_(resource_uav_state), + resource_copy_src_state_(resource_copy_src_state), + resource_copy_dst_state_(resource_copy_dst_state), offset_(offset), size_in_bytes_(size_in_bytes) { - ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr); + // Get a raw pointer to the first non-null resource passed in. At least one + // resource must be provided. + first_valid_resource_ = resource_uav_state_; + if (!first_valid_resource_) + { + first_valid_resource_ = resource_copy_src_state_; + } + if (!first_valid_resource_) + { + first_valid_resource_ = resource_copy_dst_state_; + } + ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr); // Regions cannot be empty. ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0); // Regions cannot extend beyond the size of the resource. - uint64_t buffer_size = m_resource->GetDesc().Width; + uint64_t buffer_size = first_valid_resource_->GetDesc().Width; ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size); ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset); // All three resources, if provided, must be identical aside from state. 
- assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER); - assert(m_resource->GetDesc().Width == buffer_size); + assert( + first_valid_resource_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER); + assert( + !resource_uav_state || + (resource_uav_state->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_uav_state->GetDesc().Width == buffer_size)); + assert( + !resource_copy_src_state_ || + (resource_copy_src_state_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_copy_src_state_->GetDesc().Width == buffer_size)); + assert( + !resource_copy_dst_state_ || + (resource_copy_dst_state_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_copy_dst_state_->GetDesc().Width == buffer_size)); } D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) { - std::swap(this->m_resource, that.m_resource); + std::swap(this->resource_uav_state_, that.resource_uav_state_); + std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); + std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, that.size_in_bytes_); + std::swap(this->first_valid_resource_, that.first_valid_resource_); } D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) { - std::swap(this->m_resource, that.m_resource); + std::swap(this->resource_uav_state_, that.resource_uav_state_); + std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); + std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, that.size_in_bytes_); + std::swap(this->first_valid_resource_, that.first_valid_resource_); return *this; } ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const { - return m_resource; + return resource_uav_state_; + } + + ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const + { + 
return resource_copy_src_state_; + } + + ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const + { + return resource_copy_dst_state_; } uint64_t D3D12BufferRegion::Offset() const { - return m_resource ? offset_ : 0; + return first_valid_resource_ ? offset_ : 0; } uint64_t D3D12BufferRegion::SizeInBytes() const { - return m_resource ? size_in_bytes_ : 0; + return first_valid_resource_ ? size_in_bytes_ : 0; } DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const { - if (!m_resource) + if (!resource_uav_state_) { return DML_BUFFER_BINDING{}; } - return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_}; + return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_}; } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h index 6c5cb37297caa..dee01a29fe55f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h @@ -19,39 +19,61 @@ namespace Dml D3D12BufferRegion( uint64_t offset, uint64_t size_in_bytes, - ID3D12Resource* resource); + ID3D12Resource* resource_uav_state, + ID3D12Resource* resource_copy_src_state, + ID3D12Resource* resource_copy_dst_state); // Move-only D3D12BufferRegion(const D3D12BufferRegion&) = default; D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default; D3D12BufferRegion(D3D12BufferRegion&&); D3D12BufferRegion& operator=(D3D12BufferRegion&&); + ID3D12Resource* ResourceInUavState() const; + // NOTE: may be any state that is valid as a copy source (COPY_SRC, + // GENERIC_READ, or COMMON). 
+ ID3D12Resource* ResourceInCopySrcState() const; + + ID3D12Resource* ResourceInCopyDstState() const; + uint64_t Offset() const; uint64_t SizeInBytes() const; DML_BUFFER_BINDING GetBufferBinding() const; - explicit operator bool() const { return m_resource != nullptr; } + explicit operator bool() const { return first_valid_resource_ != nullptr; } // Creates a subregion at an offset from the start of this region. If no // size is provided the region runs to the end of the current region. - inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const + inline D3D12BufferRegion Subregion( + uint64_t offset, + uint64_t size_in_bytes = 0) const { // start of subregion must be within current region ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_); - size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; + size_in_bytes = + size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; // end of subregion must be within current region ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset); - return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource); + return D3D12BufferRegion( + offset_ + offset, + size_in_bytes, + resource_uav_state_, + resource_copy_src_state_, + resource_copy_dst_state_); } private: - ID3D12Resource* m_resource = nullptr; + ID3D12Resource* resource_uav_state_ = nullptr; + ID3D12Resource* resource_copy_src_state_ = nullptr; + ID3D12Resource* resource_copy_dst_state_ = nullptr; uint64_t offset_ = 0; uint64_t size_in_bytes_ = 0; + + // Pointer to the first resource above that isn't null. 
+ ID3D12Resource* first_valid_resource_ = nullptr; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index 862884c22b08c..af625334b7720 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -94,8 +94,10 @@ void DmlCommandRecorder::InitializeOperator( if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) || (temporaryResourceSize > 0)) { - auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); - m_currentCommandList->ResourceBarrier(1, &uav); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); } } @@ -154,8 +156,13 @@ void DmlCommandRecorder::ExecuteOperator( // Barrier all outputs. #pragma warning(push) #pragma warning(disable: 6387) - auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); - m_currentCommandList->ResourceBarrier(1, &uav); + + // Barrier all outputs. 
+ D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); + #pragma warning(pop) } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h index 4b9c167dfe671..f786cca837f06 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h @@ -12,6 +12,12 @@ namespace Dml // Committed resources use the same resource for all states and use barriers to transition between states ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); } + ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); } + ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); } + + D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } + D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } + D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } private: ComPtr m_d3d12Resource; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index dcf6b8607f319..cd6b241e70d48 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -103,7 +103,7 @@ namespace DmlGraphFusionHelper // The allocation is not pooled auto allocInfo = static_cast(opaqueData); *allocId = 0; - return D3D12BufferRegion(0, 
allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); + return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); } auto taggedPointer = TaggedPointer::Unpack(opaqueData); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index 0dc07384ea905..c82e0a4f5d722 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -400,7 +400,9 @@ namespace Dml return D3D12BufferRegion( taggedPointer.offset, size_in_bytes, - it->second->GetUavResource()); + it->second->GetUavResource(), + it->second->GetCopySrcResource(), + it->second->GetCopyDstResource()); } AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index 8049848c8671e..1d7c8704ab7da 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -60,7 +60,9 @@ namespace Dml // the ID3D12Resource is cached, so this call typically has a lower cost // than a call to ID3D12Device::CreatePlacedResource or // CreateReservedResource. 
- D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes); + D3D12BufferRegion CreateBufferRegion( + const TaggedPointer& taggedPointer, + uint64_t size_in_bytes); AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h index 22f8cbbdc394b..68feab568ca45 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h @@ -18,6 +18,12 @@ namespace Dml } ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); } + ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); } + ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); } + + D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } + D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; } + D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; } private: DmlHeapAllocation m_allocation; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h index 2b1a8e5c726dc..03e9f762b7eb4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h @@ -11,8 +11,12 @@ namespace Dml DmlResourceWrapper : public IUnknown { public: - // TODO (pavignol): Rename to GetResource() virtual ID3D12Resource* GetUavResource() const = 0; + virtual ID3D12Resource* GetCopySrcResource() const = 0; 
+ virtual ID3D12Resource* GetCopyDstResource() const = 0; + virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0; + virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0; + virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0; virtual ~DmlResourceWrapper(){} }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index b5492a1a86ea3..d6a46e354c769 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -100,6 +100,8 @@ namespace Dml barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState)); } + // Since this copy may write to GPU memory, we also need to perform an aliasing barrier + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); // Keep the intermediate buffer alive until we're done with it @@ -133,6 +135,7 @@ namespace Dml // Since this copy may write to GPU memory, we also need to perform an // aliasing barrier + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); */ } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index b06d23adf5886..f72035f5e5fda 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -443,8 +443,15 @@ namespace Dml // CPU -> GPU copy (upload) // auto dstBufferRegion = GetBufferForTensor(dst); - ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); - const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + + ID3D12Resource* dstData = 
dstBufferRegion.ResourceInCopyDstState() == nullptr + ? dstBufferRegion.ResourceInUavState() + : dstBufferRegion.ResourceInCopyDstState(); + + const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr + ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_DEST; + const uint64_t dstOffset = dstBufferRegion.Offset(); m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes)); FlushUploadsIfReady(); @@ -455,26 +462,47 @@ namespace Dml // GPU -> CPU copy (readback) // auto srcBufferRegion = GetBufferForTensor(src); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); + + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; + const uint64_t srcOffset = srcBufferRegion.Offset(); m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState); } else if (!src->IsCpuData() && !dst->IsCpuData()) { + printf("*****************DmlCommandRecorder::CopyBufferRegion\n"); + // // GPU -> GPU copy // auto srcBufferRegion = GetBufferForTensor(src); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - const uint64_t srcOffset = srcBufferRegion.Offset(); + + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); + + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; auto dstBufferRegion = GetBufferForTensor(dst); - ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); - const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - const uint64_t dstOffset = dstBufferRegion.Offset(); + ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr + ? dstBufferRegion.ResourceInUavState() + : dstBufferRegion.ResourceInCopyDstState(); + + const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr + ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_DEST; + + const uint64_t srcOffset = srcBufferRegion.Offset(); + const uint64_t dstOffset = dstBufferRegion.Offset(); m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes); } else @@ -496,6 +524,7 @@ namespace Dml // Source and destination for batched GPU -> CPU copies std::vector srcDatas; + std::vector srcStates; std::vector srcOffsets; std::vector dstDatas; std::vector dataSizesInBytes; @@ -528,16 +557,21 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(src[i]); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); + + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; srcDatas.push_back(srcData); + srcStates.push_back(srcState); srcOffsets.push_back(srcBufferRegion.Offset()); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); return S_OK; } @@ -907,6 +941,9 @@ namespace Dml std::vector srcDatas; srcDatas.reserve(src_dst_pairs.size()); + std::vector srcStates; + srcStates.reserve(src_dst_pairs.size()); + std::vector srcOffsets; srcOffsets.reserve(src_dst_pairs.size()); @@ -956,16 +993,21 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(&srcWrapper); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); + + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; srcDatas.push_back(srcData); + srcStates.push_back(srcState); srcOffsets.push_back(srcBufferRegion.Offset()); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); return onnxruntime::common::Status::OK(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index b00b8f8e19f52..35955b113b2c1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -124,7 +124,9 @@ namespace Dml inputBufferRegions[i] = D3D12BufferRegion( 0, m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width, - m_nonOwnedGraphInputsFromInitializers[i].Get()); + m_nonOwnedGraphInputsFromInitializers[i].Get(), + nullptr, + nullptr); } else if (!m_isInputsUploadedByDmlEP[i]) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index dde290f0bce0f..1e7ebdc234c22 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1617,7 +1617,7 @@ namespace Windows::AI::MachineLearning::Adapter if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL) { auto allocInfo = static_cast(m_tensorData); - return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); + return 
Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); } auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData); @@ -1716,6 +1716,8 @@ namespace Windows::AI::MachineLearning::Adapter } } + // TODO (pavignol): Fix once we go back to a single resource + /* void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp) { if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) @@ -1767,9 +1769,9 @@ namespace Windows::AI::MachineLearning::Adapter } } - for (auto& tempBuffer : m_temporaryBuffers) + for (auto& tempAlloc : m_temporaryAbiAllocations) { - resourcesToTransition.push_back(tempBuffer.ResourceInUavState()); + resourcesToTransition.push_back(tempAlloc.Get()); } m_winmlProvider->TransitionResourcesForOperator( @@ -1778,6 +1780,7 @@ namespace Windows::AI::MachineLearning::Adapter resourcesToTransition.data()); } } + */ OpKernelContextWrapper::OpKernelContextWrapper( onnxruntime::OpKernelContext* context, @@ -1825,10 +1828,13 @@ namespace Windows::AI::MachineLearning::Adapter void OpKernelContextWrapper::Close() { + // TODO (pavignol): Fix once we go back to a single resource + /* if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) { TransitionResourcesForOperatorIfRequired(false); } + */ for (auto& tensors : m_inputTensors) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index 85b6b197fe511..647e0a17d26df 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -512,7 +512,9 @@ class OpKernelContextWrapper : public WRL::Base const auto& bluesteinZChirpParams = dftParams.BluesteinZChirpParams; // Get resources - auto inputBufferRegion = 
bluesteinZChirpParams.AFFTParams.ResourceLoopList.front().BufferRegion; - auto outputBufferRegion = bluesteinZChirpParams.AFFTInverseParams.ResourceLoopList[bluesteinZChirpParams.AFFTInverseParams.OutputIndex].BufferRegion; auto zChirpBufferRegion = bluesteinZChirpParams.ZChirp.BufferRegion; auto bBufferRegion = bluesteinZChirpParams.B.BufferRegion; - // Transition resources from common to UAV state - D3D12_RESOURCE_BARRIER barriers[2]; - - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - commandList->ResourceBarrier(2, barriers); - // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_bluesteinChirpRootSignature.Get()); commandList->SetPipelineState(m_bluesteinChirpPipelineState.Get()); @@ -783,21 +764,6 @@ class GpuDFTOperator : public WRL::Base chirpLength *= (m_isInverse ? 1 : -1); float scale = isInverse ? 
1.f / dftParams.DFTLength : 1.f; StockhamFFT(fft_params, true, chirpLength, scale, commandList); - - // Transition resources to common state - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - commandList->ResourceBarrier(2, barriers); } void StockhamFFT( @@ -813,27 +779,8 @@ class GpuDFTOperator : public WRL::Base const auto& loopList = stockhamParams.ResourceLoopList; // Get input and output resources - auto inputBufferRegion = loopList[0].BufferRegion; - auto outputBufferRegion = loopList[stockhamParams.OutputIndex].BufferRegion; auto windowBufferRegion = dftParams.StockhamParams.Window.BufferRegion; - // Transition resources from common to UAV state - D3D12_RESOURCE_BARRIER barriers[2]; - - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - commandList->ResourceBarrier(2, barriers); - // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_stockhamFFTRootSignature.Get()); commandList->SetPipelineState(m_stockhamFFTPipelineState.Get()); @@ -875,21 +822,6 @@ class GpuDFTOperator : public WRL::Base std::array uav_resources = { in, out, window }; Dispatch(uav_resources, constants, commandList); } - - // Transition resources to common state - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - barriers[1] = 
CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - commandList->ResourceBarrier(2, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ -906,14 +838,6 @@ class GpuDFTOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { - D3D12_RESOURCE_BARRIER uav_barriers[TSize]; - - std::transform( - bufferRegions.begin(), bufferRegions.end(), - uav_barriers, - [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); - commandList->ResourceBarrier(TSize, uav_barriers); - for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -962,7 +886,10 @@ class GpuDFTOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - commandList->ResourceBarrier(TSize, uav_barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h index 0611c4b7bf7f7..8863bd5362d27 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h @@ -683,29 +683,6 @@ class DmlGridSampleOperator : public WRL::Base Dml::GetDescendingPackedStrides(gridDims, gridStrides); Dml::GetDescendingPackedStrides(outputDims, outputStrides); - // Transition resources from common to UAV state - D3D12_RESOURCE_BARRIER barriers[3]; - - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - 
gridBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - commandList->ResourceBarrier(3, barriers); - // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get()); commandList->SetPipelineState(m_gridSamplePipelineState.Get()); @@ -727,26 +704,10 @@ class DmlGridSampleOperator : public WRL::Base std::array uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion }; Dispatch(uavBufferRegions, constants, commandList); - // Transition resources to common state - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - gridBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - commandList->ResourceBarrier(3, barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ -763,14 +724,6 @@ class DmlGridSampleOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { - D3D12_RESOURCE_BARRIER uav_barriers[TSize]; - - std::transform( - bufferRegions.begin(), bufferRegions.end(), - uav_barriers, - [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); - commandList->ResourceBarrier(TSize, 
uav_barriers); - for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -819,7 +772,10 @@ class DmlGridSampleOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - commandList->ResourceBarrier(TSize, uav_barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h index 945b58965cf2f..cd1f78e2a23a6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h @@ -405,15 +405,9 @@ class DmlSTFTOperator : public WRL::Base std::array inputBindings; uint32_t inputBindingsCount = 1; - // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking - // barrierCount is outside the valid range. 
- D3D12_RESOURCE_BARRIER barriers[3]; - uint32_t barrierCount = 0; - Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal); inputBuffers[0] = signalBufferRegion.GetBufferBinding(); inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] }; - barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); Dml::D3D12BufferRegion windowBufferRegion; if (m_framingOperator.hasWindowTensor) @@ -421,7 +415,6 @@ class DmlSTFTOperator : public WRL::Base windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window); inputBuffers[1] = windowBufferRegion.GetBufferBinding(); inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] }; - barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); inputBindingsCount++; } @@ -429,7 +422,6 @@ class DmlSTFTOperator : public WRL::Base DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding(); DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer }; - barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); m_framingOperator.bindingTable->BindOutputs(1, &outputBinding); @@ -451,22 +443,16 @@ class DmlSTFTOperator : public WRL::Base m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc); } - // Transition resources COMMON -> UAV - commandList->ResourceBarrier(barrierCount, barriers); - m_framingOperator.commandRecorder->RecordDispatch( commandList, m_framingOperator.op.Get(), m_framingOperator.bindingTable.Get() ); - // Transition resources UAV -> COMMON - for (uint32_t barrierIndex = 0; barrierIndex < 
barrierCount; barrierIndex++) - { - std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter); - } - - commandList->ResourceBarrier(barrierCount, barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp index 5bb04ba4d30b5..a91886c3b5863 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp @@ -106,7 +106,7 @@ namespace Dml gsl::span dstSizes, gsl::span src, gsl::span srcOffsets, - D3D12_RESOURCE_STATES srcState) + gsl::span srcStates) { assert(dst.size() == src.size()); assert(dstSizes.size() == src.size()); @@ -134,7 +134,7 @@ namespace Dml D3D12_RESOURCE_STATE_COPY_DEST, src[i], srcOffsets[i], - srcState, + srcStates[i], dstSizes[i]); offset += dstSizes[i]; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h index 4a65ce899d791..f888f0a55ac48 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h @@ -28,7 +28,7 @@ namespace Dml gsl::span dstSizes, gsl::span src, gsl::span srcOffsets, - D3D12_RESOURCE_STATES srcState); + gsl::span srcStates); private: void EnsureReadbackHeap(size_t size); From 943ac58d7af0ceaa8aeee2c4f85f04d6bfcba1f8 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 12 Jul 2023 07:59:45 -0700 Subject: [PATCH 43/76] Re-add "Remove aliasing" This reverts commit 568e55039044980bffc4b3776ad73446dc322792. 
--- .../src/DmlAllocationInfo.h | 10 --- .../DmlExecutionProvider/src/DmlBuffer.cpp | 10 --- .../dml/DmlExecutionProvider/src/DmlBuffer.h | 3 +- .../src/DmlBufferRegion.cpp | 76 +++-------------- .../src/DmlBufferRegion.h | 34 ++------ .../src/DmlCommandRecorder.cpp | 15 +--- .../src/DmlCommittedResourceWrapper.h | 6 -- .../src/DmlGraphFusionHelper.cpp | 2 +- .../src/DmlReservedResourceSubAllocator.cpp | 4 +- .../src/DmlReservedResourceSubAllocator.h | 4 +- .../src/DmlReservedResourceWrapper.h | 6 -- .../src/DmlResourceWrapper.h | 6 +- .../src/ExecutionContext.cpp | 3 - .../src/ExecutionProvider.cpp | 78 +++++------------- .../src/FusedGraphKernel.cpp | 4 +- .../src/MLOperatorAuthorImpl.cpp | 12 +-- .../src/MLOperatorAuthorImpl.h | 4 +- .../src/Operators/DmlDFT.h | 81 ++++++++++++++++++- .../src/Operators/DmlGridSample.h | 60 ++++++++++++-- .../src/Operators/DmlSTFT.h | 22 ++++- .../DmlExecutionProvider/src/ReadbackHeap.cpp | 4 +- .../DmlExecutionProvider/src/ReadbackHeap.h | 2 +- 22 files changed, 201 insertions(+), 245 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h index 546a42342a2a0..7c11358bb106d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h @@ -36,16 +36,6 @@ namespace Dml return m_resourceWrapper->GetUavResource(); } - ID3D12Resource* GetCopySrcResource() const - { - return m_resourceWrapper->GetCopySrcResource(); - } - - ID3D12Resource* GetCopyDstResource() const - { - return m_resourceWrapper->GetCopyDstResource(); - } - ComPtr DetachResourceWrapper() const { return std::move(m_resourceWrapper); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp index c5fa576d24a0f..464ce26c16f54 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp @@ -48,16 +48,6 @@ ID3D12Resource* DmlBuffer::ResourceInUavState() const return buffer_region_.ResourceInUavState(); } -ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const -{ - return buffer_region_.ResourceInCopySrcState(); -} - -ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const -{ - return buffer_region_.ResourceInCopyDstState(); -} - uint64_t DmlBuffer::Offset() const { return buffer_region_ ? buffer_region_.Offset() : 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h index b98ae727e1a65..4b0dd58ce4467 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h @@ -27,9 +27,8 @@ class DmlBuffer DmlBuffer(DmlBuffer&&); DmlBuffer& operator=(DmlBuffer&&); + // TODO (pavignol): Rename to Resource() ID3D12Resource* ResourceInUavState() const; - ID3D12Resource* ResourceInCopySrcState() const; - ID3D12Resource* ResourceInCopyDstState() const; uint64_t Offset() const; uint64_t SizeInBytes() const; const D3D12BufferRegion& Region() const { return buffer_region_; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp index 3240042b5b6a6..c33cc5491c7f0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp @@ -7,114 +7,64 @@ namespace Dml { - D3D12BufferRegion::D3D12BufferRegion( - uint64_t offset, - uint64_t size_in_bytes, - ID3D12Resource* resource_uav_state, - ID3D12Resource* resource_copy_src_state, - ID3D12Resource* resource_copy_dst_state) - : resource_uav_state_(resource_uav_state), - 
resource_copy_src_state_(resource_copy_src_state), - resource_copy_dst_state_(resource_copy_dst_state), + D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource) + : m_resource(resource), offset_(offset), size_in_bytes_(size_in_bytes) { - // Get a raw pointer to the first non-null resource passed in. At least one - // resource must be provided. - first_valid_resource_ = resource_uav_state_; - if (!first_valid_resource_) - { - first_valid_resource_ = resource_copy_src_state_; - } - if (!first_valid_resource_) - { - first_valid_resource_ = resource_copy_dst_state_; - } - ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr); + ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr); // Regions cannot be empty. ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0); // Regions cannot extend beyond the size of the resource. - uint64_t buffer_size = first_valid_resource_->GetDesc().Width; + uint64_t buffer_size = m_resource->GetDesc().Width; ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size); ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset); // All three resources, if provided, must be identical aside from state. 
- assert( - first_valid_resource_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER); - assert( - !resource_uav_state || - (resource_uav_state->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_uav_state->GetDesc().Width == buffer_size)); - assert( - !resource_copy_src_state_ || - (resource_copy_src_state_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_copy_src_state_->GetDesc().Width == buffer_size)); - assert( - !resource_copy_dst_state_ || - (resource_copy_dst_state_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_copy_dst_state_->GetDesc().Width == buffer_size)); + assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER); + assert(m_resource->GetDesc().Width == buffer_size); } D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) { - std::swap(this->resource_uav_state_, that.resource_uav_state_); - std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); - std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); + std::swap(this->m_resource, that.m_resource); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, that.size_in_bytes_); - std::swap(this->first_valid_resource_, that.first_valid_resource_); } D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) { - std::swap(this->resource_uav_state_, that.resource_uav_state_); - std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); - std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); + std::swap(this->m_resource, that.m_resource); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, that.size_in_bytes_); - std::swap(this->first_valid_resource_, that.first_valid_resource_); return *this; } ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const { - return resource_uav_state_; - } - - ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const - { - return 
resource_copy_src_state_; - } - - ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const - { - return resource_copy_dst_state_; + return m_resource; } uint64_t D3D12BufferRegion::Offset() const { - return first_valid_resource_ ? offset_ : 0; + return m_resource ? offset_ : 0; } uint64_t D3D12BufferRegion::SizeInBytes() const { - return first_valid_resource_ ? size_in_bytes_ : 0; + return m_resource ? size_in_bytes_ : 0; } DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const { - if (!resource_uav_state_) + if (!m_resource) { return DML_BUFFER_BINDING{}; } - return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_}; + return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_}; } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h index dee01a29fe55f..6c5cb37297caa 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h @@ -19,61 +19,39 @@ namespace Dml D3D12BufferRegion( uint64_t offset, uint64_t size_in_bytes, - ID3D12Resource* resource_uav_state, - ID3D12Resource* resource_copy_src_state, - ID3D12Resource* resource_copy_dst_state); + ID3D12Resource* resource); // Move-only D3D12BufferRegion(const D3D12BufferRegion&) = default; D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default; D3D12BufferRegion(D3D12BufferRegion&&); D3D12BufferRegion& operator=(D3D12BufferRegion&&); - ID3D12Resource* ResourceInUavState() const; - // NOTE: may be any state that is valid as a copy source (COPY_SRC, - // GENERIC_READ, or COMMON). 
- ID3D12Resource* ResourceInCopySrcState() const; - - ID3D12Resource* ResourceInCopyDstState() const; - uint64_t Offset() const; uint64_t SizeInBytes() const; DML_BUFFER_BINDING GetBufferBinding() const; - explicit operator bool() const { return first_valid_resource_ != nullptr; } + explicit operator bool() const { return m_resource != nullptr; } // Creates a subregion at an offset from the start of this region. If no // size is provided the region runs to the end of the current region. - inline D3D12BufferRegion Subregion( - uint64_t offset, - uint64_t size_in_bytes = 0) const + inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const { // start of subregion must be within current region ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_); - size_in_bytes = - size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; + size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; // end of subregion must be within current region ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset); - return D3D12BufferRegion( - offset_ + offset, - size_in_bytes, - resource_uav_state_, - resource_copy_src_state_, - resource_copy_dst_state_); + return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource); } private: - ID3D12Resource* resource_uav_state_ = nullptr; - ID3D12Resource* resource_copy_src_state_ = nullptr; - ID3D12Resource* resource_copy_dst_state_ = nullptr; + ID3D12Resource* m_resource = nullptr; uint64_t offset_ = 0; uint64_t size_in_bytes_ = 0; - - // Pointer to the first resource above that isn't null. 
- ID3D12Resource* first_valid_resource_ = nullptr; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index af625334b7720..862884c22b08c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -94,10 +94,8 @@ void DmlCommandRecorder::InitializeOperator( if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) || (temporaryResourceSize > 0)) { - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); + auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); + m_currentCommandList->ResourceBarrier(1, &uav); } } @@ -156,13 +154,8 @@ void DmlCommandRecorder::ExecuteOperator( // Barrier all outputs. #pragma warning(push) #pragma warning(disable: 6387) - - // Barrier all outputs. 
- D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); - + auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); + m_currentCommandList->ResourceBarrier(1, &uav); #pragma warning(pop) } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h index f786cca837f06..4b9c167dfe671 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h @@ -12,12 +12,6 @@ namespace Dml // Committed resources use the same resource for all states and use barriers to transition between states ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); } - ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); } - ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); } - - D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } - D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } - D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } private: ComPtr m_d3d12Resource; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index cd6b241e70d48..dcf6b8607f319 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -103,7 +103,7 @@ namespace DmlGraphFusionHelper // The allocation is not pooled auto allocInfo 
= static_cast(opaqueData); *allocId = 0; - return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); + return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); } auto taggedPointer = TaggedPointer::Unpack(opaqueData); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index c82e0a4f5d722..0dc07384ea905 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -400,9 +400,7 @@ namespace Dml return D3D12BufferRegion( taggedPointer.offset, size_in_bytes, - it->second->GetUavResource(), - it->second->GetCopySrcResource(), - it->second->GetCopyDstResource()); + it->second->GetUavResource()); } AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index 1d7c8704ab7da..8049848c8671e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -60,9 +60,7 @@ namespace Dml // the ID3D12Resource is cached, so this call typically has a lower cost // than a call to ID3D12Device::CreatePlacedResource or // CreateReservedResource. 
- D3D12BufferRegion CreateBufferRegion( - const TaggedPointer& taggedPointer, - uint64_t size_in_bytes); + D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes); AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h index 68feab568ca45..22f8cbbdc394b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h @@ -18,12 +18,6 @@ namespace Dml } ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); } - ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); } - ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); } - - D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } - D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; } - D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; } private: DmlHeapAllocation m_allocation; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h index 03e9f762b7eb4..2b1a8e5c726dc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h @@ -11,12 +11,8 @@ namespace Dml DmlResourceWrapper : public IUnknown { public: + // TODO (pavignol): Rename to GetResource() virtual ID3D12Resource* GetUavResource() const = 0; - virtual ID3D12Resource* GetCopySrcResource() const = 0; 
- virtual ID3D12Resource* GetCopyDstResource() const = 0; - virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0; - virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0; - virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0; virtual ~DmlResourceWrapper(){} }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index d6a46e354c769..b5492a1a86ea3 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -100,8 +100,6 @@ namespace Dml barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState)); } - // Since this copy may write to GPU memory, we also need to perform an aliasing barrier - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); // Keep the intermediate buffer alive until we're done with it @@ -135,7 +133,6 @@ namespace Dml // Since this copy may write to GPU memory, we also need to perform an // aliasing barrier - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); */ } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index f72035f5e5fda..b06d23adf5886 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -443,15 +443,8 @@ namespace Dml // CPU -> GPU copy (upload) // auto dstBufferRegion = GetBufferForTensor(dst); - - ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? 
dstBufferRegion.ResourceInUavState() - : dstBufferRegion.ResourceInCopyDstState(); - - const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_DEST; - + ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); + const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t dstOffset = dstBufferRegion.Offset(); m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes)); FlushUploadsIfReady(); @@ -462,47 +455,26 @@ namespace Dml // GPU -> CPU copy (readback) // auto srcBufferRegion = GetBufferForTensor(src); - - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; - + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t srcOffset = srcBufferRegion.Offset(); m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState); } else if (!src->IsCpuData() && !dst->IsCpuData()) { - printf("*****************DmlCommandRecorder::CopyBufferRegion\n"); - // // GPU -> GPU copy // auto srcBufferRegion = GetBufferForTensor(src); - - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + const uint64_t srcOffset = srcBufferRegion.Offset(); auto dstBufferRegion = GetBufferForTensor(dst); - - ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? dstBufferRegion.ResourceInUavState() - : dstBufferRegion.ResourceInCopyDstState(); - - const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_DEST; - - const uint64_t srcOffset = srcBufferRegion.Offset(); + ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); + const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t dstOffset = dstBufferRegion.Offset(); + m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes); } else @@ -524,7 +496,6 @@ namespace Dml // Source and destination for batched GPU -> CPU copies std::vector srcDatas; - std::vector srcStates; std::vector srcOffsets; std::vector dstDatas; std::vector dataSizesInBytes; @@ -557,21 +528,16 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(src[i]); - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; srcDatas.push_back(srcData); - srcStates.push_back(srcState); srcOffsets.push_back(srcBufferRegion.Offset()); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); return S_OK; } @@ -941,9 +907,6 @@ namespace Dml std::vector srcDatas; srcDatas.reserve(src_dst_pairs.size()); - std::vector srcStates; - srcStates.reserve(src_dst_pairs.size()); - std::vector srcOffsets; srcOffsets.reserve(src_dst_pairs.size()); @@ -993,21 +956,16 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(&srcWrapper); - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; srcDatas.push_back(srcData); - srcStates.push_back(srcState); srcOffsets.push_back(srcBufferRegion.Offset()); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); return onnxruntime::common::Status::OK(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index 35955b113b2c1..b00b8f8e19f52 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -124,9 +124,7 @@ namespace Dml inputBufferRegions[i] = D3D12BufferRegion( 0, m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width, - m_nonOwnedGraphInputsFromInitializers[i].Get(), - nullptr, - nullptr); + m_nonOwnedGraphInputsFromInitializers[i].Get()); } else if (!m_isInputsUploadedByDmlEP[i]) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 1e7ebdc234c22..dde290f0bce0f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1617,7 +1617,7 @@ namespace Windows::AI::MachineLearning::Adapter if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL) { auto allocInfo = static_cast(m_tensorData); - return 
Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); + return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); } auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData); @@ -1716,8 +1716,6 @@ namespace Windows::AI::MachineLearning::Adapter } } - // TODO (pavignol): Fix once we go back to a single resource - /* void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp) { if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) @@ -1769,9 +1767,9 @@ namespace Windows::AI::MachineLearning::Adapter } } - for (auto& tempAlloc : m_temporaryAbiAllocations) + for (auto& tempBuffer : m_temporaryBuffers) { - resourcesToTransition.push_back(tempAlloc.Get()); + resourcesToTransition.push_back(tempBuffer.ResourceInUavState()); } m_winmlProvider->TransitionResourcesForOperator( @@ -1780,7 +1778,6 @@ namespace Windows::AI::MachineLearning::Adapter resourcesToTransition.data()); } } - */ OpKernelContextWrapper::OpKernelContextWrapper( onnxruntime::OpKernelContext* context, @@ -1828,13 +1825,10 @@ namespace Windows::AI::MachineLearning::Adapter void OpKernelContextWrapper::Close() { - // TODO (pavignol): Fix once we go back to a single resource - /* if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) { TransitionResourcesForOperatorIfRequired(false); } - */ for (auto& tensors : m_inputTensors) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index 647e0a17d26df..85b6b197fe511 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -512,9 +512,7 @@ class OpKernelContextWrapper : public WRL::Base const auto& bluesteinZChirpParams = 
dftParams.BluesteinZChirpParams; // Get resources + auto inputBufferRegion = bluesteinZChirpParams.AFFTParams.ResourceLoopList.front().BufferRegion; + auto outputBufferRegion = bluesteinZChirpParams.AFFTInverseParams.ResourceLoopList[bluesteinZChirpParams.AFFTInverseParams.OutputIndex].BufferRegion; auto zChirpBufferRegion = bluesteinZChirpParams.ZChirp.BufferRegion; auto bBufferRegion = bluesteinZChirpParams.B.BufferRegion; + // Transition resources from common to UAV state + D3D12_RESOURCE_BARRIER barriers[2]; + + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + commandList->ResourceBarrier(2, barriers); + // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_bluesteinChirpRootSignature.Get()); commandList->SetPipelineState(m_bluesteinChirpPipelineState.Get()); @@ -764,6 +783,21 @@ class GpuDFTOperator : public WRL::Base chirpLength *= (m_isInverse ? 1 : -1); float scale = isInverse ? 
1.f / dftParams.DFTLength : 1.f; StockhamFFT(fft_params, true, chirpLength, scale, commandList); + + // Transition resources to common state + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + commandList->ResourceBarrier(2, barriers); } void StockhamFFT( @@ -779,8 +813,27 @@ class GpuDFTOperator : public WRL::Base const auto& loopList = stockhamParams.ResourceLoopList; // Get input and output resources + auto inputBufferRegion = loopList[0].BufferRegion; + auto outputBufferRegion = loopList[stockhamParams.OutputIndex].BufferRegion; auto windowBufferRegion = dftParams.StockhamParams.Window.BufferRegion; + // Transition resources from common to UAV state + D3D12_RESOURCE_BARRIER barriers[2]; + + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + commandList->ResourceBarrier(2, barriers); + // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_stockhamFFTRootSignature.Get()); commandList->SetPipelineState(m_stockhamFFTPipelineState.Get()); @@ -822,6 +875,21 @@ class GpuDFTOperator : public WRL::Base std::array uav_resources = { in, out, window }; Dispatch(uav_resources, constants, commandList); } + + // Transition resources to common state + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[1] = 
CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + commandList->ResourceBarrier(2, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ -838,6 +906,14 @@ class GpuDFTOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { + D3D12_RESOURCE_BARRIER uav_barriers[TSize]; + + std::transform( + bufferRegions.begin(), bufferRegions.end(), + uav_barriers, + [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); + commandList->ResourceBarrier(TSize, uav_barriers); + for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -886,10 +962,7 @@ class GpuDFTOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + commandList->ResourceBarrier(TSize, uav_barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h index 8863bd5362d27..0611c4b7bf7f7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h @@ -683,6 +683,29 @@ class DmlGridSampleOperator : public WRL::Base Dml::GetDescendingPackedStrides(gridDims, gridStrides); Dml::GetDescendingPackedStrides(outputDims, outputStrides); + // Transition resources from common to UAV state + D3D12_RESOURCE_BARRIER barriers[3]; + + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + 
gridBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + commandList->ResourceBarrier(3, barriers); + // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get()); commandList->SetPipelineState(m_gridSamplePipelineState.Get()); @@ -704,10 +727,26 @@ class DmlGridSampleOperator : public WRL::Base std::array uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion }; Dispatch(uavBufferRegions, constants, commandList); - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + // Transition resources to common state + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + gridBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + commandList->ResourceBarrier(3, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ -724,6 +763,14 @@ class DmlGridSampleOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { + D3D12_RESOURCE_BARRIER uav_barriers[TSize]; + + std::transform( + bufferRegions.begin(), bufferRegions.end(), + uav_barriers, + [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); + commandList->ResourceBarrier(TSize, 
uav_barriers); + for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -772,10 +819,7 @@ class DmlGridSampleOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + commandList->ResourceBarrier(TSize, uav_barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h index cd1f78e2a23a6..945b58965cf2f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h @@ -405,9 +405,15 @@ class DmlSTFTOperator : public WRL::Base std::array inputBindings; uint32_t inputBindingsCount = 1; + // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking + // barrierCount is outside the valid range. 
+ D3D12_RESOURCE_BARRIER barriers[3]; + uint32_t barrierCount = 0; + Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal); inputBuffers[0] = signalBufferRegion.GetBufferBinding(); inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] }; + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); Dml::D3D12BufferRegion windowBufferRegion; if (m_framingOperator.hasWindowTensor) @@ -415,6 +421,7 @@ class DmlSTFTOperator : public WRL::Base windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window); inputBuffers[1] = windowBufferRegion.GetBufferBinding(); inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] }; + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); inputBindingsCount++; } @@ -422,6 +429,7 @@ class DmlSTFTOperator : public WRL::Base DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding(); DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer }; + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); m_framingOperator.bindingTable->BindOutputs(1, &outputBinding); @@ -443,16 +451,22 @@ class DmlSTFTOperator : public WRL::Base m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc); } + // Transition resources COMMON -> UAV + commandList->ResourceBarrier(barrierCount, barriers); + m_framingOperator.commandRecorder->RecordDispatch( commandList, m_framingOperator.op.Get(), m_framingOperator.bindingTable.Get() ); - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - 
CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + // Transition resources UAV -> COMMON + for (uint32_t barrierIndex = 0; barrierIndex < barrierCount; barrierIndex++) + { + std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter); + } + + commandList->ResourceBarrier(barrierCount, barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp index a91886c3b5863..5bb04ba4d30b5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp @@ -106,7 +106,7 @@ namespace Dml gsl::span dstSizes, gsl::span src, gsl::span srcOffsets, - gsl::span srcStates) + D3D12_RESOURCE_STATES srcState) { assert(dst.size() == src.size()); assert(dstSizes.size() == src.size()); @@ -134,7 +134,7 @@ namespace Dml D3D12_RESOURCE_STATE_COPY_DEST, src[i], srcOffsets[i], - srcStates[i], + srcState, dstSizes[i]); offset += dstSizes[i]; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h index f888f0a55ac48..4a65ce899d791 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h @@ -28,7 +28,7 @@ namespace Dml gsl::span dstSizes, gsl::span src, gsl::span srcOffsets, - gsl::span srcStates); + D3D12_RESOURCE_STATES srcState); private: void EnsureReadbackHeap(size_t size); From 587489d84711cdc47da2b2cda5952fe9bd2cb3c7 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 12 Jul 2023 08:18:16 -0700 Subject: [PATCH 44/76] Revert "Re-add "Remove aliasing"" This reverts commit 943ac58d7af0ceaa8aeee2c4f85f04d6bfcba1f8. 
--- .../src/DmlAllocationInfo.h | 10 +++ .../DmlExecutionProvider/src/DmlBuffer.cpp | 10 +++ .../dml/DmlExecutionProvider/src/DmlBuffer.h | 3 +- .../src/DmlBufferRegion.cpp | 76 ++++++++++++++--- .../src/DmlBufferRegion.h | 34 ++++++-- .../src/DmlCommandRecorder.cpp | 15 +++- .../src/DmlCommittedResourceWrapper.h | 6 ++ .../src/DmlGraphFusionHelper.cpp | 2 +- .../src/DmlReservedResourceSubAllocator.cpp | 4 +- .../src/DmlReservedResourceSubAllocator.h | 4 +- .../src/DmlReservedResourceWrapper.h | 6 ++ .../src/DmlResourceWrapper.h | 6 +- .../src/ExecutionContext.cpp | 3 + .../src/ExecutionProvider.cpp | 78 +++++++++++++----- .../src/FusedGraphKernel.cpp | 4 +- .../src/MLOperatorAuthorImpl.cpp | 12 ++- .../src/MLOperatorAuthorImpl.h | 4 +- .../src/Operators/DmlDFT.h | 81 +------------------ .../src/Operators/DmlGridSample.h | 60 ++------------ .../src/Operators/DmlSTFT.h | 22 +---- .../DmlExecutionProvider/src/ReadbackHeap.cpp | 4 +- .../DmlExecutionProvider/src/ReadbackHeap.h | 2 +- 22 files changed, 245 insertions(+), 201 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h index 7c11358bb106d..546a42342a2a0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h @@ -36,6 +36,16 @@ namespace Dml return m_resourceWrapper->GetUavResource(); } + ID3D12Resource* GetCopySrcResource() const + { + return m_resourceWrapper->GetCopySrcResource(); + } + + ID3D12Resource* GetCopyDstResource() const + { + return m_resourceWrapper->GetCopyDstResource(); + } + ComPtr DetachResourceWrapper() const { return std::move(m_resourceWrapper); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp index 464ce26c16f54..c5fa576d24a0f 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp @@ -48,6 +48,16 @@ ID3D12Resource* DmlBuffer::ResourceInUavState() const return buffer_region_.ResourceInUavState(); } +ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const +{ + return buffer_region_.ResourceInCopySrcState(); +} + +ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const +{ + return buffer_region_.ResourceInCopyDstState(); +} + uint64_t DmlBuffer::Offset() const { return buffer_region_ ? buffer_region_.Offset() : 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h index 4b0dd58ce4467..b98ae727e1a65 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h @@ -27,8 +27,9 @@ class DmlBuffer DmlBuffer(DmlBuffer&&); DmlBuffer& operator=(DmlBuffer&&); - // TODO (pavignol): Rename to Resource() ID3D12Resource* ResourceInUavState() const; + ID3D12Resource* ResourceInCopySrcState() const; + ID3D12Resource* ResourceInCopyDstState() const; uint64_t Offset() const; uint64_t SizeInBytes() const; const D3D12BufferRegion& Region() const { return buffer_region_; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp index c33cc5491c7f0..3240042b5b6a6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp @@ -7,64 +7,114 @@ namespace Dml { - D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource) - : m_resource(resource), + D3D12BufferRegion::D3D12BufferRegion( + uint64_t offset, + uint64_t size_in_bytes, + ID3D12Resource* resource_uav_state, + 
ID3D12Resource* resource_copy_src_state, + ID3D12Resource* resource_copy_dst_state) + : resource_uav_state_(resource_uav_state), + resource_copy_src_state_(resource_copy_src_state), + resource_copy_dst_state_(resource_copy_dst_state), offset_(offset), size_in_bytes_(size_in_bytes) { - ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr); + // Get a raw pointer to the first non-null resource passed in. At least one + // resource must be provided. + first_valid_resource_ = resource_uav_state_; + if (!first_valid_resource_) + { + first_valid_resource_ = resource_copy_src_state_; + } + if (!first_valid_resource_) + { + first_valid_resource_ = resource_copy_dst_state_; + } + ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr); // Regions cannot be empty. ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0); // Regions cannot extend beyond the size of the resource. - uint64_t buffer_size = m_resource->GetDesc().Width; + uint64_t buffer_size = first_valid_resource_->GetDesc().Width; ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size); ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset); // All three resources, if provided, must be identical aside from state. 
- assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER); - assert(m_resource->GetDesc().Width == buffer_size); + assert( + first_valid_resource_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER); + assert( + !resource_uav_state || + (resource_uav_state->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_uav_state->GetDesc().Width == buffer_size)); + assert( + !resource_copy_src_state_ || + (resource_copy_src_state_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_copy_src_state_->GetDesc().Width == buffer_size)); + assert( + !resource_copy_dst_state_ || + (resource_copy_dst_state_->GetDesc().Dimension == + D3D12_RESOURCE_DIMENSION_BUFFER && + resource_copy_dst_state_->GetDesc().Width == buffer_size)); } D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) { - std::swap(this->m_resource, that.m_resource); + std::swap(this->resource_uav_state_, that.resource_uav_state_); + std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); + std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, that.size_in_bytes_); + std::swap(this->first_valid_resource_, that.first_valid_resource_); } D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) { - std::swap(this->m_resource, that.m_resource); + std::swap(this->resource_uav_state_, that.resource_uav_state_); + std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); + std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, that.size_in_bytes_); + std::swap(this->first_valid_resource_, that.first_valid_resource_); return *this; } ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const { - return m_resource; + return resource_uav_state_; + } + + ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const + { + 
return resource_copy_src_state_; + } + + ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const + { + return resource_copy_dst_state_; } uint64_t D3D12BufferRegion::Offset() const { - return m_resource ? offset_ : 0; + return first_valid_resource_ ? offset_ : 0; } uint64_t D3D12BufferRegion::SizeInBytes() const { - return m_resource ? size_in_bytes_ : 0; + return first_valid_resource_ ? size_in_bytes_ : 0; } DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const { - if (!m_resource) + if (!resource_uav_state_) { return DML_BUFFER_BINDING{}; } - return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_}; + return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_}; } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h index 6c5cb37297caa..dee01a29fe55f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h @@ -19,39 +19,61 @@ namespace Dml D3D12BufferRegion( uint64_t offset, uint64_t size_in_bytes, - ID3D12Resource* resource); + ID3D12Resource* resource_uav_state, + ID3D12Resource* resource_copy_src_state, + ID3D12Resource* resource_copy_dst_state); // Move-only D3D12BufferRegion(const D3D12BufferRegion&) = default; D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default; D3D12BufferRegion(D3D12BufferRegion&&); D3D12BufferRegion& operator=(D3D12BufferRegion&&); + ID3D12Resource* ResourceInUavState() const; + // NOTE: may be any state that is valid as a copy source (COPY_SRC, + // GENERIC_READ, or COMMON). 
+ ID3D12Resource* ResourceInCopySrcState() const; + + ID3D12Resource* ResourceInCopyDstState() const; + uint64_t Offset() const; uint64_t SizeInBytes() const; DML_BUFFER_BINDING GetBufferBinding() const; - explicit operator bool() const { return m_resource != nullptr; } + explicit operator bool() const { return first_valid_resource_ != nullptr; } // Creates a subregion at an offset from the start of this region. If no // size is provided the region runs to the end of the current region. - inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const + inline D3D12BufferRegion Subregion( + uint64_t offset, + uint64_t size_in_bytes = 0) const { // start of subregion must be within current region ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_); - size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; + size_in_bytes = + size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; // end of subregion must be within current region ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset); - return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource); + return D3D12BufferRegion( + offset_ + offset, + size_in_bytes, + resource_uav_state_, + resource_copy_src_state_, + resource_copy_dst_state_); } private: - ID3D12Resource* m_resource = nullptr; + ID3D12Resource* resource_uav_state_ = nullptr; + ID3D12Resource* resource_copy_src_state_ = nullptr; + ID3D12Resource* resource_copy_dst_state_ = nullptr; uint64_t offset_ = 0; uint64_t size_in_bytes_ = 0; + + // Pointer to the first resource above that isn't null. 
+ ID3D12Resource* first_valid_resource_ = nullptr; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index 862884c22b08c..af625334b7720 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -94,8 +94,10 @@ void DmlCommandRecorder::InitializeOperator( if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) || (temporaryResourceSize > 0)) { - auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); - m_currentCommandList->ResourceBarrier(1, &uav); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); } } @@ -154,8 +156,13 @@ void DmlCommandRecorder::ExecuteOperator( // Barrier all outputs. #pragma warning(push) #pragma warning(disable: 6387) - auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); - m_currentCommandList->ResourceBarrier(1, &uav); + + // Barrier all outputs. 
+ D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); + #pragma warning(pop) } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h index 4b9c167dfe671..f786cca837f06 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h @@ -12,6 +12,12 @@ namespace Dml // Committed resources use the same resource for all states and use barriers to transition between states ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); } + ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); } + ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); } + + D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } + D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } + D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } private: ComPtr m_d3d12Resource; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index dcf6b8607f319..cd6b241e70d48 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -103,7 +103,7 @@ namespace DmlGraphFusionHelper // The allocation is not pooled auto allocInfo = static_cast(opaqueData); *allocId = 0; - return D3D12BufferRegion(0, 
allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); + return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); } auto taggedPointer = TaggedPointer::Unpack(opaqueData); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index 0dc07384ea905..c82e0a4f5d722 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -400,7 +400,9 @@ namespace Dml return D3D12BufferRegion( taggedPointer.offset, size_in_bytes, - it->second->GetUavResource()); + it->second->GetUavResource(), + it->second->GetCopySrcResource(), + it->second->GetCopyDstResource()); } AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index 8049848c8671e..1d7c8704ab7da 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -60,7 +60,9 @@ namespace Dml // the ID3D12Resource is cached, so this call typically has a lower cost // than a call to ID3D12Device::CreatePlacedResource or // CreateReservedResource. 
- D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes); + D3D12BufferRegion CreateBufferRegion( + const TaggedPointer& taggedPointer, + uint64_t size_in_bytes); AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h index 22f8cbbdc394b..68feab568ca45 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h @@ -18,6 +18,12 @@ namespace Dml } ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); } + ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); } + ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); } + + D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } + D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; } + D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; } private: DmlHeapAllocation m_allocation; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h index 2b1a8e5c726dc..03e9f762b7eb4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h @@ -11,8 +11,12 @@ namespace Dml DmlResourceWrapper : public IUnknown { public: - // TODO (pavignol): Rename to GetResource() virtual ID3D12Resource* GetUavResource() const = 0; + virtual ID3D12Resource* GetCopySrcResource() const = 0; 
+ virtual ID3D12Resource* GetCopyDstResource() const = 0; + virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0; + virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0; + virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0; virtual ~DmlResourceWrapper(){} }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index b5492a1a86ea3..d6a46e354c769 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -100,6 +100,8 @@ namespace Dml barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState)); } + // Since this copy may write to GPU memory, we also need to perform an aliasing barrier + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); // Keep the intermediate buffer alive until we're done with it @@ -133,6 +135,7 @@ namespace Dml // Since this copy may write to GPU memory, we also need to perform an // aliasing barrier + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); */ } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index b06d23adf5886..f72035f5e5fda 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -443,8 +443,15 @@ namespace Dml // CPU -> GPU copy (upload) // auto dstBufferRegion = GetBufferForTensor(dst); - ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); - const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + + ID3D12Resource* dstData = 
dstBufferRegion.ResourceInCopyDstState() == nullptr + ? dstBufferRegion.ResourceInUavState() + : dstBufferRegion.ResourceInCopyDstState(); + + const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr + ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_DEST; + const uint64_t dstOffset = dstBufferRegion.Offset(); m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes)); FlushUploadsIfReady(); @@ -455,26 +462,47 @@ namespace Dml // GPU -> CPU copy (readback) // auto srcBufferRegion = GetBufferForTensor(src); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); + + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; + const uint64_t srcOffset = srcBufferRegion.Offset(); m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState); } else if (!src->IsCpuData() && !dst->IsCpuData()) { + printf("*****************DmlCommandRecorder::CopyBufferRegion\n"); + // // GPU -> GPU copy // auto srcBufferRegion = GetBufferForTensor(src); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - const uint64_t srcOffset = srcBufferRegion.Offset(); + + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); + + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; auto dstBufferRegion = GetBufferForTensor(dst); - ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); - const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - const uint64_t dstOffset = dstBufferRegion.Offset(); + ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr + ? dstBufferRegion.ResourceInUavState() + : dstBufferRegion.ResourceInCopyDstState(); + + const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr + ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_DEST; + + const uint64_t srcOffset = srcBufferRegion.Offset(); + const uint64_t dstOffset = dstBufferRegion.Offset(); m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes); } else @@ -496,6 +524,7 @@ namespace Dml // Source and destination for batched GPU -> CPU copies std::vector srcDatas; + std::vector srcStates; std::vector srcOffsets; std::vector dstDatas; std::vector dataSizesInBytes; @@ -528,16 +557,21 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(src[i]); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); + + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; srcDatas.push_back(srcData); + srcStates.push_back(srcState); srcOffsets.push_back(srcBufferRegion.Offset()); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); return S_OK; } @@ -907,6 +941,9 @@ namespace Dml std::vector srcDatas; srcDatas.reserve(src_dst_pairs.size()); + std::vector srcStates; + srcStates.reserve(src_dst_pairs.size()); + std::vector srcOffsets; srcOffsets.reserve(src_dst_pairs.size()); @@ -956,16 +993,21 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(&srcWrapper); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? srcBufferRegion.ResourceInUavState() + : srcBufferRegion.ResourceInCopySrcState(); + + const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr + ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS + : D3D12_RESOURCE_STATE_COPY_SOURCE; srcDatas.push_back(srcData); + srcStates.push_back(srcState); srcOffsets.push_back(srcBufferRegion.Offset()); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); return onnxruntime::common::Status::OK(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index b00b8f8e19f52..35955b113b2c1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -124,7 +124,9 @@ namespace Dml inputBufferRegions[i] = D3D12BufferRegion( 0, m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width, - m_nonOwnedGraphInputsFromInitializers[i].Get()); + m_nonOwnedGraphInputsFromInitializers[i].Get(), + nullptr, + nullptr); } else if (!m_isInputsUploadedByDmlEP[i]) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index dde290f0bce0f..1e7ebdc234c22 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1617,7 +1617,7 @@ namespace Windows::AI::MachineLearning::Adapter if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL) { auto allocInfo = static_cast(m_tensorData); - return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); + return 
Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); } auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData); @@ -1716,6 +1716,8 @@ namespace Windows::AI::MachineLearning::Adapter } } + // TODO (pavignol): Fix once we go back to a single resource + /* void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp) { if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) @@ -1767,9 +1769,9 @@ namespace Windows::AI::MachineLearning::Adapter } } - for (auto& tempBuffer : m_temporaryBuffers) + for (auto& tempAlloc : m_temporaryAbiAllocations) { - resourcesToTransition.push_back(tempBuffer.ResourceInUavState()); + resourcesToTransition.push_back(tempAlloc.Get()); } m_winmlProvider->TransitionResourcesForOperator( @@ -1778,6 +1780,7 @@ namespace Windows::AI::MachineLearning::Adapter resourcesToTransition.data()); } } + */ OpKernelContextWrapper::OpKernelContextWrapper( onnxruntime::OpKernelContext* context, @@ -1825,10 +1828,13 @@ namespace Windows::AI::MachineLearning::Adapter void OpKernelContextWrapper::Close() { + // TODO (pavignol): Fix once we go back to a single resource + /* if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) { TransitionResourcesForOperatorIfRequired(false); } + */ for (auto& tensors : m_inputTensors) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index 85b6b197fe511..647e0a17d26df 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -512,7 +512,9 @@ class OpKernelContextWrapper : public WRL::Base const auto& bluesteinZChirpParams = dftParams.BluesteinZChirpParams; // Get resources - auto inputBufferRegion = 
bluesteinZChirpParams.AFFTParams.ResourceLoopList.front().BufferRegion; - auto outputBufferRegion = bluesteinZChirpParams.AFFTInverseParams.ResourceLoopList[bluesteinZChirpParams.AFFTInverseParams.OutputIndex].BufferRegion; auto zChirpBufferRegion = bluesteinZChirpParams.ZChirp.BufferRegion; auto bBufferRegion = bluesteinZChirpParams.B.BufferRegion; - // Transition resources from common to UAV state - D3D12_RESOURCE_BARRIER barriers[2]; - - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - commandList->ResourceBarrier(2, barriers); - // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_bluesteinChirpRootSignature.Get()); commandList->SetPipelineState(m_bluesteinChirpPipelineState.Get()); @@ -783,21 +764,6 @@ class GpuDFTOperator : public WRL::Base chirpLength *= (m_isInverse ? 1 : -1); float scale = isInverse ? 
1.f / dftParams.DFTLength : 1.f; StockhamFFT(fft_params, true, chirpLength, scale, commandList); - - // Transition resources to common state - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - commandList->ResourceBarrier(2, barriers); } void StockhamFFT( @@ -813,27 +779,8 @@ class GpuDFTOperator : public WRL::Base const auto& loopList = stockhamParams.ResourceLoopList; // Get input and output resources - auto inputBufferRegion = loopList[0].BufferRegion; - auto outputBufferRegion = loopList[stockhamParams.OutputIndex].BufferRegion; auto windowBufferRegion = dftParams.StockhamParams.Window.BufferRegion; - // Transition resources from common to UAV state - D3D12_RESOURCE_BARRIER barriers[2]; - - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - commandList->ResourceBarrier(2, barriers); - // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_stockhamFFTRootSignature.Get()); commandList->SetPipelineState(m_stockhamFFTPipelineState.Get()); @@ -875,21 +822,6 @@ class GpuDFTOperator : public WRL::Base std::array uav_resources = { in, out, window }; Dispatch(uav_resources, constants, commandList); } - - // Transition resources to common state - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - barriers[1] = 
CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - commandList->ResourceBarrier(2, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ -906,14 +838,6 @@ class GpuDFTOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { - D3D12_RESOURCE_BARRIER uav_barriers[TSize]; - - std::transform( - bufferRegions.begin(), bufferRegions.end(), - uav_barriers, - [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); - commandList->ResourceBarrier(TSize, uav_barriers); - for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -962,7 +886,10 @@ class GpuDFTOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - commandList->ResourceBarrier(TSize, uav_barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h index 0611c4b7bf7f7..8863bd5362d27 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h @@ -683,29 +683,6 @@ class DmlGridSampleOperator : public WRL::Base Dml::GetDescendingPackedStrides(gridDims, gridStrides); Dml::GetDescendingPackedStrides(outputDims, outputStrides); - // Transition resources from common to UAV state - D3D12_RESOURCE_BARRIER barriers[3]; - - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - 
gridBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_COMMON, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS - ); - - commandList->ResourceBarrier(3, barriers); - // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get()); commandList->SetPipelineState(m_gridSamplePipelineState.Get()); @@ -727,26 +704,10 @@ class DmlGridSampleOperator : public WRL::Base std::array uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion }; Dispatch(uavBufferRegions, constants, commandList); - // Transition resources to common state - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - gridBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - D3D12_RESOURCE_STATE_COMMON - ); - - commandList->ResourceBarrier(3, barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ -763,14 +724,6 @@ class DmlGridSampleOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { - D3D12_RESOURCE_BARRIER uav_barriers[TSize]; - - std::transform( - bufferRegions.begin(), bufferRegions.end(), - uav_barriers, - [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); - commandList->ResourceBarrier(TSize, 
uav_barriers); - for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -819,7 +772,10 @@ class DmlGridSampleOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - commandList->ResourceBarrier(TSize, uav_barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h index 945b58965cf2f..cd1f78e2a23a6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h @@ -405,15 +405,9 @@ class DmlSTFTOperator : public WRL::Base std::array inputBindings; uint32_t inputBindingsCount = 1; - // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking - // barrierCount is outside the valid range. 
- D3D12_RESOURCE_BARRIER barriers[3]; - uint32_t barrierCount = 0; - Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal); inputBuffers[0] = signalBufferRegion.GetBufferBinding(); inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] }; - barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); Dml::D3D12BufferRegion windowBufferRegion; if (m_framingOperator.hasWindowTensor) @@ -421,7 +415,6 @@ class DmlSTFTOperator : public WRL::Base windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window); inputBuffers[1] = windowBufferRegion.GetBufferBinding(); inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] }; - barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); inputBindingsCount++; } @@ -429,7 +422,6 @@ class DmlSTFTOperator : public WRL::Base DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding(); DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer }; - barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); m_framingOperator.bindingTable->BindOutputs(1, &outputBinding); @@ -451,22 +443,16 @@ class DmlSTFTOperator : public WRL::Base m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc); } - // Transition resources COMMON -> UAV - commandList->ResourceBarrier(barrierCount, barriers); - m_framingOperator.commandRecorder->RecordDispatch( commandList, m_framingOperator.op.Get(), m_framingOperator.bindingTable.Get() ); - // Transition resources UAV -> COMMON - for (uint32_t barrierIndex = 0; barrierIndex < 
barrierCount; barrierIndex++) - { - std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter); - } - - commandList->ResourceBarrier(barrierCount, barriers); + D3D12_RESOURCE_BARRIER barriers[] = { + CD3DX12_RESOURCE_BARRIER::UAV(nullptr), + CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; + commandList->ResourceBarrier(2, barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp index 5bb04ba4d30b5..a91886c3b5863 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp @@ -106,7 +106,7 @@ namespace Dml gsl::span dstSizes, gsl::span src, gsl::span srcOffsets, - D3D12_RESOURCE_STATES srcState) + gsl::span srcStates) { assert(dst.size() == src.size()); assert(dstSizes.size() == src.size()); @@ -134,7 +134,7 @@ namespace Dml D3D12_RESOURCE_STATE_COPY_DEST, src[i], srcOffsets[i], - srcState, + srcStates[i], dstSizes[i]); offset += dstSizes[i]; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h index 4a65ce899d791..f888f0a55ac48 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h @@ -28,7 +28,7 @@ namespace Dml gsl::span dstSizes, gsl::span src, gsl::span srcOffsets, - D3D12_RESOURCE_STATES srcState); + gsl::span srcStates); private: void EnsureReadbackHeap(size_t size); From 57d2f46c7e55ad6cf067934a3f208abe955db67c Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 12 Jul 2023 12:59:47 -0700 Subject: [PATCH 45/76] Remove aliasing This reverts commit 587489d84711cdc47da2b2cda5952fe9bd2cb3c7. 
--- .../src/DmlAllocationInfo.h | 10 --- .../DmlExecutionProvider/src/DmlBuffer.cpp | 10 --- .../dml/DmlExecutionProvider/src/DmlBuffer.h | 3 +- .../src/DmlBufferRegion.cpp | 76 +++-------------- .../src/DmlBufferRegion.h | 34 ++------ .../src/DmlCommandRecorder.cpp | 15 +--- .../src/DmlCommittedResourceWrapper.h | 6 -- .../src/DmlGraphFusionHelper.cpp | 2 +- .../src/DmlReservedResourceSubAllocator.cpp | 4 +- .../src/DmlReservedResourceSubAllocator.h | 4 +- .../src/DmlReservedResourceWrapper.h | 6 -- .../src/DmlResourceWrapper.h | 6 +- .../src/ExecutionContext.cpp | 3 - .../src/ExecutionProvider.cpp | 78 +++++------------- .../src/FusedGraphKernel.cpp | 4 +- .../src/MLOperatorAuthorImpl.cpp | 12 +-- .../src/MLOperatorAuthorImpl.h | 4 +- .../src/Operators/DmlDFT.h | 81 ++++++++++++++++++- .../src/Operators/DmlGridSample.h | 60 ++++++++++++-- .../src/Operators/DmlSTFT.h | 22 ++++- .../DmlExecutionProvider/src/ReadbackHeap.cpp | 4 +- .../DmlExecutionProvider/src/ReadbackHeap.h | 2 +- 22 files changed, 201 insertions(+), 245 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h index 546a42342a2a0..7c11358bb106d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h @@ -36,16 +36,6 @@ namespace Dml return m_resourceWrapper->GetUavResource(); } - ID3D12Resource* GetCopySrcResource() const - { - return m_resourceWrapper->GetCopySrcResource(); - } - - ID3D12Resource* GetCopyDstResource() const - { - return m_resourceWrapper->GetCopyDstResource(); - } - ComPtr DetachResourceWrapper() const { return std::move(m_resourceWrapper); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp index c5fa576d24a0f..464ce26c16f54 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp @@ -48,16 +48,6 @@ ID3D12Resource* DmlBuffer::ResourceInUavState() const return buffer_region_.ResourceInUavState(); } -ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const -{ - return buffer_region_.ResourceInCopySrcState(); -} - -ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const -{ - return buffer_region_.ResourceInCopyDstState(); -} - uint64_t DmlBuffer::Offset() const { return buffer_region_ ? buffer_region_.Offset() : 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h index b98ae727e1a65..4b0dd58ce4467 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h @@ -27,9 +27,8 @@ class DmlBuffer DmlBuffer(DmlBuffer&&); DmlBuffer& operator=(DmlBuffer&&); + // TODO (pavignol): Rename to Resource() ID3D12Resource* ResourceInUavState() const; - ID3D12Resource* ResourceInCopySrcState() const; - ID3D12Resource* ResourceInCopyDstState() const; uint64_t Offset() const; uint64_t SizeInBytes() const; const D3D12BufferRegion& Region() const { return buffer_region_; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp index 3240042b5b6a6..c33cc5491c7f0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp @@ -7,114 +7,64 @@ namespace Dml { - D3D12BufferRegion::D3D12BufferRegion( - uint64_t offset, - uint64_t size_in_bytes, - ID3D12Resource* resource_uav_state, - ID3D12Resource* resource_copy_src_state, - ID3D12Resource* resource_copy_dst_state) - : resource_uav_state_(resource_uav_state), - 
resource_copy_src_state_(resource_copy_src_state), - resource_copy_dst_state_(resource_copy_dst_state), + D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource) + : m_resource(resource), offset_(offset), size_in_bytes_(size_in_bytes) { - // Get a raw pointer to the first non-null resource passed in. At least one - // resource must be provided. - first_valid_resource_ = resource_uav_state_; - if (!first_valid_resource_) - { - first_valid_resource_ = resource_copy_src_state_; - } - if (!first_valid_resource_) - { - first_valid_resource_ = resource_copy_dst_state_; - } - ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr); + ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr); // Regions cannot be empty. ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0); // Regions cannot extend beyond the size of the resource. - uint64_t buffer_size = first_valid_resource_->GetDesc().Width; + uint64_t buffer_size = m_resource->GetDesc().Width; ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size); ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset); // All three resources, if provided, must be identical aside from state. 
- assert( - first_valid_resource_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER); - assert( - !resource_uav_state || - (resource_uav_state->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_uav_state->GetDesc().Width == buffer_size)); - assert( - !resource_copy_src_state_ || - (resource_copy_src_state_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_copy_src_state_->GetDesc().Width == buffer_size)); - assert( - !resource_copy_dst_state_ || - (resource_copy_dst_state_->GetDesc().Dimension == - D3D12_RESOURCE_DIMENSION_BUFFER && - resource_copy_dst_state_->GetDesc().Width == buffer_size)); + assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER); + assert(m_resource->GetDesc().Width == buffer_size); } D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) { - std::swap(this->resource_uav_state_, that.resource_uav_state_); - std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); - std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); + std::swap(this->m_resource, that.m_resource); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, that.size_in_bytes_); - std::swap(this->first_valid_resource_, that.first_valid_resource_); } D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) { - std::swap(this->resource_uav_state_, that.resource_uav_state_); - std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_); - std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_); + std::swap(this->m_resource, that.m_resource); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, that.size_in_bytes_); - std::swap(this->first_valid_resource_, that.first_valid_resource_); return *this; } ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const { - return resource_uav_state_; - } - - ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const - { - return 
resource_copy_src_state_; - } - - ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const - { - return resource_copy_dst_state_; + return m_resource; } uint64_t D3D12BufferRegion::Offset() const { - return first_valid_resource_ ? offset_ : 0; + return m_resource ? offset_ : 0; } uint64_t D3D12BufferRegion::SizeInBytes() const { - return first_valid_resource_ ? size_in_bytes_ : 0; + return m_resource ? size_in_bytes_ : 0; } DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const { - if (!resource_uav_state_) + if (!m_resource) { return DML_BUFFER_BINDING{}; } - return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_}; + return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_}; } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h index dee01a29fe55f..6c5cb37297caa 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h @@ -19,61 +19,39 @@ namespace Dml D3D12BufferRegion( uint64_t offset, uint64_t size_in_bytes, - ID3D12Resource* resource_uav_state, - ID3D12Resource* resource_copy_src_state, - ID3D12Resource* resource_copy_dst_state); + ID3D12Resource* resource); // Move-only D3D12BufferRegion(const D3D12BufferRegion&) = default; D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default; D3D12BufferRegion(D3D12BufferRegion&&); D3D12BufferRegion& operator=(D3D12BufferRegion&&); - ID3D12Resource* ResourceInUavState() const; - // NOTE: may be any state that is valid as a copy source (COPY_SRC, - // GENERIC_READ, or COMMON). 
- ID3D12Resource* ResourceInCopySrcState() const; - - ID3D12Resource* ResourceInCopyDstState() const; - uint64_t Offset() const; uint64_t SizeInBytes() const; DML_BUFFER_BINDING GetBufferBinding() const; - explicit operator bool() const { return first_valid_resource_ != nullptr; } + explicit operator bool() const { return m_resource != nullptr; } // Creates a subregion at an offset from the start of this region. If no // size is provided the region runs to the end of the current region. - inline D3D12BufferRegion Subregion( - uint64_t offset, - uint64_t size_in_bytes = 0) const + inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const { // start of subregion must be within current region ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_); - size_in_bytes = - size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; + size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; // end of subregion must be within current region ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset); - return D3D12BufferRegion( - offset_ + offset, - size_in_bytes, - resource_uav_state_, - resource_copy_src_state_, - resource_copy_dst_state_); + return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource); } private: - ID3D12Resource* resource_uav_state_ = nullptr; - ID3D12Resource* resource_copy_src_state_ = nullptr; - ID3D12Resource* resource_copy_dst_state_ = nullptr; + ID3D12Resource* m_resource = nullptr; uint64_t offset_ = 0; uint64_t size_in_bytes_ = 0; - - // Pointer to the first resource above that isn't null. 
- ID3D12Resource* first_valid_resource_ = nullptr; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index af625334b7720..862884c22b08c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -94,10 +94,8 @@ void DmlCommandRecorder::InitializeOperator( if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) || (temporaryResourceSize > 0)) { - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); + auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); + m_currentCommandList->ResourceBarrier(1, &uav); } } @@ -156,13 +154,8 @@ void DmlCommandRecorder::ExecuteOperator( // Barrier all outputs. #pragma warning(push) #pragma warning(disable: 6387) - - // Barrier all outputs. 
- D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers); - + auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr); + m_currentCommandList->ResourceBarrier(1, &uav); #pragma warning(pop) } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h index f786cca837f06..4b9c167dfe671 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h @@ -12,12 +12,6 @@ namespace Dml // Committed resources use the same resource for all states and use barriers to transition between states ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); } - ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); } - ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); } - - D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } - D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } - D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } private: ComPtr m_d3d12Resource; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index cd6b241e70d48..dcf6b8607f319 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -103,7 +103,7 @@ namespace DmlGraphFusionHelper // The allocation is not pooled auto allocInfo 
= static_cast(opaqueData); *allocId = 0; - return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); + return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); } auto taggedPointer = TaggedPointer::Unpack(opaqueData); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index c82e0a4f5d722..0dc07384ea905 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -400,9 +400,7 @@ namespace Dml return D3D12BufferRegion( taggedPointer.offset, size_in_bytes, - it->second->GetUavResource(), - it->second->GetCopySrcResource(), - it->second->GetCopyDstResource()); + it->second->GetUavResource()); } AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index 1d7c8704ab7da..8049848c8671e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -60,9 +60,7 @@ namespace Dml // the ID3D12Resource is cached, so this call typically has a lower cost // than a call to ID3D12Device::CreatePlacedResource or // CreateReservedResource. 
- D3D12BufferRegion CreateBufferRegion( - const TaggedPointer& taggedPointer, - uint64_t size_in_bytes); + D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes); AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h index 68feab568ca45..22f8cbbdc394b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h @@ -18,12 +18,6 @@ namespace Dml } ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); } - ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); } - ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); } - - D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; } - D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; } - D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; } private: DmlHeapAllocation m_allocation; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h index 03e9f762b7eb4..2b1a8e5c726dc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h @@ -11,12 +11,8 @@ namespace Dml DmlResourceWrapper : public IUnknown { public: + // TODO (pavignol): Rename to GetResource() virtual ID3D12Resource* GetUavResource() const = 0; - virtual ID3D12Resource* GetCopySrcResource() const = 0; 
- virtual ID3D12Resource* GetCopyDstResource() const = 0; - virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0; - virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0; - virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0; virtual ~DmlResourceWrapper(){} }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index d6a46e354c769..b5492a1a86ea3 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -100,8 +100,6 @@ namespace Dml barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState)); } - // Since this copy may write to GPU memory, we also need to perform an aliasing barrier - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); // Keep the intermediate buffer alive until we're done with it @@ -135,7 +133,6 @@ namespace Dml // Since this copy may write to GPU memory, we also need to perform an // aliasing barrier - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); m_dmlRecorder.ResourceBarrier(barriers); */ } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index f72035f5e5fda..b06d23adf5886 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -443,15 +443,8 @@ namespace Dml // CPU -> GPU copy (upload) // auto dstBufferRegion = GetBufferForTensor(dst); - - ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? 
dstBufferRegion.ResourceInUavState() - : dstBufferRegion.ResourceInCopyDstState(); - - const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_DEST; - + ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); + const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t dstOffset = dstBufferRegion.Offset(); m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes)); FlushUploadsIfReady(); @@ -462,47 +455,26 @@ namespace Dml // GPU -> CPU copy (readback) // auto srcBufferRegion = GetBufferForTensor(src); - - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; - + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t srcOffset = srcBufferRegion.Offset(); m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState); } else if (!src->IsCpuData() && !dst->IsCpuData()) { - printf("*****************DmlCommandRecorder::CopyBufferRegion\n"); - // // GPU -> GPU copy // auto srcBufferRegion = GetBufferForTensor(src); - - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + const uint64_t srcOffset = srcBufferRegion.Offset(); auto dstBufferRegion = GetBufferForTensor(dst); - - ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? dstBufferRegion.ResourceInUavState() - : dstBufferRegion.ResourceInCopyDstState(); - - const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr - ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_DEST; - - const uint64_t srcOffset = srcBufferRegion.Offset(); + ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); + const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t dstOffset = dstBufferRegion.Offset(); + m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes); } else @@ -524,7 +496,6 @@ namespace Dml // Source and destination for batched GPU -> CPU copies std::vector srcDatas; - std::vector srcStates; std::vector srcOffsets; std::vector dstDatas; std::vector dataSizesInBytes; @@ -557,21 +528,16 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(src[i]); - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; srcDatas.push_back(srcData); - srcStates.push_back(srcState); srcOffsets.push_back(srcBufferRegion.Offset()); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); return S_OK; } @@ -941,9 +907,6 @@ namespace Dml std::vector srcDatas; srcDatas.reserve(src_dst_pairs.size()); - std::vector srcStates; - srcStates.reserve(src_dst_pairs.size()); - std::vector srcOffsets; srcOffsets.reserve(src_dst_pairs.size()); @@ -993,21 +956,16 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(&srcWrapper); - ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? srcBufferRegion.ResourceInUavState() - : srcBufferRegion.ResourceInCopySrcState(); - - const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr - ? 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : D3D12_RESOURCE_STATE_COPY_SOURCE; + ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; srcDatas.push_back(srcData); - srcStates.push_back(srcState); srcOffsets.push_back(srcBufferRegion.Offset()); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates); + const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); return onnxruntime::common::Status::OK(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index 35955b113b2c1..b00b8f8e19f52 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -124,9 +124,7 @@ namespace Dml inputBufferRegions[i] = D3D12BufferRegion( 0, m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width, - m_nonOwnedGraphInputsFromInitializers[i].Get(), - nullptr, - nullptr); + m_nonOwnedGraphInputsFromInitializers[i].Get()); } else if (!m_isInputsUploadedByDmlEP[i]) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 1e7ebdc234c22..dde290f0bce0f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1617,7 +1617,7 @@ namespace Windows::AI::MachineLearning::Adapter if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL) { auto allocInfo = static_cast(m_tensorData); - return 
Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr); + return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); } auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData); @@ -1716,8 +1716,6 @@ namespace Windows::AI::MachineLearning::Adapter } } - // TODO (pavignol): Fix once we go back to a single resource - /* void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp) { if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) @@ -1769,9 +1767,9 @@ namespace Windows::AI::MachineLearning::Adapter } } - for (auto& tempAlloc : m_temporaryAbiAllocations) + for (auto& tempBuffer : m_temporaryBuffers) { - resourcesToTransition.push_back(tempAlloc.Get()); + resourcesToTransition.push_back(tempBuffer.ResourceInUavState()); } m_winmlProvider->TransitionResourcesForOperator( @@ -1780,7 +1778,6 @@ namespace Windows::AI::MachineLearning::Adapter resourcesToTransition.data()); } } - */ OpKernelContextWrapper::OpKernelContextWrapper( onnxruntime::OpKernelContext* context, @@ -1828,13 +1825,10 @@ namespace Windows::AI::MachineLearning::Adapter void OpKernelContextWrapper::Close() { - // TODO (pavignol): Fix once we go back to a single resource - /* if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator)) { TransitionResourcesForOperatorIfRequired(false); } - */ for (auto& tensors : m_inputTensors) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index 647e0a17d26df..85b6b197fe511 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -512,9 +512,7 @@ class OpKernelContextWrapper : public WRL::Base const auto& bluesteinZChirpParams = 
dftParams.BluesteinZChirpParams; // Get resources + auto inputBufferRegion = bluesteinZChirpParams.AFFTParams.ResourceLoopList.front().BufferRegion; + auto outputBufferRegion = bluesteinZChirpParams.AFFTInverseParams.ResourceLoopList[bluesteinZChirpParams.AFFTInverseParams.OutputIndex].BufferRegion; auto zChirpBufferRegion = bluesteinZChirpParams.ZChirp.BufferRegion; auto bBufferRegion = bluesteinZChirpParams.B.BufferRegion; + // Transition resources from common to UAV state + D3D12_RESOURCE_BARRIER barriers[2]; + + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + commandList->ResourceBarrier(2, barriers); + // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_bluesteinChirpRootSignature.Get()); commandList->SetPipelineState(m_bluesteinChirpPipelineState.Get()); @@ -764,6 +783,21 @@ class GpuDFTOperator : public WRL::Base chirpLength *= (m_isInverse ? 1 : -1); float scale = isInverse ? 
1.f / dftParams.DFTLength : 1.f; StockhamFFT(fft_params, true, chirpLength, scale, commandList); + + // Transition resources to common state + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + commandList->ResourceBarrier(2, barriers); } void StockhamFFT( @@ -779,8 +813,27 @@ class GpuDFTOperator : public WRL::Base const auto& loopList = stockhamParams.ResourceLoopList; // Get input and output resources + auto inputBufferRegion = loopList[0].BufferRegion; + auto outputBufferRegion = loopList[stockhamParams.OutputIndex].BufferRegion; auto windowBufferRegion = dftParams.StockhamParams.Window.BufferRegion; + // Transition resources from common to UAV state + D3D12_RESOURCE_BARRIER barriers[2]; + + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + commandList->ResourceBarrier(2, barriers); + // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_stockhamFFTRootSignature.Get()); commandList->SetPipelineState(m_stockhamFFTPipelineState.Get()); @@ -822,6 +875,21 @@ class GpuDFTOperator : public WRL::Base std::array uav_resources = { in, out, window }; Dispatch(uav_resources, constants, commandList); } + + // Transition resources to common state + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[1] = 
CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + commandList->ResourceBarrier(2, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ -838,6 +906,14 @@ class GpuDFTOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { + D3D12_RESOURCE_BARRIER uav_barriers[TSize]; + + std::transform( + bufferRegions.begin(), bufferRegions.end(), + uav_barriers, + [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); + commandList->ResourceBarrier(TSize, uav_barriers); + for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -886,10 +962,7 @@ class GpuDFTOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + commandList->ResourceBarrier(TSize, uav_barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h index 8863bd5362d27..0611c4b7bf7f7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h @@ -683,6 +683,29 @@ class DmlGridSampleOperator : public WRL::Base Dml::GetDescendingPackedStrides(gridDims, gridStrides); Dml::GetDescendingPackedStrides(outputDims, outputStrides); + // Transition resources from common to UAV state + D3D12_RESOURCE_BARRIER barriers[3]; + + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + 
gridBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS + ); + + commandList->ResourceBarrier(3, barriers); + // Set the root signature and pipeline state commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get()); commandList->SetPipelineState(m_gridSamplePipelineState.Get()); @@ -704,10 +727,26 @@ class DmlGridSampleOperator : public WRL::Base std::array uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion }; Dispatch(uavBufferRegions, constants, commandList); - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + // Transition resources to common state + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( + inputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( + gridBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( + outputBufferRegion.ResourceInUavState(), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_COMMON + ); + + commandList->ResourceBarrier(3, barriers); } std::vector GetTensorDimensions(IMLOperatorTensor* tensor) @@ -724,6 +763,14 @@ class DmlGridSampleOperator : public WRL::Base TConstants& constants, ID3D12GraphicsCommandList* commandList) { + D3D12_RESOURCE_BARRIER uav_barriers[TSize]; + + std::transform( + bufferRegions.begin(), bufferRegions.end(), + uav_barriers, + [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); + commandList->ResourceBarrier(TSize, 
uav_barriers); + for (uint32_t i = 0; i < TSize; i++) { // Set resource views @@ -772,10 +819,7 @@ class DmlGridSampleOperator : public WRL::Base commandList->Dispatch(dispatchSizeX, 1, 1); } - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + commandList->ResourceBarrier(TSize, uav_barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h index cd1f78e2a23a6..945b58965cf2f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h @@ -405,9 +405,15 @@ class DmlSTFTOperator : public WRL::Base std::array inputBindings; uint32_t inputBindingsCount = 1; + // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking + // barrierCount is outside the valid range. 
+ D3D12_RESOURCE_BARRIER barriers[3]; + uint32_t barrierCount = 0; + Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal); inputBuffers[0] = signalBufferRegion.GetBufferBinding(); inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] }; + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); Dml::D3D12BufferRegion windowBufferRegion; if (m_framingOperator.hasWindowTensor) @@ -415,6 +421,7 @@ class DmlSTFTOperator : public WRL::Base windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window); inputBuffers[1] = windowBufferRegion.GetBufferBinding(); inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] }; + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); inputBindingsCount++; } @@ -422,6 +429,7 @@ class DmlSTFTOperator : public WRL::Base DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding(); DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer }; + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); m_framingOperator.bindingTable->BindOutputs(1, &outputBinding); @@ -443,16 +451,22 @@ class DmlSTFTOperator : public WRL::Base m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc); } + // Transition resources COMMON -> UAV + commandList->ResourceBarrier(barrierCount, barriers); + m_framingOperator.commandRecorder->RecordDispatch( commandList, m_framingOperator.op.Get(), m_framingOperator.bindingTable.Get() ); - D3D12_RESOURCE_BARRIER barriers[] = { - CD3DX12_RESOURCE_BARRIER::UAV(nullptr), - 
CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)}; - commandList->ResourceBarrier(2, barriers); + // Transition resources UAV -> COMMON + for (uint32_t barrierIndex = 0; barrierIndex < barrierCount; barrierIndex++) + { + std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter); + } + + commandList->ResourceBarrier(barrierCount, barriers); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp index a91886c3b5863..5bb04ba4d30b5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp @@ -106,7 +106,7 @@ namespace Dml gsl::span dstSizes, gsl::span src, gsl::span srcOffsets, - gsl::span srcStates) + D3D12_RESOURCE_STATES srcState) { assert(dst.size() == src.size()); assert(dstSizes.size() == src.size()); @@ -134,7 +134,7 @@ namespace Dml D3D12_RESOURCE_STATE_COPY_DEST, src[i], srcOffsets[i], - srcStates[i], + srcState, dstSizes[i]); offset += dstSizes[i]; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h index f888f0a55ac48..4a65ce899d791 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h @@ -28,7 +28,7 @@ namespace Dml gsl::span dstSizes, gsl::span src, gsl::span srcOffsets, - gsl::span srcStates); + D3D12_RESOURCE_STATES srcState); private: void EnsureReadbackHeap(size_t size); From 7440e74dd34f5841b0860f42f4a0f02c0ca01e98 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 12 Jul 2023 13:23:44 -0700 Subject: [PATCH 46/76] Fix mish test failure --- .../src/DmlHeapAllocation.h | 11 -- .../src/DmlReservedResourceSubAllocator.cpp | 107 ++++------- .../src/ExecutionContext.cpp | 169 
+++++++++--------- 3 files changed, 120 insertions(+), 167 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h index 6de78a47b6d8b..ab75b7d322120 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h @@ -13,17 +13,6 @@ namespace Dml // an allocation may comprise multiple heaps. If tiling is not supported // an allocation will only have a single heap. std::vector> heaps; - - // Resources created over this allocation's heaps. All three resources - // are identical aside from being fixed in a single resource state: UAV, - // COPY_SRC, and COPY_DST respectively. The purpose of duplicate - // resources is to enable overlapping resources in different states for - // copying data. Most callers will not (and should not) interact - // directly with these resources; all three are wrapped by the buffer - // regions returned from this allocator, and the appropriate resource - // will be used automatically when performing buffer copies. Microsoft::WRL::ComPtr resource_uav_state; - Microsoft::WRL::ComPtr resource_copy_src_state; - Microsoft::WRL::ComPtr resource_copy_dst_state; }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index 0dc07384ea905..bc9a0b15e86fe 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -95,41 +95,24 @@ namespace Dml // The allocation may be larger than the requested size to ensure a whole // number of tiles. 
- const uint64_t resource_size_in_tiles = - 1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - const uint64_t resource_size_in_bytes = - resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - auto resource_desc = - CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_); + const uint64_t resource_size_in_tiles = 1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + const uint64_t resource_size_in_bytes = resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + auto resource_desc = CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_); - ID3D12Resource** resources[] = { - &allocation.resource_uav_state, - &allocation.resource_copy_src_state, - &allocation.resource_copy_dst_state}; - - D3D12_RESOURCE_STATES states[] = { + HRESULT create_resource_hr = m_device->CreateReservedResource( + &resource_desc, initial_state_, - D3D12_RESOURCE_STATE_COPY_SOURCE, - D3D12_RESOURCE_STATE_COPY_DEST}; + nullptr, + IID_PPV_ARGS(&allocation.resource_uav_state)); - for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++) + if (create_resource_hr == E_OUTOFMEMORY) { - HRESULT create_resource_hr = m_device->CreateReservedResource( - &resource_desc, - states[i], - nullptr, - IID_PPV_ARGS(resources[i])); - - if (create_resource_hr == E_OUTOFMEMORY) - { - return absl::nullopt; - } - ORT_THROW_IF_FAILED(create_resource_hr); + return absl::nullopt; } + ORT_THROW_IF_FAILED(create_resource_hr); // Reserve enough heaps to store all tiles in the resource. - const uint64_t heap_count = - 1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_; + const uint64_t heap_count = 1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_; allocation.heaps.resize(heap_count); // Create heaps and map them to the primary reserved resource. @@ -175,28 +158,17 @@ namespace Dml // guaranteed to be set (on the GPU timeline) by the time any code can // reference the returned resource. 
We only execute operations on a // single hardware queue so there is no need to wait or signal. - // - // All resources have identical tile mappings. The repeated call to - // UpdateTileMappings on all resources instead of using CopyTileMappings - // is intentional: the latter API is not supported by all versions of - // PIX. - for (auto resource : - {allocation.resource_uav_state.Get(), - allocation.resource_copy_src_state.Get(), - allocation.resource_copy_dst_state.Get()}) - { - queue_->UpdateTileMappings( - resource, - numResourceRegions, - &resource_region_start_coordinates, - &resource_region_size, - allocation.heaps[i].Get(), - numHeapRanges, - &tile_range_flags, - &heap_range_start_offset, - &heap_range_tile_count, - D3D12_TILE_MAPPING_FLAG_NONE); - } + queue_->UpdateTileMappings( + allocation.resource_uav_state.Get(), + numResourceRegions, + &resource_region_start_coordinates, + &resource_region_size, + allocation.heaps[i].Get(), + numHeapRanges, + &tile_range_flags, + &heap_range_start_offset, + &heap_range_tile_count, + D3D12_TILE_MAPPING_FLAG_NONE); resource_region_start_coordinates.X += static_cast(heap_size_in_tiles); unmapped_resource_tiles -= heap_size_in_tiles; @@ -225,33 +197,20 @@ namespace Dml } // Create large placed resource that spans the heap. 
- D3D12_RESOURCE_DESC resource_desc = - CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_); - - ID3D12Resource** resources[] = { - &allocation.resource_uav_state, - &allocation.resource_copy_src_state, - &allocation.resource_copy_dst_state}; - D3D12_RESOURCE_STATES states[] = { - initial_state_, - D3D12_RESOURCE_STATE_COPY_SOURCE, - D3D12_RESOURCE_STATE_COPY_DEST}; + D3D12_RESOURCE_DESC resource_desc = CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_); - for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++) + HRESULT create_resource_hr = m_device->CreatePlacedResource( + allocation.heaps.front().Get(), + 0, + &resource_desc, + initial_state_, + nullptr, + IID_PPV_ARGS(&allocation.resource_uav_state)); + if (create_resource_hr == E_OUTOFMEMORY) { - HRESULT create_resource_hr = m_device->CreatePlacedResource( - allocation.heaps.front().Get(), - 0, - &resource_desc, - states[i], - nullptr, - IID_PPV_ARGS(resources[i])); - if (create_resource_hr == E_OUTOFMEMORY) - { - return absl::nullopt; - } - ORT_THROW_IF_FAILED(create_resource_hr); + return absl::nullopt; } + ORT_THROW_IF_FAILED(create_resource_hr); return allocation; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index b5492a1a86ea3..9a180f64e49db 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -37,104 +37,109 @@ namespace Dml SetCommandRecorder(&m_dmlRecorder); - // This type of copy is not common and is only used in rare circumstances. Because a resource - // cannot be both in a source and destination state at the same time (without aliasing), we copy - // the source resource to an intermediate one, and then copy the intermediate resource to the - // destination resource. 
- // TODO (pavignol): Only do the intermediate copy when both resources at the same - - D3D12_HEAP_PROPERTIES heapProperties = { - D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; - - D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER, - 0, - byteCount, - 1, - 1, - 1, - DXGI_FORMAT_UNKNOWN, - {1, 0}, - D3D12_TEXTURE_LAYOUT_ROW_MAJOR, - D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS}; - - ComPtr intermediateBuffer; - ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommittedResource( - &heapProperties, - D3D12_HEAP_FLAG_NONE, - &resourceDesc, - D3D12_RESOURCE_STATE_COPY_DEST, - nullptr, - IID_GRAPHICS_PPV_ARGS(intermediateBuffer.GetAddressOf()))); - - std::vector barriers; - - if (!(srcState & D3D12_RESOURCE_STATE_COPY_SOURCE)) + if (dstBuffer == srcBuffer) { - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(srcBuffer, srcState, D3D12_RESOURCE_STATE_COPY_SOURCE)); - m_dmlRecorder.ResourceBarrier(barriers); - } - - m_dmlRecorder.CopyBufferRegion(intermediateBuffer.Get(), 0, srcBuffer, srcOffset, byteCount); - - // Reset src barrier state - for (auto& barrier : barriers) - { - std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter); - } - - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(intermediateBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE)); + // This type of copy is not common and is only used in rare circumstances. Because a resource + // cannot be both in a source and destination state at the same time (without aliasing), we copy + // the source resource to an intermediate one, and then copy the intermediate resource to the + // destination resource. 
+ // TODO (pavignol): Only do the intermediate copy when both resources at the same + + D3D12_HEAP_PROPERTIES heapProperties = { + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + + D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER, + 0, + byteCount, + 1, + 1, + 1, + DXGI_FORMAT_UNKNOWN, + {1, 0}, + D3D12_TEXTURE_LAYOUT_ROW_MAJOR, + D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS}; + + ComPtr intermediateBuffer; + ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommittedResource( + &heapProperties, + D3D12_HEAP_FLAG_NONE, + &resourceDesc, + D3D12_RESOURCE_STATE_COPY_DEST, + nullptr, + IID_GRAPHICS_PPV_ARGS(intermediateBuffer.GetAddressOf()))); + + std::vector barriers; + + if (!(srcState & D3D12_RESOURCE_STATE_COPY_SOURCE)) + { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(srcBuffer, srcState, D3D12_RESOURCE_STATE_COPY_SOURCE)); + m_dmlRecorder.ResourceBarrier(barriers); + } - if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST)) - { - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, dstState, D3D12_RESOURCE_STATE_COPY_DEST)); - } + m_dmlRecorder.CopyBufferRegion(intermediateBuffer.Get(), 0, srcBuffer, srcOffset, byteCount); - m_dmlRecorder.ResourceBarrier(barriers); - m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, intermediateBuffer.Get(), 0, byteCount); + // Reset src barrier state + for (auto& barrier : barriers) + { + std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter); + } - barriers.clear(); + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(intermediateBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE)); - // Reset dst barrier state - if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST)) - { - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState)); - } + if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST)) + { + 
barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, dstState, D3D12_RESOURCE_STATE_COPY_DEST)); + } - m_dmlRecorder.ResourceBarrier(barriers); + m_dmlRecorder.ResourceBarrier(barriers); + m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, intermediateBuffer.Get(), 0, byteCount); - // Keep the intermediate buffer alive until we're done with it - QueueReference(intermediateBuffer.Get()); + barriers.clear(); + // Reset dst barrier state + if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST)) + { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState)); + } -/* - std::vector barriers; + // Since this copy may write to GPU memory, we also need to perform an aliasing barrier + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); + m_dmlRecorder.ResourceBarrier(barriers); - if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST)) - { - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, dstState, D3D12_RESOURCE_STATE_COPY_DEST)); + // Keep the intermediate buffer alive until we're done with it + QueueReference(intermediateBuffer.Get()); } - if (!(srcState & D3D12_RESOURCE_STATE_COPY_SOURCE)) + else { - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(srcBuffer, srcState, D3D12_RESOURCE_STATE_COPY_SOURCE)); - } + std::vector barriers; - if (!barriers.empty()) - { - m_dmlRecorder.ResourceBarrier(barriers); - } + if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST)) + { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, dstState, D3D12_RESOURCE_STATE_COPY_DEST)); + } + if (!(srcState & D3D12_RESOURCE_STATE_COPY_SOURCE)) + { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(srcBuffer, srcState, D3D12_RESOURCE_STATE_COPY_SOURCE)); + } - m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount); + if (!barriers.empty()) + { + m_dmlRecorder.ResourceBarrier(barriers); + } - // Reset barrier state - for (auto& barrier : barriers) 
- { - std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter); - } + m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount); - // Since this copy may write to GPU memory, we also need to perform an - // aliasing barrier - m_dmlRecorder.ResourceBarrier(barriers); -*/ + // Reset barrier state + for (auto& barrier : barriers) + { + std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter); + } + + // Since this copy may write to GPU memory, we also need to perform an + // aliasing barrier + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); + m_dmlRecorder.ResourceBarrier(barriers); + } } void ExecutionContext::FillBufferWithPattern( From b2e65fc3479883a8a5371ac9b13d14b4a9343fec Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 12 Jul 2023 15:31:33 -0700 Subject: [PATCH 47/76] Remove rest of Aliasing --- .../src/ExecutionContext.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index 9a180f64e49db..4ff464e0eef42 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -94,18 +94,14 @@ namespace Dml m_dmlRecorder.ResourceBarrier(barriers); m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, intermediateBuffer.Get(), 0, byteCount); - barriers.clear(); - // Reset dst barrier state if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST)) { + barriers.clear(); barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState)); + m_dmlRecorder.ResourceBarrier(barriers); } - // Since this copy may write to GPU memory, we also need to perform an aliasing barrier - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); - 
m_dmlRecorder.ResourceBarrier(barriers); - // Keep the intermediate buffer alive until we're done with it QueueReference(intermediateBuffer.Get()); } @@ -135,10 +131,10 @@ namespace Dml std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter); } - // Since this copy may write to GPU memory, we also need to perform an - // aliasing barrier - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)); - m_dmlRecorder.ResourceBarrier(barriers); + if (!barriers.empty()) + { + m_dmlRecorder.ResourceBarrier(barriers); + } } } From d5be4f14154583497d0f33b9c97b61f90ab96219 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 12 Jul 2023 23:46:59 -0700 Subject: [PATCH 48/76] Add BFC allocator --- .../inc/DmlExecutionProvider.h | 3 +- .../inc/IWinmlExecutionProvider.h | 6 +- .../src/AbiCustomRegistry.cpp | 6 + .../src/AbiCustomRegistry.h | 12 +- .../src/BucketizedBufferAllocator.cpp | 250 ++++++++++++++++++ .../src/BucketizedBufferAllocator.h | 90 +++++++ .../src/DmlAllocationInfo.cpp | 3 +- .../src/DmlAllocationInfo.h | 20 +- .../DmlExecutionProvider/src/DmlBuffer.cpp | 3 +- .../src/DmlCommittedResourceWrapper.h | 2 +- .../src/DmlGpuAllocator.cpp | 83 +++++- .../src/DmlGpuAllocator.h | 26 +- .../src/DmlGraphFusionHelper.cpp | 7 +- .../src/DmlReservedResourceSubAllocator.cpp | 23 +- .../src/DmlReservedResourceSubAllocator.h | 11 +- .../src/DmlReservedResourceWrapper.h | 2 +- .../src/DmlResourceWrapper.h | 2 +- .../src/DmlSubAllocator.h | 15 ++ .../src/ExecutionProvider.cpp | 41 ++- .../src/ExecutionProvider.h | 14 +- .../src/MLOperatorAuthorImpl.cpp | 5 +- .../MLOperatorAuthorPrivate.h | 2 + .../providers/dml/dml_provider_factory.cc | 6 +- 23 files changed, 563 insertions(+), 69 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h create mode 100644 
onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index 5f9c4394e740f..ef012855770f3 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -31,10 +31,11 @@ namespace Dml ID3D12CommandQueue* commandQueue, bool enableMetacommands = true); - D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer, uint64_t size_in_bytes); + D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t size_in_bytes); void FlushContext(onnxruntime::IExecutionProvider* provider); void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode); void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider); + void DisableBfcAllocator(onnxruntime::IExecutionProvider* provider); onnxruntime::common::Status CopyTensor( onnxruntime::IExecutionProvider* provider, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index 0daee39da337b..9f4ec88083b7c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -40,11 +40,9 @@ namespace Windows::AI::MachineLearning::Adapter // the provider's underlying queues. 
virtual void QueueReference(IUnknown *object) = 0; - virtual Dml::D3D12BufferRegion GetBufferRegion(const Dml::TaggedPointer& taggedPointer, uint64_t size) const = 0; + virtual Dml::D3D12BufferRegion GetBufferRegion(void* opaquePointer, uint64_t size) const = 0; - virtual uint64_t TryGetPooledAllocationId( - const Dml::TaggedPointer& taggedPointer, - bool isInternalOperator) = 0; + virtual uint64_t GetUniqueId(void* opaquePointer) = 0; virtual void GetABIExecutionInterfaceAndInvalidateState( bool isInternalOperator, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp index ede3e7f2c2257..d3de2fbfe31d3 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp @@ -561,6 +561,7 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel( // // For backward compatibility, this does not propagate errors for external operators static_cast(m_kernelRegistry->RegisterCustomKernel(create_info)); // ignore result + m_hasExternalOperators = true; } return S_OK; @@ -568,4 +569,9 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel( ORT_CATCH_RETURN } +bool STDMETHODCALLTYPE AbiCustomRegistry::HasExternalOperators() const noexcept +{ + return m_hasExternalOperators; +} + } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.h index d6b1448b559b1..926eb02b44918 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.h @@ -15,7 +15,7 @@ namespace WRL } namespace Windows::AI::MachineLearning::Adapter -{ +{ using namespace Microsoft::WRL; @@ -49,6 +49,8 @@ class AbiCustomRegistry : public WRL::Base> 
GetRegistries() { std::list> registries; @@ -56,7 +58,7 @@ class AbiCustomRegistry : public WRL::Base m_kernelRegistry; @@ -107,6 +109,8 @@ class AbiCustomRegistry : public WRL::Base m_internalRegInfoMap; + mutable bool m_hasExternalOperators = false; + }; } // namespace Windows::AI::MachineLearning::Adapter diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp new file mode 100644 index 0000000000000..c071e4bf0b8d3 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -0,0 +1,250 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "precomp.h" + +#include "core/session/onnxruntime_c_api.h" + +#include "BucketizedBufferAllocator.h" +#include "DmlAllocationInfo.h" +#include "DmlCommittedResourceWrapper.h" +// #define PRINT_OUTSTANDING_ALLOCATIONS + +namespace Dml +{ + BucketizedBufferAllocator::~BucketizedBufferAllocator() + { +#ifdef PRINT_OUTSTANDING_ALLOCATIONS + if (!m_outstandingAllocationsById.empty()) + { + printf("BucketizedBufferAllocator outstanding allocation indices:\n"); + for (auto& entry : m_outstandingAllocationsById) + { + printf("%u\n", static_cast(entry.first)); + } + printf("\n"); + } +#endif + } + + BucketizedBufferAllocator::BucketizedBufferAllocator( + ID3D12Device* device, + std::shared_ptr context, + const D3D12_HEAP_PROPERTIES& heapProps, + D3D12_HEAP_FLAGS heapFlags, + D3D12_RESOURCE_FLAGS resourceFlags, + D3D12_RESOURCE_STATES initialState) + : onnxruntime::IAllocator( + OrtMemoryInfo( + "DML", + OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0) + ) + ), + m_device(device), + m_heapProperties(heapProps), + m_heapFlags(heapFlags), + m_resourceFlags(resourceFlags), + m_initialState(initialState), + m_context(context) + { + } + + /*static*/ gsl::index 
BucketizedBufferAllocator::GetBucketIndexFromSize(uint64_t size) + { + assert(size != 0); + + // Each bucket is twice as large as the previous one, in ascending order + gsl::index index = static_cast(ceil(log2(size))); + assert((1ull << index) >= size); // This must be true unless there were some strange rounding issues + + // The smallest bucket is 2^n bytes large, where n = c_minResourceSizeExponent + index = std::max(index, c_minResourceSizeExponent); + index -= c_minResourceSizeExponent; + + return index; + } + + /*static*/ uint64_t BucketizedBufferAllocator::GetBucketSizeFromIndex(gsl::index index) + { + return (1ull << (index + c_minResourceSizeExponent)); + } + + ComPtr BucketizedBufferAllocator::AllocCommittedResource(size_t size) + { + ComPtr resource; + auto buffer = CD3DX12_RESOURCE_DESC::Buffer(size, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + ORT_THROW_IF_FAILED(m_device->CreateCommittedResource( + &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), + D3D12_HEAP_FLAG_NONE, + &buffer, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + nullptr, + IID_GRAPHICS_PPV_ARGS(resource.GetAddressOf()) + )); + + ComPtr resourceWrapper; + wil::MakeOrThrow(std::move(resource)).As(&resourceWrapper); + return resourceWrapper; + } + + AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(void* opaquePointer) + { + return static_cast(opaquePointer); + } + + D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) const + { + auto allocationInfo = static_cast(opaquePointer); + return D3D12BufferRegion(0, size_in_bytes, allocationInfo->GetD3D12Resource()); + } + + void* BucketizedBufferAllocator::Alloc(size_t size) + { + // For some reason lotus likes requesting 0 bytes of memory + size = std::max(1, size); + + ComPtr resourceWrapper; + uint64_t resourceId = 0; + uint64_t bucketSize = 0; + + // Use a pooled resource if the size (post rounding, if requested) matches a bucket size + if (m_defaultRoundingMode == 
AllocatorRoundingMode::Enabled || size == GetBucketSizeFromIndex(GetBucketIndexFromSize(size))) + { + Bucket* bucket = nullptr; + + // Find the bucket for this allocation size + gsl::index bucketIndex = GetBucketIndexFromSize(size); + + if (gsl::narrow_cast(m_pool.size()) <= bucketIndex) + { + // Ensure there are sufficient buckets + m_pool.resize(bucketIndex + 1); + } + + bucket = &m_pool[bucketIndex]; + bucketSize = GetBucketSizeFromIndex(bucketIndex); + + if (bucket->resources.empty()) + { + // No more resources in this bucket - allocate a new one + resourceWrapper = AllocCommittedResource(onnxruntime::narrow(bucketSize)); + resourceId = ++m_currentResourceId; + } + else + { + // Retrieve a resource from the bucket + resourceWrapper = std::move(bucket->resources.back().resource); + resourceId = bucket->resources.back().resourceId; + bucket->resources.pop_back(); + } + } + else + { + // The allocation will not be pooled. Construct a new one + bucketSize = (size + 3) & ~3; + resourceWrapper = AllocCommittedResource(onnxruntime::narrow(bucketSize)); + resourceId = ++m_currentResourceId; + } + + assert(resourceWrapper->GetD3D12Resource()->GetDesc().Width == bucketSize); + assert(resourceWrapper != nullptr); + + ComPtr allocInfo = wil::MakeOrThrow( + this, + ++m_currentAllocationId, + resourceId, + resourceWrapper.Get(), + size + ); + + #if _DEBUG + m_outstandingAllocationsById[allocInfo->GetId()] = allocInfo.Get(); + #endif + + return allocInfo.Detach(); + } + + void BucketizedBufferAllocator::Free(void* p) + { + // Release Lotus's reference on the allocation. 
The allocation + // also inherits IUnknown, and once its final reference reaches zero + // it will call FreeResource + ComPtr allocInfo; + allocInfo.Attach(static_cast(p)); + } + + uint64_t BucketizedBufferAllocator::GetUniqueId(void* opaquePointer) + { + const auto* allocInfo = static_cast(opaquePointer); + return allocInfo->GetPooledResourceId(); + } + + void BucketizedBufferAllocator::FreeResource(AllocationInfo* allocInfo, uint64_t pooledResourceId) + { + assert(allocInfo != nullptr); // Can't free nullptr + + if (allocInfo->GetOwner() != this) + { + // This allocation doesn't belong to this allocator! + ORT_THROW_HR(E_INVALIDARG); + } + + // Free the resource to the pool if its size matches a bucket size + gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize()); + if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetD3D12Resource()->GetDesc().Width) + { + assert(gsl::narrow_cast(m_pool.size()) > bucketIndex); + + // Return the resource to the bucket + Bucket* bucket = &m_pool[bucketIndex]; + + Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId}; + bucket->resources.push_back(resource); + } + else + { + // Free the underlying allocation once queued work has completed. +#ifdef _GAMING_XBOX + m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetD3D12Resource()).Get()); +#else + m_context->QueueReference(allocInfo->GetD3D12Resource()); +#endif + allocInfo->DetachResourceWrapper(); + } + + #if _DEBUG + assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo); + m_outstandingAllocationsById.erase(allocInfo->GetId()); + #endif + + // The allocation info is already destructing at this point + } + + + const AllocationInfo* BucketizedBufferAllocator::DecodeDataHandle(const void* opaqueHandle) + { + if (opaqueHandle == nullptr) + { + // There is no memory allocated which needs to be decoded. 
+ ORT_THROW_HR(E_INVALIDARG); + } + const auto* allocInfo = static_cast(opaqueHandle); + + auto owner = allocInfo->GetOwner(); + //The owner can be null if the resource was wrapped via CreateGPUAllocationFromD3DResource + if (owner != nullptr && owner != this) + { + // This allocation doesn't belong to this allocator! + ORT_THROW_HR(E_INVALIDARG); + } + + return allocInfo; + } + + void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) + { + m_defaultRoundingMode = roundingMode; + } + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h new file mode 100644 index 0000000000000..f0fc570d4e1c4 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -0,0 +1,90 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/framework/allocator.h" +#include "ExecutionContext.h" +#include "DmlResourceWrapper.h" +#include "DmlSubAllocator.h" + +namespace Dml +{ + class BucketizedBufferAllocator; + + // Implements a Lotus allocator for D3D12 heap buffers, using a bucket allocation strategy. The allocator + // maintains a set of fixed-size buckets, with each bucket containing one or more D3D12 buffers of that fixed size. + // All requested allocation sizes are rounded up to the nearest bucket size, which ensures minimal fragmentation + // while providing an upper bound on the amount of memory "wasted" with each allocation. + class BucketizedBufferAllocator : public onnxruntime::IAllocator, public DmlSubAllocator + { + public: + ~BucketizedBufferAllocator(); + + // Constructs a BucketizedBufferAllocator which allocates D3D12 committed resources with the specified heap properties, + // resource flags, and initial resource state. 
+ BucketizedBufferAllocator( + ID3D12Device* device, + std::shared_ptr context, + const D3D12_HEAP_PROPERTIES& heapProps, + D3D12_HEAP_FLAGS heapFlags, + D3D12_RESOURCE_FLAGS resourceFlags, + D3D12_RESOURCE_STATES initialState); + + ComPtr AllocCommittedResource(size_t size); + + // Returns the information associated with an opaque allocation handle returned by IAllocator::Alloc. + const AllocationInfo* DecodeDataHandle(const void* opaqueHandle); + + void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode); + + AllocationInfo* GetAllocationInfo(void* opaquePointer); + D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) const; + uint64_t GetUniqueId(void* opaquePointer); + + public: // onnxruntime::IAllocator + void* Alloc(size_t size) final; + void Free(void* p) final; + + private: + static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB + + // The pool consists of a number of buckets, and each bucket contains a number of resources of the same size. + // The resources in each bucket are always sized as a power of two, and each bucket contains resources twice + // as large as the previous bucket. 
+ struct Resource + { + ComPtr resource; + uint64_t resourceId; + }; + + struct Bucket + { + std::vector resources; + }; + + static gsl::index GetBucketIndexFromSize(uint64_t size); + static uint64_t GetBucketSizeFromIndex(gsl::index index); + + friend class AllocationInfo; + void FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) final; + + ComPtr m_device; + D3D12_HEAP_PROPERTIES m_heapProperties; + D3D12_HEAP_FLAGS m_heapFlags; + D3D12_RESOURCE_FLAGS m_resourceFlags; + D3D12_RESOURCE_STATES m_initialState; + + std::vector m_pool; + size_t m_currentAllocationId = 0; + uint64_t m_currentResourceId = 0; + AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled; + std::shared_ptr m_context; + + #if _DEBUG + // Useful for debugging; keeps track of all allocations that haven't been freed yet + std::map m_outstandingAllocationsById; + #endif + }; + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp index 52944d2c8b96a..5db1289778819 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp @@ -4,6 +4,7 @@ #include "precomp.h" #include "DmlAllocationInfo.h" #include "DmlReservedResourceSubAllocator.h" +#include "DmlSubAllocator.h" namespace Dml { @@ -12,7 +13,7 @@ namespace Dml { if (m_owner) { - m_owner->FreeResource(this); + m_owner->FreeResource(this, m_pooledResourceId); } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h index 7c11358bb106d..ee203ba47056e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h @@ -7,33 +7,35 @@ namespace Dml { - class 
DmlReservedResourceSubAllocator; + class DmlSubAllocator; class AllocationInfo : public Microsoft::WRL::RuntimeClass< Microsoft::WRL::RuntimeClassFlags, IUnknown> { public: AllocationInfo( - DmlReservedResourceSubAllocator* owner, + DmlSubAllocator* owner, size_t id, + uint64_t pooledResourceId, DmlResourceWrapper* resourceWrapper, size_t requestedSize) : m_owner(owner) , m_allocationId(id) + , m_pooledResourceId(pooledResourceId) , m_resourceWrapper(resourceWrapper) , m_requestedSize(requestedSize) {} ~AllocationInfo(); - DmlReservedResourceSubAllocator* GetOwner() const + DmlSubAllocator* GetOwner() const { return m_owner; } - ID3D12Resource* GetUavResource() const + ID3D12Resource* GetD3D12Resource() const { - return m_resourceWrapper->GetUavResource(); + return m_resourceWrapper->GetD3D12Resource(); } ComPtr DetachResourceWrapper() const @@ -51,9 +53,15 @@ namespace Dml return m_allocationId; } + uint64_t GetPooledResourceId() const + { + return m_pooledResourceId; + } + private: - DmlReservedResourceSubAllocator* m_owner; + DmlSubAllocator* m_owner; size_t m_allocationId; // For debugging purposes + uint64_t m_pooledResourceId; Microsoft::WRL::ComPtr m_resourceWrapper; // The size requested during Alloc(), which may be smaller than the physical resource size diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp index 464ce26c16f54..0b670a22f9cbd 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp @@ -4,7 +4,6 @@ #include "precomp.h" #include "DmlBuffer.h" #include "DmlGpuAllocator.h" -#include "DmlTaggedPointer.h" namespace Dml { @@ -15,7 +14,7 @@ namespace Dml m_opaqueData = allocator_->Alloc(size_in_bytes); ORT_THROW_HR_IF(E_OUTOFMEMORY, m_opaqueData == nullptr); - buffer_region_ = allocator_->CreateBufferRegion(TaggedPointer::Unpack(m_opaqueData), 
size_in_bytes); + buffer_region_ = allocator_->CreateBufferRegion(m_opaqueData, size_in_bytes); } DmlBuffer::~DmlBuffer() diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h index 4b9c167dfe671..73454d5d0dee0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h @@ -11,7 +11,7 @@ namespace Dml DmlCommittedResourceWrapper(ComPtr&& d3d12Resource) : m_d3d12Resource(std::move(d3d12Resource)) {} // Committed resources use the same resource for all states and use barriers to transition between states - ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); } + ID3D12Resource* GetD3D12Resource() const final { return m_d3d12Resource.Get(); } private: ComPtr m_d3d12Resource; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp index f2b62f2d41e64..e2606433ec5b2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp @@ -8,10 +8,15 @@ #include "core/framework/allocator.h" #include "DmlReservedResourceSubAllocator.h" #include "DmlTaggedPointer.h" +#include "DmlAllocationInfo.h" +#include "BucketizedBufferAllocator.h" namespace Dml { - DmlGpuAllocator::DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator) + DmlGpuAllocator::DmlGpuAllocator( + onnxruntime::IAllocator* bfcAllocator, + BucketizedBufferAllocator* bucketizedBufferAllocator, + std::shared_ptr bfcSubAllocator) : onnxruntime::IAllocator( OrtMemoryInfo( onnxruntime::DML, @@ -21,31 +26,73 @@ namespace Dml ) ), m_bfcAllocator(bfcAllocator), - m_subAllocator(subAllocator) {} + 
m_bucketizedBufferAllocator(bucketizedBufferAllocator), + m_bfcSubAllocator(bfcSubAllocator), + m_activeAllocator(ActiveAllocator::BfcAllocator) {} void* DmlGpuAllocator::Alloc(size_t size_in_bytes) { - return m_bfcAllocator->Alloc(size_in_bytes); + switch(m_activeAllocator) + { + case ActiveAllocator::BfcAllocator: + return m_bfcAllocator->Alloc(size_in_bytes); + case ActiveAllocator::BucketizedBufferAllocator: + return m_bucketizedBufferAllocator->Alloc(size_in_bytes); + default: + ORT_THROW_HR(E_UNEXPECTED); + } } void DmlGpuAllocator::Free(void* ptr) { - m_bfcAllocator->Free(ptr); + switch(m_activeAllocator) + { + case ActiveAllocator::BfcAllocator: + return m_bfcAllocator->Free(ptr); + case ActiveAllocator::BucketizedBufferAllocator: + return m_bucketizedBufferAllocator->Free(ptr); + default: + ORT_THROW_HR(E_UNEXPECTED); + } } - D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes) + D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) { - return m_subAllocator->CreateBufferRegion(taggedPointer, size_in_bytes); + switch(m_activeAllocator) + { + case ActiveAllocator::BfcAllocator: + return m_bfcSubAllocator->CreateBufferRegion(opaquePointer, size_in_bytes); + case ActiveAllocator::BucketizedBufferAllocator: + return m_bucketizedBufferAllocator->CreateBufferRegion(opaquePointer, size_in_bytes); + default: + ORT_THROW_HR(E_UNEXPECTED); + } } - AllocationInfo* DmlGpuAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) + AllocationInfo* DmlGpuAllocator::GetAllocationInfo(void* opaquePointer) { - return m_subAllocator->GetAllocationInfo(taggedPointer); + switch(m_activeAllocator) + { + case ActiveAllocator::BfcAllocator: + return m_bfcSubAllocator->GetAllocationInfo(opaquePointer); + case ActiveAllocator::BucketizedBufferAllocator: + return m_bucketizedBufferAllocator->GetAllocationInfo(opaquePointer); + default: + ORT_THROW_HR(E_UNEXPECTED); + } } 
void DmlGpuAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { - m_subAllocator->SetDefaultRoundingMode(roundingMode); + switch(m_activeAllocator) + { + case ActiveAllocator::BfcAllocator: + return m_bfcSubAllocator->SetDefaultRoundingMode(roundingMode); + case ActiveAllocator::BucketizedBufferAllocator: + return m_bucketizedBufferAllocator->SetDefaultRoundingMode(roundingMode); + default: + ORT_THROW_HR(E_UNEXPECTED); + } } DmlBuffer DmlGpuAllocator::AllocateDefaultBuffer(uint64_t num_bytes) @@ -53,4 +100,22 @@ namespace Dml return DmlBuffer(this, num_bytes); } + uint64_t DmlGpuAllocator::GetUniqueId(void* opaquePointer) + { + switch(m_activeAllocator) + { + case ActiveAllocator::BfcAllocator: + return m_bfcSubAllocator->GetUniqueId(opaquePointer); + case ActiveAllocator::BucketizedBufferAllocator: + return m_bucketizedBufferAllocator->GetUniqueId(opaquePointer); + default: + ORT_THROW_HR(E_UNEXPECTED); + } + } + + void DmlGpuAllocator::SetActiveAllocator(ActiveAllocator activeAllocator) + { + m_activeAllocator = activeAllocator; + } + } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h index 39311055503d2..955c9ca10c7d2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h @@ -10,28 +10,46 @@ namespace Dml { class DmlReservedResourceSubAllocator; + class BucketizedBufferAllocator; class AllocationInfo; struct TaggedPointer; + enum class ActiveAllocator + { + BfcAllocator, + BucketizedBufferAllocator, + }; + class DmlGpuAllocator : public onnxruntime::IAllocator { public: - DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr subAllocator); + DmlGpuAllocator( + onnxruntime::IAllocator* bfcAllocator, + BucketizedBufferAllocator* bucketizedBufferAllocator, + std::shared_ptr bfcSubAllocator); void* 
Alloc(size_t size_in_bytes) final; void Free(void* ptr) final; - D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes); - AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer); + D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes); + AllocationInfo* GetAllocationInfo(void* opaquePointer); void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode); DmlBuffer AllocateDefaultBuffer(uint64_t num_bytes); + void SetActiveAllocator(ActiveAllocator activeAllocator); + uint64_t GetUniqueId(void* opaquePointer); private: // This allocator is managed by ORT and should be used to allocate/free memory in order // to utilize the BFC acapabilities onnxruntime::IAllocator* m_bfcAllocator; + // This allocator is the old bucketized allocator that is kept for backward compatibility purposes + // and is only used when external custom ops are registered. + BucketizedBufferAllocator* m_bucketizedBufferAllocator; + // This allocator is specific to DML and is used to decode the opaque data returned by the BFC // allocator into objects that DML understands - std::shared_ptr m_subAllocator; + std::shared_ptr m_bfcSubAllocator; + + ActiveAllocator m_activeAllocator; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index dcf6b8607f319..d69bea864b518 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -103,12 +103,11 @@ namespace DmlGraphFusionHelper // The allocation is not pooled auto allocInfo = static_cast(opaqueData); *allocId = 0; - return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); + return D3D12BufferRegion(0, allocInfo->GetD3D12Resource()->GetDesc().Width, 
allocInfo->GetD3D12Resource()); } - auto taggedPointer = TaggedPointer::Unpack(opaqueData); - *allocId = winmlProvider->TryGetPooledAllocationId(taggedPointer, 0); - return winmlProvider->GetBufferRegion(taggedPointer, tensor->SizeInBytes()); + *allocId = winmlProvider->GetUniqueId(opaqueData); + return winmlProvider->GetBufferRegion(opaqueData, tensor->SizeInBytes()); } void ProcessInputData( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index bc9a0b15e86fe..e58303a0bfbfc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -47,7 +47,7 @@ namespace Dml void DmlReservedResourceSubAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { - m_defaultRoundingMode = roundingMode; + // Nothing to do here; kept for compatibility with the bucketized allocator } static bool GetTilingEnabled(ID3D12Device* device) @@ -248,6 +248,7 @@ namespace Dml ComPtr allocInfo = wil::MakeOrThrow( this, ++m_currentAllocationId, + 0, resourceWrapper.Get(), size_in_bytes ); @@ -285,7 +286,13 @@ namespace Dml allocations_by_id_.erase(it); } - void DmlReservedResourceSubAllocator::FreeResource(AllocationInfo* allocInfo) + uint64_t DmlReservedResourceSubAllocator::GetUniqueId(void* opaquePointer) + { + auto taggedPointer = TaggedPointer::Unpack(opaquePointer); + return taggedPointer.GetUniqueId(); + } + + void DmlReservedResourceSubAllocator::FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) { // Since this allocator is warapped by ORT's BFC allocator, it's possible that the context is already // close at this point if the application is winding down. 
@@ -338,9 +345,11 @@ namespace Dml } D3D12BufferRegion DmlReservedResourceSubAllocator::CreateBufferRegion( - const TaggedPointer& taggedPointer, + void* opaquePointer, uint64_t size_in_bytes) { + auto taggedPointer = TaggedPointer::Unpack(opaquePointer); + // We need to access (mutable) state after this point, so we need to lock std::unique_lock lock(mutex_); @@ -354,16 +363,18 @@ namespace Dml (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; // Make sure the region we're trying to create fits entirely in the resource - assert(it->second->GetUavResource()->GetDesc().Width >= taggedPointer.offset + size_in_bytes); + assert(it->second->GetD3D12Resource()->GetDesc().Width >= taggedPointer.offset + size_in_bytes); return D3D12BufferRegion( taggedPointer.offset, size_in_bytes, - it->second->GetUavResource()); + it->second->GetD3D12Resource()); } - AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer) + AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(void* opaquePointer) { + auto taggedPointer = TaggedPointer::Unpack(opaquePointer); + // We need to access (mutable) state after this point, so we need to lock std::unique_lock lock(mutex_); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index 8049848c8671e..3f2f1c9210c64 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -6,6 +6,7 @@ #include "ExecutionContext.h" #include "DmlAllocationInfo.h" #include "DmlBufferRegion.h" +#include "DmlSubAllocator.h" namespace Dml { @@ -36,7 +37,7 @@ namespace Dml // this case it is better make more but smaller allocations (resulting in // smaller heaps); this fallback path is only retained as a last resort for // 
older hardware. - class DmlReservedResourceSubAllocator + class DmlReservedResourceSubAllocator : public DmlSubAllocator { public: // Maximum size of a heap (in tiles) when allocations are tiled. Each tile @@ -60,13 +61,14 @@ namespace Dml // the ID3D12Resource is cached, so this call typically has a lower cost // than a call to ID3D12Device::CreatePlacedResource or // CreateReservedResource. - D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes); + D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes); - AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer); + AllocationInfo* GetAllocationInfo(void* opaquePointer); - void FreeResource(AllocationInfo* allocInfo); + void FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) final; uint64_t ComputeRequiredSize(size_t size); bool TilingEnabled() const { return tiling_enabled_; }; + uint64_t GetUniqueId(void* opaquePointer); ~DmlReservedResourceSubAllocator(); @@ -106,7 +108,6 @@ namespace Dml std::vector m_pool; size_t m_currentAllocationId = 0; uint64_t m_currentResourceId = 0; - AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled; std::unique_ptr m_subAllocator; #if _DEBUG diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h index 22f8cbbdc394b..de42157645bba 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h @@ -17,7 +17,7 @@ namespace Dml { } - ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); } + ID3D12Resource* GetD3D12Resource() const final { return m_allocation.resource_uav_state.Get(); } private: DmlHeapAllocation m_allocation; diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h index 2b1a8e5c726dc..6ad57b055023c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h @@ -12,7 +12,7 @@ namespace Dml { public: // TODO (pavignol): Rename to GetResource() - virtual ID3D12Resource* GetUavResource() const = 0; + virtual ID3D12Resource* GetD3D12Resource() const = 0; virtual ~DmlResourceWrapper(){} }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h new file mode 100644 index 0000000000000..580830ea1a90f --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +namespace Dml +{ + struct DmlResourceWrapper; + + class DmlSubAllocator + { + public: + virtual void FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) = 0; + }; +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index b06d23adf5886..cb04939683f8c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -9,6 +9,7 @@ #include "ReadbackHeap.h" #include "ExecutionContext.h" #include "DmlReservedResourceSubAllocator.h" +#include "BucketizedBufferAllocator.h" #include "DmlCpuAllocator.h" #include "MLOperatorAuthorImpl.h" #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h" @@ -207,8 +208,16 @@ namespace Dml m_bfcAllocator = onnxruntime::CreateAllocator(memoryInfo); + m_bucketizedAllocator = std::make_shared( + m_d3d12Device.Get(), + m_context, // TODO(leca): REVIEW: Will it cause memory issue when m_context is released in EP while alloc is released in sessionState? + CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), + D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS, + D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + // Wrap the BFC allocator into our own allocator - m_gpuAllocator = std::make_shared(m_bfcAllocator.get(), subAllocator); + m_gpuAllocator = std::make_shared(m_bfcAllocator.get(), m_bucketizedAllocator.get(), subAllocator); m_context->SetAllocator(m_gpuAllocator); // CPU Allocator used to create buffers for the MemcpyFromHost, Shape and Size operators. 
m_cpuInputAllocator = std::make_shared(OrtMemType::OrtMemTypeCPUInput); @@ -992,15 +1001,21 @@ namespace Dml m_context->QueueReference(object); } - D3D12BufferRegion ExecutionProviderImpl::GetBufferRegion(const TaggedPointer& taggedPointer, uint64_t size) const + D3D12BufferRegion ExecutionProviderImpl::GetBufferRegion(void* opaquePointer, uint64_t size) const { - return m_gpuAllocator->CreateBufferRegion(taggedPointer, size); + return m_gpuAllocator->CreateBufferRegion(opaquePointer, size); } - uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(const TaggedPointer& taggedPointer, bool isInternalOperator) + uint64_t ExecutionProviderImpl::GetUniqueId(void* opaquePointer) { - assert(!isInternalOperator); - return taggedPointer.GetUniqueId(); + return m_gpuAllocator->GetUniqueId(opaquePointer); + } + + void ExecutionProviderImpl::DisableBfcAllocator() + { + // TODO (pavignol): Remove + printf("*************Disabling BFC allocator!!!\n"); + m_gpuAllocator->SetActiveAllocator(ActiveAllocator::BucketizedBufferAllocator); } void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState( @@ -1117,10 +1132,10 @@ namespace Dml return std::make_unique(dmlDevice, commandQueue, enableMetacommands); } - D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer, uint64_t sizeInBytes) + D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t sizeInBytes) { - Dml::DmlGpuAllocator* pAllocationInfo = static_cast(allocator); - return pAllocationInfo->CreateBufferRegion(taggedPointer, sizeInBytes); + Dml::DmlGpuAllocator* gpuAllocator = static_cast(allocator); + return gpuAllocator->CreateBufferRegion(opaquePointer, sizeInBytes); } void FlushContext(onnxruntime::IExecutionProvider* provider) @@ -1141,6 +1156,12 @@ namespace Dml dmlexecutionprovider->ReleaseCompletedReferences(); } + void DisableBfcAllocator(onnxruntime::IExecutionProvider * 
provider) + { + ExecutionProvider* dmlexecutionprovider = static_cast(provider); + dmlexecutionprovider->DisableBfcAllocator(); + } + onnxruntime::common::Status CopyTensor( onnxruntime::IExecutionProvider* provider, const onnxruntime::Tensor& src, @@ -1156,7 +1177,7 @@ namespace Dml ComPtr resourceWrapper; wil::MakeOrThrow(pResource).As(&resourceWrapper); - ComPtr allocInfo = wil::MakeOrThrow(nullptr, 0, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width); + ComPtr allocInfo = wil::MakeOrThrow(nullptr, 0, 0, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width); return allocInfo.Detach(); } void FreeGPUAllocation(void* ptr) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index ad208ea830ae5..dafc2ab7147f7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -25,6 +25,7 @@ namespace Dml class ReadbackHeap; class ExecutionContext; class DmlReservedResourceSubAllocator; + class BucketizedBufferAllocator; class DmlCpuAllocator; class ExecutionProvider; class DmlGpuAllocator; @@ -42,6 +43,8 @@ namespace Dml bool enableMetacommands = true); void ReleaseCompletedReferences(); + void DisableBfcAllocator(); + uint64_t GetUniqueId(void* opaquePointer); public: // implements Dml::IExecutionProvider STDMETHOD(GetD3DDevice)(_COM_Outptr_ ID3D12Device** d3dDevice) const noexcept final; @@ -100,9 +103,7 @@ namespace Dml // IWinmlExecutionProvider methods void QueueReference(IUnknown* object) override; - D3D12BufferRegion GetBufferRegion(const TaggedPointer& taggedPointer, uint64_t size) const override; - - uint64_t TryGetPooledAllocationId(const TaggedPointer& taggedPointer, bool isInternalOperator) override; + D3D12BufferRegion GetBufferRegion(void* opaquePointer, uint64_t size) const override; void 
GetABIExecutionInterfaceAndInvalidateState( bool isInternalOperator, @@ -181,8 +182,8 @@ namespace Dml std::shared_ptr m_context; std::unique_ptr m_uploadHeap; std::unique_ptr m_readbackHeap; - std::shared_ptr m_subAllocator; std::shared_ptr m_bfcAllocator; + std::shared_ptr m_bucketizedAllocator; std::shared_ptr m_gpuAllocator; std::shared_ptr m_externalGpuAllocator; std::shared_ptr m_cpuInputAllocator; @@ -292,6 +293,11 @@ namespace Dml return m_impl->ReleaseCompletedReferences(); } + void DisableBfcAllocator() + { + return m_impl->DisableBfcAllocator(); + } + ExecutionProviderImpl* GetImpl() { return m_impl.Get(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index dde290f0bce0f..1547bd99b6e20 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1617,11 +1617,10 @@ namespace Windows::AI::MachineLearning::Adapter if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL) { auto allocInfo = static_cast(m_tensorData); - return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource()); + return Dml::D3D12BufferRegion(0, allocInfo->GetD3D12Resource()->GetDesc().Width, allocInfo->GetD3D12Resource()); } - auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData); - return m_winmlExecutionProvider->GetBufferRegion(taggedPointer, m_impl->SizeInBytes()); + return m_winmlExecutionProvider->GetBufferRegion(m_tensorData, m_impl->SizeInBytes()); } uint32_t STDMETHODCALLTYPE TensorWrapper::GetDimensionCount() const noexcept diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h index d1a705e151ddf..9909be1f8337f 100644 --- 
a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h @@ -177,6 +177,8 @@ IMLOperatorRegistryPrivate : public IUnknown _In_reads_(constantCpuInputCount) const uint32_t* constantCpuInputs = nullptr, uint32_t constantCpuInputCount = 0 ) const noexcept PURE; + + STDMETHOD_(bool, HasExternalOperators)() const noexcept PURE; }; //! \interface IMLOperatorTensorShapeDescription1 diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index 91279be185ba9..c226b83e3ad1b 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -225,7 +225,7 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "The resource has been allocated with "); } - *d3d_resource = static_cast(allocation)->GetUavResource(); + *d3d_resource = static_cast(allocation)->GetD3D12Resource(); (*d3d_resource)->AddRef(); #else @@ -250,11 +250,11 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceRegionFromAllocation, } if (wrapping_allocator->Info()->device.MemType() == OrtDevice::MemType::DML_EXTERNAL) { - *d3d_resource = static_cast(allocation)->GetUavResource(); + *d3d_resource = static_cast(allocation)->GetD3D12Resource(); *offset = 0; } else { ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT); - auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation), size_in_bytes); + auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), allocation, size_in_bytes); *offset = bufferRegion.Offset(); *d3d_resource = bufferRegion.ResourceInUavState(); } From 57723394fb1615075fb19113a18dd031c2df8339 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 13 Jul 2023 
01:08:08 -0700 Subject: [PATCH 49/76] Add BFC allocator API --- .../inc/DmlExecutionProvider.h | 4 +- .../src/DmlGpuAllocator.cpp | 10 ++--- .../src/DmlGpuAllocator.h | 4 +- .../src/ExecutionProvider.cpp | 42 ++++++++++--------- .../src/ExecutionProvider.h | 14 +++---- .../providers/dml/dml_provider_factory.cc | 14 ++++++- winml/adapter/winml_adapter_apis.h | 2 +- winml/adapter/winml_adapter_c_api.cpp | 2 +- winml/adapter/winml_adapter_c_api.h | 2 +- winml/adapter/winml_adapter_dml.cpp | 4 +- .../Api.Ort/OnnxruntimeDmlSessionBuilder.cpp | 12 ++++-- .../Api.Ort/OnnxruntimeDmlSessionBuilder.h | 10 ++++- .../lib/Api.Ort/OnnxruntimeEngineBuilder.cpp | 11 +++-- winml/lib/Api.Ort/OnnxruntimeEngineBuilder.h | 6 ++- winml/lib/Api/LearningModelSession.cpp | 11 +++-- winml/lib/Common/inc/iengine.h | 5 ++- winml/test/adapter/AdapterDmlEpTest.cpp | 4 +- winml/test/adapter/AdapterSessionTest.cpp | 6 +-- 18 files changed, 100 insertions(+), 63 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index ef012855770f3..755bf60195e2e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -29,13 +29,13 @@ namespace Dml std::unique_ptr CreateExecutionProvider( IDMLDevice* dmlDevice, ID3D12CommandQueue* commandQueue, - bool enableMetacommands = true); + bool enableMetacommands, + bool enableBfcAllocator); D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t size_in_bytes); void FlushContext(onnxruntime::IExecutionProvider* provider); void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode); void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider); - void 
DisableBfcAllocator(onnxruntime::IExecutionProvider* provider); onnxruntime::common::Status CopyTensor( onnxruntime::IExecutionProvider* provider, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp index e2606433ec5b2..5ac6485a041ec 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp @@ -16,7 +16,8 @@ namespace Dml DmlGpuAllocator::DmlGpuAllocator( onnxruntime::IAllocator* bfcAllocator, BucketizedBufferAllocator* bucketizedBufferAllocator, - std::shared_ptr bfcSubAllocator) + std::shared_ptr bfcSubAllocator, + ActiveAllocator activeAllocator) : onnxruntime::IAllocator( OrtMemoryInfo( onnxruntime::DML, @@ -28,7 +29,7 @@ namespace Dml m_bfcAllocator(bfcAllocator), m_bucketizedBufferAllocator(bucketizedBufferAllocator), m_bfcSubAllocator(bfcSubAllocator), - m_activeAllocator(ActiveAllocator::BfcAllocator) {} + m_activeAllocator(activeAllocator) {} void* DmlGpuAllocator::Alloc(size_t size_in_bytes) { @@ -113,9 +114,4 @@ namespace Dml } } - void DmlGpuAllocator::SetActiveAllocator(ActiveAllocator activeAllocator) - { - m_activeAllocator = activeAllocator; - } - } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h index 955c9ca10c7d2..e8b020a85767b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h @@ -26,7 +26,8 @@ namespace Dml DmlGpuAllocator( onnxruntime::IAllocator* bfcAllocator, BucketizedBufferAllocator* bucketizedBufferAllocator, - std::shared_ptr bfcSubAllocator); + std::shared_ptr bfcSubAllocator, + ActiveAllocator activeAllocator); void* Alloc(size_t size_in_bytes) final; void Free(void* 
ptr) final; @@ -34,7 +35,6 @@ namespace Dml AllocationInfo* GetAllocationInfo(void* opaquePointer); void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode); DmlBuffer AllocateDefaultBuffer(uint64_t num_bytes); - void SetActiveAllocator(ActiveAllocator activeAllocator); uint64_t GetUniqueId(void* opaquePointer); private: diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index cb04939683f8c..9ac474474f8b8 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -74,7 +74,8 @@ namespace Dml ExecutionProvider::ExecutionProvider( IDMLDevice* dmlDevice, ID3D12CommandQueue* commandQueue, - bool enableMetacommands) : + bool enableMetacommands, + bool enableBfcAllocator) : IExecutionProvider(onnxruntime::kDmlExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)) { D3D12_COMMAND_LIST_TYPE queueType = commandQueue->GetDesc().Type; @@ -87,7 +88,7 @@ namespace Dml ComPtr device; GRAPHICS_THROW_IF_FAILED(commandQueue->GetDevice(IID_GRAPHICS_PPV_ARGS(device.GetAddressOf()))); - m_impl = wil::MakeOrThrow(dmlDevice, device.Get(), commandQueue, enableMetacommands); + m_impl = wil::MakeOrThrow(dmlDevice, device.Get(), commandQueue, enableMetacommands, enableBfcAllocator); } std::vector> @@ -142,10 +143,16 @@ namespace Dml // Task 24384515: Update ORT AIInfra release agent pool to install 19H1 SDK on VM bootstrap #define D3D_FEATURE_LEVEL_1_0_CORE_PRIVATE ((D3D_FEATURE_LEVEL)0x1000) - ExecutionProviderImpl::ExecutionProviderImpl(IDMLDevice* dmlDevice, ID3D12Device* d3d12Device, ID3D12CommandQueue* queue, bool enableMetacommands) + ExecutionProviderImpl::ExecutionProviderImpl( + IDMLDevice* dmlDevice, + ID3D12Device* d3d12Device, + ID3D12CommandQueue* queue, + bool enableMetacommands, + bool enableBfcAllocator) : 
m_d3d12Device(d3d12Device), m_dmlDevice(dmlDevice), m_areMetacommandsEnabled(enableMetacommands), + m_bfcAllocatorEnabled(enableBfcAllocator), m_queue(queue) { @@ -216,8 +223,17 @@ namespace Dml D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + if (!m_bfcAllocatorEnabled) + { + printf("*************BFC ALLOCATOR DISABLED!\n"); + } + // Wrap the BFC allocator into our own allocator - m_gpuAllocator = std::make_shared(m_bfcAllocator.get(), m_bucketizedAllocator.get(), subAllocator); + m_gpuAllocator = std::make_shared( + m_bfcAllocator.get(), + m_bucketizedAllocator.get(), + subAllocator, + m_bfcAllocatorEnabled ? ActiveAllocator::BfcAllocator : ActiveAllocator::BucketizedBufferAllocator); m_context->SetAllocator(m_gpuAllocator); // CPU Allocator used to create buffers for the MemcpyFromHost, Shape and Size operators. m_cpuInputAllocator = std::make_shared(OrtMemType::OrtMemTypeCPUInput); @@ -1011,13 +1027,6 @@ namespace Dml return m_gpuAllocator->GetUniqueId(opaquePointer); } - void ExecutionProviderImpl::DisableBfcAllocator() - { - // TODO (pavignol): Remove - printf("*************Disabling BFC allocator!!!\n"); - m_gpuAllocator->SetActiveAllocator(ActiveAllocator::BucketizedBufferAllocator); - } - void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState( bool isInternalOperator, IUnknown** abiExecutionObject) const @@ -1127,9 +1136,10 @@ namespace Dml std::unique_ptr CreateExecutionProvider( IDMLDevice* dmlDevice, ID3D12CommandQueue* commandQueue, - bool enableMetacommands) + bool enableMetacommands, + bool enableBfcAllocator) { - return std::make_unique(dmlDevice, commandQueue, enableMetacommands); + return std::make_unique(dmlDevice, commandQueue, enableMetacommands, enableBfcAllocator); } D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t sizeInBytes) @@ -1156,12 +1166,6 @@ namespace Dml dmlexecutionprovider->ReleaseCompletedReferences(); } - 
void DisableBfcAllocator(onnxruntime::IExecutionProvider * provider) - { - ExecutionProvider* dmlexecutionprovider = static_cast(provider); - dmlexecutionprovider->DisableBfcAllocator(); - } - onnxruntime::common::Status CopyTensor( onnxruntime::IExecutionProvider* provider, const onnxruntime::Tensor& src, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index dafc2ab7147f7..74f56acb345ed 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -40,10 +40,10 @@ namespace Dml IDMLDevice* dmlDevice, ID3D12Device* d3d12Device, ID3D12CommandQueue* queue, - bool enableMetacommands = true); + bool enableMetacommands, + bool enableBfcAllocator); void ReleaseCompletedReferences(); - void DisableBfcAllocator(); uint64_t GetUniqueId(void* opaquePointer); public: // implements Dml::IExecutionProvider @@ -178,6 +178,7 @@ namespace Dml ComPtr m_dmlDevice; bool m_isMcdmDevice = false; bool m_areMetacommandsEnabled = true; + bool m_bfcAllocatorEnabled = true; bool m_native16BitShaderOpsSupported = false; std::shared_ptr m_context; std::unique_ptr m_uploadHeap; @@ -235,8 +236,8 @@ namespace Dml explicit ExecutionProvider( IDMLDevice* dmlDevice, ID3D12CommandQueue* commandQueue, - bool enableMetacommands = true - ); + bool enableMetacommands, + bool enableBfcAllocator); std::unique_ptr GetDataTransfer() const final override { @@ -293,11 +294,6 @@ namespace Dml return m_impl->ReleaseCompletedReferences(); } - void DisableBfcAllocator() - { - return m_impl->DisableBfcAllocator(); - } - ExecutionProviderImpl* GetImpl() { return m_impl.Get(); diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index c226b83e3ad1b..b2d02715bb91d 100644 --- 
a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -35,18 +35,19 @@ struct DMLProviderFactory : IExecutionProviderFactory { std::unique_ptr CreateProvider() override; void SetDefaultRoundingMode(AllocatorRoundingMode rounding_mode); - void SetMetacommandsEnabled(bool metacommands_enabled); + void SetBfcAllocatorEnabled(bool bfc_allocator_enabled); private: ComPtr dml_device_{}; ComPtr cmd_queue_{}; AllocatorRoundingMode rounding_mode_ = AllocatorRoundingMode::Enabled; bool metacommands_enabled_ = true; + bool bfc_allocator_enabled_ = true; }; std::unique_ptr DMLProviderFactory::CreateProvider() { - auto provider = Dml::CreateExecutionProvider(dml_device_.Get(), cmd_queue_.Get(), metacommands_enabled_); + auto provider = Dml::CreateExecutionProvider(dml_device_.Get(), cmd_queue_.Get(), metacommands_enabled_, bfc_allocator_enabled_); Dml::SetDefaultRoundingMode(provider.get(), rounding_mode_); return provider; } @@ -59,6 +60,10 @@ void DMLProviderFactory::SetMetacommandsEnabled(bool metacommands_enabled) { metacommands_enabled_ = metacommands_enabled; } +void DMLProviderFactory::SetBfcAllocatorEnabled(bool bfc_allocator_enabled) { + bfc_allocator_enabled_ = bfc_allocator_enabled; +} + std::shared_ptr CreateExecutionProviderFactory_DML(IDMLDevice* dml_device, ID3D12CommandQueue* cmd_queue) { #ifndef _GAMING_XBOX @@ -92,6 +97,11 @@ void DmlConfigureProviderFactoryMetacommandsEnabled(IExecutionProviderFactory* f dml_provider_factory->SetMetacommandsEnabled(metacommandsEnabled); } +void DmlConfigureProviderFactoryBfcAllocatorEnabled(IExecutionProviderFactory* factory, bool bfc_allocator_enabled) { + auto dml_provider_factory = static_cast(factory); + dml_provider_factory->SetBfcAllocatorEnabled(bfc_allocator_enabled); +} + bool IsSoftwareAdapter(IDXGIAdapter1* adapter) { DXGI_ADAPTER_DESC1 desc; diff --git a/winml/adapter/winml_adapter_apis.h b/winml/adapter/winml_adapter_apis.h index 
7d1f7941f9865..6a31676c048a5 100644 --- a/winml/adapter/winml_adapter_apis.h +++ b/winml/adapter/winml_adapter_apis.h @@ -42,7 +42,7 @@ ORT_API_STATUS(ModelGetMetadata, _In_ const OrtModel* model, _In_ size_t count, ORT_API_STATUS(ModelEnsureNoFloat16, _In_ const OrtModel* model); ORT_API_STATUS(SaveModel, _In_ const OrtModel* in, _In_ const wchar_t* const file_name, _In_ size_t len); -ORT_API_STATUS(OrtSessionOptionsAppendExecutionProviderEx_DML, _In_ OrtSessionOptions* options, _In_ ID3D12Device* d3d_device, _In_ ID3D12CommandQueue* cmd_queue, bool metacommands_enabled); +ORT_API_STATUS(OrtSessionOptionsAppendExecutionProviderEx_DML, _In_ OrtSessionOptions* options, _In_ ID3D12Device* d3d_device, _In_ ID3D12CommandQueue* cmd_queue, bool metacommands_enabled, bool bfc_allocator_enabled); // OrtSession methods ORT_API_STATUS(CreateSessionWithoutModel, _In_ OrtEnv* env, _In_ const OrtSessionOptions* options, _In_ OrtThreadPool* inter_op_thread_pool, _In_ OrtThreadPool* intra_op_thread_pool, _Outptr_ OrtSession** session); diff --git a/winml/adapter/winml_adapter_c_api.cpp b/winml/adapter/winml_adapter_c_api.cpp index a3e597fb88800..86d23a362c16d 100644 --- a/winml/adapter/winml_adapter_c_api.cpp +++ b/winml/adapter/winml_adapter_c_api.cpp @@ -105,4 +105,4 @@ const WinmlAdapterApi* ORT_API_CALL OrtGetWinMLAdapter(_In_ uint32_t ort_version } return nullptr; -} \ No newline at end of file +} diff --git a/winml/adapter/winml_adapter_c_api.h b/winml/adapter/winml_adapter_c_api.h index 2817467818404..1cbd969e559b4 100644 --- a/winml/adapter/winml_adapter_c_api.h +++ b/winml/adapter/winml_adapter_c_api.h @@ -255,7 +255,7 @@ struct WinmlAdapterApi { * OrtSessionOptionsAppendExecutionProvider_DML * This api is used to add the DML EP to OrtSessionOptions. 
*/ - OrtStatus*(ORT_API_CALL* OrtSessionOptionsAppendExecutionProvider_DML)(_In_ OrtSessionOptions* options, ID3D12Device* device, ID3D12CommandQueue* queue, bool metacommands_enabled)NO_EXCEPTION; + OrtStatus*(ORT_API_CALL* OrtSessionOptionsAppendExecutionProvider_DML)(_In_ OrtSessionOptions* options, ID3D12Device* device, ID3D12CommandQueue* queue, bool metacommands_enabled, bool bfc_allocator_enabled)NO_EXCEPTION; // OrtSession methods diff --git a/winml/adapter/winml_adapter_dml.cpp b/winml/adapter/winml_adapter_dml.cpp index f3ffda496530f..3dc1b1bdcd55d 100644 --- a/winml/adapter/winml_adapter_dml.cpp +++ b/winml/adapter/winml_adapter_dml.cpp @@ -70,12 +70,13 @@ Microsoft::WRL::ComPtr CreateDmlDevice(ID3D12Device* d3d12Device) { namespace onnxruntime { void DmlConfigureProviderFactoryDefaultRoundingMode(onnxruntime::IExecutionProviderFactory* factory, AllocatorRoundingMode rounding_mode); void DmlConfigureProviderFactoryMetacommandsEnabled(IExecutionProviderFactory* factory, bool metacommandsEnabled); +void DmlConfigureProviderFactoryBfcAllocatorEnabled(IExecutionProviderFactory* factory, bool bfc_allocator_enabled); } #endif // USE_DML ORT_API_STATUS_IMPL(winmla::OrtSessionOptionsAppendExecutionProviderEx_DML, _In_ OrtSessionOptions* options, - _In_ ID3D12Device* d3d_device, _In_ ID3D12CommandQueue* queue, bool metacommands_enabled) { + _In_ ID3D12Device* d3d_device, _In_ ID3D12CommandQueue* queue, bool metacommands_enabled, bool bfc_allocator_enabled) { API_IMPL_BEGIN #ifdef USE_DML auto dml_device = CreateDmlDevice(d3d_device); @@ -91,6 +92,7 @@ ORT_API_STATUS_IMPL(winmla::OrtSessionOptionsAppendExecutionProviderEx_DML, _In_ onnxruntime::DmlConfigureProviderFactoryDefaultRoundingMode(factory, AllocatorRoundingMode::Disabled); onnxruntime::DmlConfigureProviderFactoryMetacommandsEnabled(factory, metacommands_enabled); + onnxruntime::DmlConfigureProviderFactoryBfcAllocatorEnabled(factory, bfc_allocator_enabled); #endif // USE_DML return nullptr; API_IMPL_END 
diff --git a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp index a02c6a0431ba6..fea3b4ebbea91 100644 --- a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp @@ -12,11 +12,17 @@ using namespace _winml; -HRESULT OnnxruntimeDmlSessionBuilder::RuntimeClassInitialize(OnnxruntimeEngineFactory* engine_factory, ID3D12Device* device, ID3D12CommandQueue* queue, bool metacommands_enabled) { +HRESULT OnnxruntimeDmlSessionBuilder::RuntimeClassInitialize( + OnnxruntimeEngineFactory* engine_factory, + ID3D12Device* device, + ID3D12CommandQueue* queue, + bool metacommands_enabled, + bool bfc_allocator_enabled) { engine_factory_ = engine_factory; device_.copy_from(device); queue_.copy_from(queue); metacommands_enabled_ = metacommands_enabled; + bfc_allocator_enabled_ = bfc_allocator_enabled; return S_OK; } @@ -43,7 +49,7 @@ OnnxruntimeDmlSessionBuilder::CreateSessionOptions( ort_api); // Request the dml ep - RETURN_HR_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device_.get(), queue_.get(), metacommands_enabled_), + RETURN_HR_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device_.get(), queue_.get(), metacommands_enabled_, bfc_allocator_enabled_), ort_api); #ifndef _WIN64 @@ -105,4 +111,4 @@ HRESULT OnnxruntimeDmlSessionBuilder::Initialize( return S_OK; } -#endif USE_DML \ No newline at end of file +#endif USE_DML diff --git a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h index 8ea4399ebfb35..261beb4191abe 100644 --- a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h +++ b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h @@ -13,7 +13,12 @@ class OnnxruntimeDmlSessionBuilder : public Microsoft::WRL::RuntimeClass< Microsoft::WRL::RuntimeClassFlags, IOrtSessionBuilder> { public: - HRESULT 
RuntimeClassInitialize(OnnxruntimeEngineFactory* engine_factory, ID3D12Device* device, ID3D12CommandQueue* queue, bool metacommands_enabled_); + HRESULT RuntimeClassInitialize( + OnnxruntimeEngineFactory* engine_factory, + ID3D12Device* device, + ID3D12CommandQueue* queue, + bool metacommands_enabled, + bool bfc_allocator_enabled); HRESULT STDMETHODCALLTYPE CreateSessionOptions( OrtSessionOptions** options) override; @@ -32,6 +37,7 @@ class OnnxruntimeDmlSessionBuilder : public Microsoft::WRL::RuntimeClass< winrt::com_ptr device_; winrt::com_ptr queue_; bool metacommands_enabled_ = true; + bool bfc_allocator_enabled_ = true; }; -} // namespace _winml \ No newline at end of file +} // namespace _winml diff --git a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp index f98a98476d0e6..b7ee7de25ea1e 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp @@ -28,7 +28,7 @@ STDMETHODIMP OnnxruntimeEngineBuilder::CreateEngine(_Outptr_ _winml::IEngine** o RETURN_IF_FAILED(Microsoft::WRL::MakeAndInitialize(&onnxruntime_session_builder, engine_factory_.Get())); } else { #ifdef USE_DML - RETURN_IF_FAILED(Microsoft::WRL::MakeAndInitialize(&onnxruntime_session_builder, engine_factory_.Get(), device_.Get(), queue_.Get(), metacommands_enabled_)); + RETURN_IF_FAILED(Microsoft::WRL::MakeAndInitialize(&onnxruntime_session_builder, engine_factory_.Get(), device_.Get(), queue_.Get(), metacommands_enabled_, bfc_allocator_enabled_)); #endif } @@ -86,6 +86,11 @@ STDMETHODIMP OnnxruntimeEngineBuilder::SetMetacommandsEnabled(int enabled) { return S_OK; } +STDMETHODIMP OnnxruntimeEngineBuilder::SetBfcAllocatorEnabled(int enabled) { + bfc_allocator_enabled_ = static_cast(enabled); + return S_OK; +} + STDMETHODIMP OnnxruntimeEngineBuilder::GetID3D12CommandQueue(_Outptr_ ID3D12CommandQueue** queue) { *queue = queue_.Get(); return S_OK; @@ -100,7 +105,7 @@ STDMETHODIMP 
OnnxruntimeEngineBuilder::SetNamedDimensionOverrides(wfc::IMapView< named_dimension_overrides_ = std::move(named_dimension_overrides); return S_OK; } - + STDMETHODIMP OnnxruntimeEngineBuilder::SetIntraOpNumThreadsOverride(uint32_t intra_op_num_threads) { intra_op_num_threads_override_ = intra_op_num_threads; return S_OK; @@ -114,4 +119,4 @@ STDMETHODIMP OnnxruntimeEngineBuilder::SetIntraOpThreadSpinning(bool allow_spinn STDMETHODIMP OnnxruntimeEngineBuilder::SetThreadPool(IThreading* thread_pool) { thread_pool_ = thread_pool; return S_OK; -} \ No newline at end of file +} diff --git a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.h b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.h index 42cb57190e93f..4bed120df8809 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.h +++ b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.h @@ -17,6 +17,9 @@ class OnnxruntimeEngineBuilder : public Microsoft::WRL::RuntimeClass< STDMETHOD(SetMetacommandsEnabled) (int enabled); + STDMETHOD(SetBfcAllocatorEnabled) + (int enabled); + STDMETHOD(GetD3D12Device) (_Outptr_ ID3D12Device** device); @@ -47,10 +50,11 @@ class OnnxruntimeEngineBuilder : public Microsoft::WRL::RuntimeClass< Microsoft::WRL::ComPtr queue_ = nullptr; Microsoft::WRL::ComPtr thread_pool_ = nullptr; bool metacommands_enabled_ = true; + bool bfc_allocator_enabled_ = true; std::optional batch_size_override_; wfc::IMapView named_dimension_overrides_; std::optional intra_op_num_threads_override_; bool allow_thread_spinning_ = true; }; -} // namespace _winml \ No newline at end of file +} // namespace _winml diff --git a/winml/lib/Api/LearningModelSession.cpp b/winml/lib/Api/LearningModelSession.cpp index f086f756ed990..24b3aff92824e 100644 --- a/winml/lib/Api/LearningModelSession.cpp +++ b/winml/lib/Api/LearningModelSession.cpp @@ -13,6 +13,7 @@ #include "LearningModelSessionOptions.h" #include "TensorFeatureDescriptor.h" #include "TelemetryEvent.h" +#include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h" 
#include "D3DDeviceCache.h" @@ -31,7 +32,7 @@ LearningModelSession::LearningModelSession(_winml::IEngine* engine) : operator_r model_(nullptr), device_(LearningModelDeviceKind::Cpu), session_options_(nullptr) -{ +{ engine_.copy_from(engine); } @@ -117,6 +118,10 @@ void LearningModelSession::Initialize() { if (device_impl->IsCpuDevice() == false) { WINML_THROW_IF_FAILED(engine_builder->SetD3D12Resources(device_impl->GetD3DDevice(), device_impl->GetDeviceQueue())); WINML_THROW_IF_FAILED(engine_builder->SetMetacommandsEnabled(device_impl->MetacommandsEnabled())); + + winrt::com_ptr registryPrivate; + WINML_THROW_IF_FAILED(model_impl->GetOperatorRegistry()->QueryInterface(IID_PPV_ARGS(registryPrivate.put()))); + WINML_THROW_IF_FAILED(engine_builder->SetBfcAllocatorEnabled(!registryPrivate->HasExternalOperators())); } auto num_intra_op_threads = device_impl->NumberOfIntraOpThreads(); @@ -137,7 +142,7 @@ void LearningModelSession::Initialize() { allow_spinning = session_options_impl->GetIntraOpThreadSpinning(); num_intra_op_threads = session_options_impl->GetIntraOpNumThreads(); } - + bool create_local_thread_pool = allow_spinning != device_impl->AllowSpinning() || num_intra_op_threads != device_impl->NumberOfIntraOpThreads(); if (create_local_thread_pool) { @@ -464,4 +469,4 @@ winml::LearningModelSession LearningModelSession::CreateInertSession(_winml::IEn return winrt::make(engine); } -} // namespace WINMLP \ No newline at end of file +} // namespace WINMLP diff --git a/winml/lib/Common/inc/iengine.h b/winml/lib/Common/inc/iengine.h index a686b585841c3..66fffcd1dd043 100644 --- a/winml/lib/Common/inc/iengine.h +++ b/winml/lib/Common/inc/iengine.h @@ -209,7 +209,7 @@ IThreading : IUnknown { }; -MIDL_INTERFACE("8ac0b6b9-4561-492b-b63d-a07bdd8292c6") +MIDL_INTERFACE("edf7b6d1-f788-4057-9f99-28f9b05360e8") IEngineBuilder : IUnknown { STDMETHOD(SetD3D12Resources) (ID3D12Device * device, ID3D12CommandQueue * queue) PURE; @@ -217,6 +217,9 @@ IEngineBuilder : IUnknown { 
STDMETHOD(SetMetacommandsEnabled) (int enabled) PURE; + STDMETHOD(SetBfcAllocatorEnabled) + (int enabled) PURE; + STDMETHOD(GetD3D12Device) (_Outptr_ ID3D12Device * *device) PURE; diff --git a/winml/test/adapter/AdapterDmlEpTest.cpp b/winml/test/adapter/AdapterDmlEpTest.cpp index 3069c618b1738..d57418b6e686f 100644 --- a/winml/test/adapter/AdapterDmlEpTest.cpp +++ b/winml/test/adapter/AdapterDmlEpTest.cpp @@ -71,7 +71,7 @@ UniqueOrtSession CreateDmlSession() { command_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; WINML_EXPECT_HRESULT_SUCCEEDED(device->CreateCommandQueue(&command_queue_desc, IID_PPV_ARGS(queue.put()))); - THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), false), ort_api); + THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), false, true), ort_api); return CreateUniqueOrtSession(FileHelpers::GetModulePath() + L"fns-candy.onnx", session_options); } @@ -218,7 +218,7 @@ void DmlCopyTensor() { command_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; WINML_EXPECT_HRESULT_SUCCEEDED(device->CreateCommandQueue(&command_queue_desc, IID_PPV_ARGS(queue.put()))); - THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), false), ort_api); + THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), false, true), ort_api); auto session = CreateUniqueOrtSession(FileHelpers::GetModulePath() + L"fns-candy.onnx", session_options); OrtExecutionProvider* dml_provider; diff --git a/winml/test/adapter/AdapterSessionTest.cpp b/winml/test/adapter/AdapterSessionTest.cpp index 941157f4f9ecb..bb90ca3656192 100644 --- a/winml/test/adapter/AdapterSessionTest.cpp +++ b/winml/test/adapter/AdapterSessionTest.cpp @@ -92,7 +92,7 @@ void AppendExecutionProvider_DML() 
{ const auto device = CreateD3DDevice(); const auto queue = CreateD3DQueue(device.get()); - THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true), ort_api); + THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true, true), ort_api); } void CreateWithoutModel() { @@ -114,7 +114,7 @@ void GetExecutionProvider_DML() { THROW_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api); const auto device = CreateD3DDevice(); const auto queue = CreateD3DQueue(device.get()); - THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true), ort_api); + THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true, true), ort_api); const auto model_path = FileHelpers::GetModulePath() + L"fns-candy.onnx"; auto session = CreateUniqueOrtSession(model_path, session_options); @@ -254,7 +254,7 @@ void CopyInputAcrossDevices_DML() { THROW_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api); const auto device = CreateD3DDevice(); const auto queue = CreateD3DQueue(device.get()); - THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true), ort_api); + THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true, true), ort_api); auto session = CreateUniqueOrtSession(session_options); LoadAndPurloinModel(session, "fns-candy.onnx"); From b06678a9cb986e0e9e102d5f15951d5ede3b6cf9 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 13 Jul 2023 08:08:04 -0700 Subject: [PATCH 50/76] Fix crash --- winml/lib/Api/LearningModelSession.cpp | 9 ++++++--- 1 file changed, 6 
insertions(+), 3 deletions(-) diff --git a/winml/lib/Api/LearningModelSession.cpp b/winml/lib/Api/LearningModelSession.cpp index 24b3aff92824e..8dfb0769d3adf 100644 --- a/winml/lib/Api/LearningModelSession.cpp +++ b/winml/lib/Api/LearningModelSession.cpp @@ -119,9 +119,12 @@ void LearningModelSession::Initialize() { WINML_THROW_IF_FAILED(engine_builder->SetD3D12Resources(device_impl->GetD3DDevice(), device_impl->GetDeviceQueue())); WINML_THROW_IF_FAILED(engine_builder->SetMetacommandsEnabled(device_impl->MetacommandsEnabled())); - winrt::com_ptr registryPrivate; - WINML_THROW_IF_FAILED(model_impl->GetOperatorRegistry()->QueryInterface(IID_PPV_ARGS(registryPrivate.put()))); - WINML_THROW_IF_FAILED(engine_builder->SetBfcAllocatorEnabled(!registryPrivate->HasExternalOperators())); + if (model_impl->GetOperatorRegistry()) + { + winrt::com_ptr registryPrivate; + WINML_THROW_IF_FAILED(model_impl->GetOperatorRegistry()->QueryInterface(IID_PPV_ARGS(registryPrivate.put()))); + WINML_THROW_IF_FAILED(engine_builder->SetBfcAllocatorEnabled(!registryPrivate->HasExternalOperators())); + } } auto num_intra_op_threads = device_impl->NumberOfIntraOpThreads(); From bf177f656590d8f011995136649d44b61ec8490e Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 13 Jul 2023 10:33:20 -0700 Subject: [PATCH 51/76] Fix prefast error --- winml/lib/Api.Image/VideoFrameToTensorConverter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp index 5536f7df203b7..4767228579b0b 100644 --- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp +++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp @@ -498,8 +498,8 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( assert(convertedSoftwareBitmap != nullptr); - uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 
4 : 2; - uint32_t bufferSize = static_cast(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize); + uint64_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2; + uint64_t bufferSize = tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize; // TODO: Make an allocator for upload heaps if (!upload_heap_ || upload_heap_->GetDesc().Width < bufferSize) { From cb2e420ce9de26a7af38f26f6490144070499f64 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 13 Jul 2023 11:42:57 -0700 Subject: [PATCH 52/76] Fix Bucketized allocator crash --- .../DmlExecutionProvider/src/BucketizedBufferAllocator.cpp | 5 +++++ .../src/DmlReservedResourceSubAllocator.cpp | 3 +-- .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index c071e4bf0b8d3..675f17e4c28af 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -96,6 +96,11 @@ namespace Dml D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) const { auto allocationInfo = static_cast(opaquePointer); + + // Make sure that we are aligned to 4 bytes to satisfy DML's requirements + constexpr uint64_t DML_ALIGNMENT = 4; + size_in_bytes = (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; + return D3D12BufferRegion(0, size_in_bytes, allocationInfo->GetD3D12Resource()); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index e58303a0bfbfc..0a63146286336 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -359,8 +359,7 @@ namespace Dml // Make sure that we are aligned to 4 bytes to satisfy DML's requirements constexpr uint64_t DML_ALIGNMENT = 4; - size_in_bytes = - (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; + size_in_bytes = (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; // Make sure the region we're trying to create fits entirely in the resource assert(it->second->GetD3D12Resource()->GetDesc().Width >= taggedPointer.offset + size_in_bytes); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 9ac474474f8b8..888f672d34d0c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -152,7 +152,7 @@ namespace Dml : m_d3d12Device(d3d12Device), m_dmlDevice(dmlDevice), m_areMetacommandsEnabled(enableMetacommands), - m_bfcAllocatorEnabled(enableBfcAllocator), + m_bfcAllocatorEnabled(false), // TODO (pavignol): Revert m_queue(queue) { @@ -223,6 +223,7 @@ namespace Dml D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + // TODO (pavignol): Remove if (!m_bfcAllocatorEnabled) { printf("*************BFC ALLOCATOR DISABLED!\n"); From 9c79b1b30d76beefd3e3119b425588578dabb553 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 13 Jul 2023 13:32:38 -0700 Subject: [PATCH 53/76] Address prefast errors --- .../src/BucketizedBufferAllocator.h | 5 ++-- .../src/DmlAllocationInfo.h | 2 +- .../DmlExecutionProvider/src/DmlBuffer.cpp | 8 +++---- .../dml/DmlExecutionProvider/src/DmlBuffer.h | 7 +++--- .../src/DmlBufferRegion.cpp | 6 ++--- .../src/DmlBufferRegion.h | 6 ++--- 
.../src/DmlGpuAllocator.cpp | 2 ++ .../src/DmlReservedResourceSubAllocator.cpp | 6 ++--- .../src/DmlReservedResourceSubAllocator.h | 5 ++-- .../src/DmlResourceWrapper.h | 1 - .../src/ExecutionContext.cpp | 2 -- .../src/ExecutionProvider.cpp | 24 +++++++------------ .../src/FusedGraphKernel.cpp | 4 ++-- .../src/MLOperatorAuthorImpl.cpp | 10 ++++---- .../src/Operators/DmlDFT.h | 22 ++++++++--------- .../src/Operators/DmlGridSample.h | 16 ++++++------- .../src/Operators/DmlSTFT.h | 6 ++--- .../providers/dml/dml_provider_factory.cc | 2 +- .../Api.Image/TensorToVideoFrameConverter.cpp | 6 ++--- .../Api.Image/VideoFrameToTensorConverter.cpp | 2 +- 20 files changed, 68 insertions(+), 74 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index f0fc570d4e1c4..17524c83c6094 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -19,8 +19,6 @@ namespace Dml class BucketizedBufferAllocator : public onnxruntime::IAllocator, public DmlSubAllocator { public: - ~BucketizedBufferAllocator(); - // Constructs a BucketizedBufferAllocator which allocates D3D12 committed resources with the specified heap properties, // resource flags, and initial resource state. 
BucketizedBufferAllocator( @@ -46,6 +44,9 @@ namespace Dml void* Alloc(size_t size) final; void Free(void* p) final; + protected: + ~BucketizedBufferAllocator(); + private: static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h index ee203ba47056e..f61e59edd5159 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h @@ -38,7 +38,7 @@ namespace Dml return m_resourceWrapper->GetD3D12Resource(); } - ComPtr DetachResourceWrapper() const + ComPtr DetachResourceWrapper() { return std::move(m_resourceWrapper); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp index 0b670a22f9cbd..21b5da96ce236 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp @@ -25,7 +25,7 @@ DmlBuffer::~DmlBuffer() } } -DmlBuffer::DmlBuffer(DmlBuffer&& other) +DmlBuffer::DmlBuffer(DmlBuffer&& other) noexcept { m_opaqueData = other.m_opaqueData; allocator_ = other.allocator_; @@ -33,7 +33,7 @@ DmlBuffer::DmlBuffer(DmlBuffer&& other) other.m_opaqueData = nullptr; } -DmlBuffer& DmlBuffer::operator=(DmlBuffer&& other) +DmlBuffer& DmlBuffer::operator=(DmlBuffer&& other) noexcept { m_opaqueData = other.m_opaqueData; allocator_ = other.allocator_; @@ -42,9 +42,9 @@ DmlBuffer& DmlBuffer::operator=(DmlBuffer&& other) return *this; } -ID3D12Resource* DmlBuffer::ResourceInUavState() const +ID3D12Resource* DmlBuffer::GetD3D12Resource() const { - return buffer_region_.ResourceInUavState(); + return buffer_region_.GetD3D12Resource(); } uint64_t DmlBuffer::Offset() const diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h index 4b0dd58ce4467..019d186441da5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h @@ -24,11 +24,10 @@ class DmlBuffer // Move-only DmlBuffer(const DmlBuffer&) = delete; DmlBuffer& operator=(const DmlBuffer&) = delete; - DmlBuffer(DmlBuffer&&); - DmlBuffer& operator=(DmlBuffer&&); + DmlBuffer(DmlBuffer&&) noexcept; + DmlBuffer& operator=(DmlBuffer&&) noexcept; - // TODO (pavignol): Rename to Resource() - ID3D12Resource* ResourceInUavState() const; + ID3D12Resource* GetD3D12Resource() const; uint64_t Offset() const; uint64_t SizeInBytes() const; const D3D12BufferRegion& Region() const { return buffer_region_; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp index c33cc5491c7f0..57c4d5b342bb8 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp @@ -27,14 +27,14 @@ namespace Dml assert(m_resource->GetDesc().Width == buffer_size); } - D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) + D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) noexcept { std::swap(this->m_resource, that.m_resource); std::swap(this->offset_, that.offset_); std::swap(this->size_in_bytes_, that.size_in_bytes_); } - D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) + D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) noexcept { std::swap(this->m_resource, that.m_resource); std::swap(this->offset_, that.offset_); @@ -42,7 +42,7 @@ namespace Dml return *this; } - ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const + ID3D12Resource* 
D3D12BufferRegion::GetD3D12Resource() const { return m_resource; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h index 6c5cb37297caa..40c41f980b011 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h @@ -24,9 +24,9 @@ namespace Dml // Move-only D3D12BufferRegion(const D3D12BufferRegion&) = default; D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default; - D3D12BufferRegion(D3D12BufferRegion&&); - D3D12BufferRegion& operator=(D3D12BufferRegion&&); - ID3D12Resource* ResourceInUavState() const; + D3D12BufferRegion(D3D12BufferRegion&&) noexcept; + D3D12BufferRegion& operator=(D3D12BufferRegion&&) noexcept; + ID3D12Resource* GetD3D12Resource() const; uint64_t Offset() const; uint64_t SizeInBytes() const; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp index 5ac6485a041ec..881478c3e874f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp @@ -89,8 +89,10 @@ namespace Dml { case ActiveAllocator::BfcAllocator: m_bfcSubAllocator->SetDefaultRoundingMode(roundingMode); + break; case ActiveAllocator::BucketizedBufferAllocator: m_bucketizedBufferAllocator->SetDefaultRoundingMode(roundingMode); + break; default: ORT_THROW_HR(E_UNEXPECTED); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index 0a63146286336..4fdd6411d555a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -148,9 +148,9 @@ namespace Dml // Target range in the current heap to map. const D3D12_TILE_RANGE_FLAGS tile_range_flags = D3D12_TILE_RANGE_FLAG_NONE; - const uint32_t heap_range_start_offset = 0; const uint32_t heap_range_tile_count = static_cast(heap_size_in_tiles); + constexpr uint32_t heap_range_start_offset = 0; constexpr uint32_t numResourceRegions = 1; constexpr uint32_t numHeapRanges = 1; @@ -262,8 +262,8 @@ namespace Dml #endif // DML only has a single device in ORT at the moment - const uint64_t device_id = 0; - const uint64_t offset = 0; + constexpr uint64_t device_id = 0; + constexpr uint64_t offset = 0; return TaggedPointer::Pack(device_id, *allocationId, offset); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index 3f2f1c9210c64..249de73de6487 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -70,8 +70,6 @@ namespace Dml bool TilingEnabled() const { return tiling_enabled_; }; uint64_t GetUniqueId(void* opaquePointer); - ~DmlReservedResourceSubAllocator(); - // Constructs a DmlReservedResourceSubAllocator which allocates D3D12 committed resources with the specified heap properties, // resource flags, and initial resource state. 
DmlReservedResourceSubAllocator( @@ -83,6 +81,9 @@ namespace Dml void* Alloc(size_t size); void Free(void* p); + protected: + ~DmlReservedResourceSubAllocator(); + private: static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h index 6ad57b055023c..876487242aa37 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h @@ -11,7 +11,6 @@ namespace Dml DmlResourceWrapper : public IUnknown { public: - // TODO (pavignol): Rename to GetResource() virtual ID3D12Resource* GetD3D12Resource() const = 0; virtual ~DmlResourceWrapper(){} }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index 4ff464e0eef42..86f964651b638 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -43,8 +43,6 @@ namespace Dml // cannot be both in a source and destination state at the same time (without aliasing), we copy // the source resource to an intermediate one, and then copy the intermediate resource to the // destination resource. 
- // TODO (pavignol): Only do the intermediate copy when both resources at the same - D3D12_HEAP_PROPERTIES heapProperties = { D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 888f672d34d0c..e8b198d369c01 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -129,7 +129,7 @@ namespace Dml { ORT_TRY { - return GetBufferForTensor(tensor).ResourceInUavState(); + return GetBufferForTensor(tensor).GetD3D12Resource(); } ORT_CATCH_GENERIC { @@ -152,7 +152,7 @@ namespace Dml : m_d3d12Device(d3d12Device), m_dmlDevice(dmlDevice), m_areMetacommandsEnabled(enableMetacommands), - m_bfcAllocatorEnabled(false), // TODO (pavignol): Revert + m_bfcAllocatorEnabled(enableBfcAllocator), m_queue(queue) { @@ -223,12 +223,6 @@ namespace Dml D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - // TODO (pavignol): Remove - if (!m_bfcAllocatorEnabled) - { - printf("*************BFC ALLOCATOR DISABLED!\n"); - } - // Wrap the BFC allocator into our own allocator m_gpuAllocator = std::make_shared( m_bfcAllocator.get(), @@ -469,7 +463,7 @@ namespace Dml // CPU -> GPU copy (upload) // auto dstBufferRegion = GetBufferForTensor(dst); - ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); + ID3D12Resource* dstData = dstBufferRegion.GetD3D12Resource(); const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t dstOffset = dstBufferRegion.Offset(); m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes)); @@ -481,7 +475,7 @@ namespace Dml // GPU -> CPU copy (readback) // auto srcBufferRegion = GetBufferForTensor(src); - ID3D12Resource* srcData = 
srcBufferRegion.ResourceInUavState(); + ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource(); const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t srcOffset = srcBufferRegion.Offset(); m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState); @@ -492,12 +486,12 @@ namespace Dml // GPU -> GPU copy // auto srcBufferRegion = GetBufferForTensor(src); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource(); const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t srcOffset = srcBufferRegion.Offset(); auto dstBufferRegion = GetBufferForTensor(dst); - ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState(); + ID3D12Resource* dstData = dstBufferRegion.GetD3D12Resource(); const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; const uint64_t dstOffset = dstBufferRegion.Offset(); @@ -554,7 +548,7 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(src[i]); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource(); const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; srcDatas.push_back(srcData); @@ -581,7 +575,7 @@ namespace Dml if (mlTensor != nullptr) { auto dstBufferRegion = GetBufferForTensor(dst); - m_context->FillBufferWithPattern(dstBufferRegion.ResourceInUavState(), dstBufferRegion.Offset(), rawValue); + m_context->FillBufferWithPattern(dstBufferRegion.GetD3D12Resource(), dstBufferRegion.Offset(), rawValue); } return S_OK; @@ -982,7 +976,7 @@ namespace Dml auto srcBufferRegion = GetBufferForTensor(&srcWrapper); - ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState(); + ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource(); const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; srcDatas.push_back(srcData); diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index b00b8f8e19f52..e60845f8bb146 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -65,7 +65,7 @@ namespace Dml if (persistentResourceSize > 0) { auto buffer = m_provider->AllocatePooledResource(persistentResourceSize); - m_persistentResource = buffer.ResourceInUavState(); + m_persistentResource = buffer.GetD3D12Resource(); m_persistentResourceBinding = buffer.GetBufferBinding(); m_managedPersistentBuffer = wil::MakeOrThrow(std::move(buffer)); m_winmlProvider->QueueReference(m_managedPersistentBuffer.Get()); @@ -187,7 +187,7 @@ namespace Dml { bufferBindings.push_back(bufferRegion.GetBufferBinding()); - if (bufferRegion.ResourceInUavState() != nullptr) + if (bufferRegion.GetD3D12Resource() != nullptr) { bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() }); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 1547bd99b6e20..86069139d7f69 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1710,8 +1710,8 @@ namespace Windows::AI::MachineLearning::Adapter else { auto bufferRegion = GetBufferRegion(); - bufferRegion.ResourceInUavState()->AddRef(); - *dataInterface = bufferRegion.ResourceInUavState(); + bufferRegion.GetD3D12Resource()->AddRef(); + *dataInterface = bufferRegion.GetD3D12Resource(); } } @@ -1768,7 +1768,7 @@ namespace Windows::AI::MachineLearning::Adapter for (auto& tempBuffer : m_temporaryBuffers) { - resourcesToTransition.push_back(tempBuffer.ResourceInUavState()); + 
resourcesToTransition.push_back(tempBuffer.GetD3D12Resource()); } m_winmlProvider->TransitionResourcesForOperator( @@ -2146,8 +2146,8 @@ namespace Windows::AI::MachineLearning::Adapter auto dml_gpu_allocator = static_cast(alloc.get()); auto buffer = dml_gpu_allocator->AllocateDefaultBuffer(size); - buffer.ResourceInUavState()->AddRef(); - *abiAllocation = buffer.ResourceInUavState(); + buffer.GetD3D12Resource()->AddRef(); + *abiAllocation = buffer.GetD3D12Resource(); // Ensure the allocation is freed and transitioned when the context destructs m_temporaryBuffers.push_back(std::move(buffer)); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h index 1e3035648adcb..9c8d8b4d539c9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h @@ -736,13 +736,13 @@ class GpuDFTOperator : public WRL::Base D3D12_RESOURCE_BARRIER barriers[2]; barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), + inputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ); barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), + outputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ); @@ -786,13 +786,13 @@ class GpuDFTOperator : public WRL::Base // Transition resources to common state barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), + inputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COMMON ); barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), + outputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COMMON ); @@ -821,13 
+821,13 @@ class GpuDFTOperator : public WRL::Base D3D12_RESOURCE_BARRIER barriers[2]; barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), + inputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ); barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), + outputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ); @@ -870,7 +870,7 @@ class GpuDFTOperator : public WRL::Base constants.ElementCount = totalElementCount / constants.OutputSizes[3]; constants.DFTIteration = index + 1; constants.ChirpLength = isLastPass ? chirpLength : 0; - constants.HasWindow = isFirstPass && windowBufferRegion.ResourceInUavState() != nullptr; + constants.HasWindow = isFirstPass && windowBufferRegion.GetD3D12Resource() != nullptr; auto window = constants.HasWindow ? windowBufferRegion : out; std::array uav_resources = { in, out, window }; Dispatch(uav_resources, constants, commandList); @@ -878,13 +878,13 @@ class GpuDFTOperator : public WRL::Base // Transition resources to common state barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), + inputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COMMON ); barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), + outputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COMMON ); @@ -911,7 +911,7 @@ class GpuDFTOperator : public WRL::Base std::transform( bufferRegions.begin(), bufferRegions.end(), uav_barriers, - [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); + [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.GetD3D12Resource()); } ); commandList->ResourceBarrier(TSize, uav_barriers); for (uint32_t i = 0; i < TSize; i++) 
@@ -920,7 +920,7 @@ class GpuDFTOperator : public WRL::Base if (bufferRegions[i]) { commandList->SetComputeRootUnorderedAccessView( i, // root parameter index - bufferRegions[i].ResourceInUavState()->GetGPUVirtualAddress() + bufferRegions[i].Offset() + bufferRegions[i].GetD3D12Resource()->GetGPUVirtualAddress() + bufferRegions[i].Offset() ); } else diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h index 0611c4b7bf7f7..29cf439284479 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h @@ -687,19 +687,19 @@ class DmlGridSampleOperator : public WRL::Base D3D12_RESOURCE_BARRIER barriers[3]; barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), + inputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ); barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - gridBufferRegion.ResourceInUavState(), + gridBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ); barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), + outputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ); @@ -729,19 +729,19 @@ class DmlGridSampleOperator : public WRL::Base // Transition resources to common state barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition( - inputBufferRegion.ResourceInUavState(), + inputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COMMON ); barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition( - gridBufferRegion.ResourceInUavState(), + gridBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COMMON ); barriers[2] = 
CD3DX12_RESOURCE_BARRIER::Transition( - outputBufferRegion.ResourceInUavState(), + outputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COMMON ); @@ -768,7 +768,7 @@ class DmlGridSampleOperator : public WRL::Base std::transform( bufferRegions.begin(), bufferRegions.end(), uav_barriers, - [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } ); + [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.GetD3D12Resource()); } ); commandList->ResourceBarrier(TSize, uav_barriers); for (uint32_t i = 0; i < TSize; i++) @@ -777,7 +777,7 @@ class DmlGridSampleOperator : public WRL::Base if (bufferRegions[i]) { commandList->SetComputeRootUnorderedAccessView( i, // root parameter index - bufferRegions[i].ResourceInUavState()->GetGPUVirtualAddress() + bufferRegions[i].Offset() + bufferRegions[i].GetD3D12Resource()->GetGPUVirtualAddress() + bufferRegions[i].Offset() ); } else diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h index 945b58965cf2f..780a6fe0f5223 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h @@ -413,7 +413,7 @@ class DmlSTFTOperator : public WRL::Base Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal); inputBuffers[0] = signalBufferRegion.GetBufferBinding(); inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] }; - barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, 
D3D12_RESOURCE_STATE_UNORDERED_ACCESS); Dml::D3D12BufferRegion windowBufferRegion; if (m_framingOperator.hasWindowTensor) @@ -421,7 +421,7 @@ class DmlSTFTOperator : public WRL::Base windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window); inputBuffers[1] = windowBufferRegion.GetBufferBinding(); inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] }; - barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); inputBindingsCount++; } @@ -429,7 +429,7 @@ class DmlSTFTOperator : public WRL::Base DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding(); DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer }; - barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); m_framingOperator.bindingTable->BindOutputs(1, &outputBinding); diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index b2d02715bb91d..6d723a6a6b948 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -266,7 +266,7 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceRegionFromAllocation, ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT); auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), allocation, 
size_in_bytes); *offset = bufferRegion.Offset(); - *d3d_resource = bufferRegion.ResourceInUavState(); + *d3d_resource = bufferRegion.GetD3D12Resource(); } (*d3d_resource)->AddRef(); diff --git a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp index a2d7ff6947f9a..5b0cbd414dd7a 100644 --- a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp +++ b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp @@ -564,8 +564,8 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap( telemetryLogger.emplace(tensorDesc); } - uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2; - uint32_t singleVideoFramebufferSize = static_cast(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize); + uint64_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2; + uint64_t singleVideoFramebufferSize = tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize; // TODO: Make an allocator for readback heaps if (!readback_heap_ || readback_heap_->GetDesc().Width < singleVideoFramebufferSize) { @@ -582,7 +582,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap( auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pInputTensor, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE); command_list_->ResourceBarrier(1, &barrier); - command_list_->CopyBufferRegion(readback_heap_.Get(), 0, pInputTensor, inputTensorOffset + singleVideoFramebufferSize * batchIdx, singleVideoFramebufferSize); + command_list_->CopyBufferRegion(readback_heap_.Get(), 0, pInputTensor, inputTensorOffset + singleVideoFramebufferSize * static_cast(batchIdx), singleVideoFramebufferSize); WINML_THROW_IF_FAILED(command_list_->Close()); ID3D12CommandList* ppCommandLists[] = {command_list_.Get()}; diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp 
index 4767228579b0b..becb47ed6d56c 100644 --- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp +++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp @@ -527,7 +527,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST); command_list_->ResourceBarrier(1, &barrier); - command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx + outputResourceOffset, upload_heap_.Get(), 0, bufferSize); + command_list_->CopyBufferRegion(pOutputResource, bufferSize * static_cast(batchIdx) + outputResourceOffset, upload_heap_.Get(), 0, bufferSize); WINML_THROW_IF_FAILED(command_list_->Close()); ID3D12CommandList* ppCommandLists[] = {command_list_.Get()}; From 8f37e382260c91019aa8a53f7b8a24834b973d24 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 13 Jul 2023 13:41:49 -0700 Subject: [PATCH 54/76] Fix destructors --- .../dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h | 5 ++--- .../src/DmlReservedResourceSubAllocator.h | 5 ++--- .../providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 17524c83c6094..f0fc570d4e1c4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -19,6 +19,8 @@ namespace Dml class BucketizedBufferAllocator : public onnxruntime::IAllocator, public DmlSubAllocator { public: + ~BucketizedBufferAllocator(); + // Constructs a BucketizedBufferAllocator which allocates D3D12 committed resources with the specified heap properties, // resource flags, and initial resource state. 
BucketizedBufferAllocator( @@ -44,9 +46,6 @@ namespace Dml void* Alloc(size_t size) final; void Free(void* p) final; - protected: - ~BucketizedBufferAllocator(); - private: static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index 249de73de6487..3f2f1c9210c64 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -70,6 +70,8 @@ namespace Dml bool TilingEnabled() const { return tiling_enabled_; }; uint64_t GetUniqueId(void* opaquePointer); + ~DmlReservedResourceSubAllocator(); + // Constructs a DmlReservedResourceSubAllocator which allocates D3D12 committed resources with the specified heap properties, // resource flags, and initial resource state. 
DmlReservedResourceSubAllocator( @@ -81,9 +83,6 @@ namespace Dml void* Alloc(size_t size); void Free(void* p); - protected: - ~DmlReservedResourceSubAllocator(); - private: static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h index 580830ea1a90f..d6aa49d51c3f8 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h @@ -11,5 +11,6 @@ namespace Dml { public: virtual void FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) = 0; + virtual ~DmlSubAllocator() = default; }; } From a67641c259aa94fab8417ce9d6b167cae96363ff Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 13 Jul 2023 17:11:25 -0700 Subject: [PATCH 55/76] Fix typo --- .../providers/dml/DmlExecutionProvider/src/CommandQueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp index e5084772d4063..95190e9dca2a2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp @@ -51,7 +51,7 @@ namespace Dml // If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK // to queue additional references at this time, since those references would be leaked. This // affects any objects in m_queuedReferences whose destructors indirectly call QueueReference; - // for example, an allocation from DmlReservedResourceSubAllocator attempts to queue a reference + // for example, an allocation from BucketizedBufferAllocator attempts to queue a reference // to its underlying D3D resource when freed. 
Furthermore, these references are unnecessary // since Close() already blocks for scheduled GPU work before clearing m_queuedReferences. if (!m_closing) From e4e34e0cb513dfdf4bf0e665d1032bbf1e0d6acc Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Sat, 29 Jul 2023 14:08:07 -0700 Subject: [PATCH 56/76] Fix build break --- winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp | 14 +++----------- winml/test/common/SqueezeNetValidator.cpp | 7 ------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp index 5c96cb750f2b9..a985aa002c915 100644 --- a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp @@ -39,29 +39,21 @@ OnnxruntimeDmlSessionBuilder::CreateSessionOptions(OrtSessionOptions** options) auto session_options = UniqueOrtSessionOptions(ort_options, ort_api->ReleaseSessionOptions); - // set the graph optimization level to all (used to be called level 3) + // set the graph optimization level to all (used to be called level 3) RETURN_HR_IF_NOT_OK_MSG( ort_api->SetSessionGraphOptimizationLevel(session_options.get(), GraphOptimizationLevel::ORT_ENABLE_ALL), ort_api ); - // Disable the mem pattern session option for DML. It will cause problems with how memory is allocated. + // Disable the mem pattern session option for DML. It will cause problems with how memory is allocated. 
RETURN_HR_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api); - // Request the dml ep -<<<<<<< HEAD + // Request the dml ep RETURN_HR_IF_NOT_OK_MSG( winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML( session_options.get(), device_.get(), queue_.get(), metacommands_enabled_, bfc_allocator_enabled_ ), ort_api ); - == == == = RETURN_HR_IF_NOT_OK_MSG( - winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML( - session_options.get(), device_.get(), queue_.get(), metacommands_enabled_ - ), - ort_api - ); ->>>>>>> 92b6e10d37c50638d59620c5a315c6e75b47131c #ifndef _WIN64 auto use_arena = false; diff --git a/winml/test/common/SqueezeNetValidator.cpp b/winml/test/common/SqueezeNetValidator.cpp index 5c50651d92e6b..d0c43c1c9775e 100644 --- a/winml/test/common/SqueezeNetValidator.cpp +++ b/winml/test/common/SqueezeNetValidator.cpp @@ -190,13 +190,6 @@ void ModelValidator::SqueezeNet( LearningModelSession modelSession = nullptr; modelSession = LearningModelSession(model, LearningModelDevice(deviceKind)); - // WinML model creation - LearningModel model = nullptr; - model = LearningModel::LoadFromFilePath(fullModelPath); - - LearningModelSession modelSession = nullptr; - modelSession = LearningModelSession(model, LearningModelDevice(deviceKind)); - LearningModelBinding modelBinding(modelSession); if (bindAsImage) { From 0b4cee09972706f17f73a86fd550fa7a09ebe96a Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Sat, 29 Jul 2023 20:55:27 -0700 Subject: [PATCH 57/76] Fix lint errors --- winml/adapter/winml_adapter_c_api.h | 6 +++++- winml/adapter/winml_adapter_dml.cpp | 2 +- winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h | 4 +--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/winml/adapter/winml_adapter_c_api.h b/winml/adapter/winml_adapter_c_api.h index d8fd6276a75f7..fac53da21b668 100644 --- a/winml/adapter/winml_adapter_c_api.h +++ b/winml/adapter/winml_adapter_c_api.h @@ -294,7 +294,11 @@ struct WinmlAdapterApi 
{ * This api is used to add the DML EP to OrtSessionOptions. */ OrtStatus*(ORT_API_CALL* OrtSessionOptionsAppendExecutionProvider_DML)( - _In_ OrtSessionOptions* options, ID3D12Device* device, ID3D12CommandQueue* queue, bool metacommands_enabled, bool bfc_allocator_enabled + _In_ OrtSessionOptions* options, + ID3D12Device* device, + ID3D12CommandQueue* queue, + bool metacommands_enabled, + bool bfc_allocator_enabled )NO_EXCEPTION; // OrtSession methods diff --git a/winml/adapter/winml_adapter_dml.cpp b/winml/adapter/winml_adapter_dml.cpp index 25bbe95a66a2c..f32507a308e3c 100644 --- a/winml/adapter/winml_adapter_dml.cpp +++ b/winml/adapter/winml_adapter_dml.cpp @@ -73,7 +73,7 @@ void DmlConfigureProviderFactoryDefaultRoundingMode( ); void DmlConfigureProviderFactoryMetacommandsEnabled(IExecutionProviderFactory* factory, bool metacommandsEnabled); void DmlConfigureProviderFactoryBfcAllocatorEnabled(IExecutionProviderFactory* factory, bool bfc_allocator_enabled); -} +} // namespace onnxruntime #endif // USE_DML diff --git a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h index 167aeffb483ec..4433dfaab299d 100644 --- a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h +++ b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h @@ -86,9 +86,7 @@ class VideoFrameToTensorConverter : public ImageConverter { ); static D3D12_UNORDERED_ACCESS_VIEW_DESC CreateUAVDescription( - uint64_t offset, - const UINT32 batch_index, - const ImageTensorDescription& description + uint64_t offset, const UINT32 batch_index, const ImageTensorDescription& description ); static void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor( From e14797c5e84ab4cd536036353e84ac286820a1fd Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Sat, 29 Jul 2023 22:15:32 -0700 Subject: [PATCH 58/76] Fix iobinding crash --- .../src/DmlExternalGpuAllocator.cpp | 43 ++++++++++++++++--- .../src/DmlExternalGpuAllocator.h | 5 ++- 
.../src/ExecutionProvider.cpp | 2 +- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp index 0ebe2c3d00e5e..c30b4d19d2f73 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp @@ -5,29 +5,58 @@ #include "precomp.h" #include "DmlExternalGpuAllocator.h" +#include "DmlResourceWrapper.h" +#include "DmlCommittedResourceWrapper.h" +#include "DmlAllocationInfo.h" namespace Dml { - DmlExternalGpuAllocator::DmlExternalGpuAllocator() + DmlExternalGpuAllocator::DmlExternalGpuAllocator(ID3D12Device* device) : onnxruntime::IAllocator( OrtMemoryInfo( onnxruntime::DML, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, 0), -1 - ) - ) {} + )), + m_device(device) + { + } void* DmlExternalGpuAllocator::Alloc(size_t size_in_bytes) { - // This allocator should never be used to allocate memory; it should only be use to decode the opaque data pointer - THROW_HR(E_INVALIDARG); + Microsoft::WRL::ComPtr resource; + auto buffer = CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + auto props = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT); + ORT_THROW_IF_FAILED(m_device->CreateCommittedResource( + &props, + D3D12_HEAP_FLAG_NONE, + &buffer, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + nullptr, + IID_GRAPHICS_PPV_ARGS(resource.GetAddressOf()) + )); + + const uint64_t resourceWidth = resource->GetDesc().Width; + constexpr uint64_t pooledResourceId = 0; // Not a pooled resource + + Microsoft::WRL::ComPtr resourceWrapper; + wil::MakeOrThrow(std::move(resource)).As(&resourceWrapper); + + Microsoft::WRL::ComPtr allocInfo = wil::MakeOrThrow( + nullptr, + 0, + pooledResourceId, + resourceWrapper.Get(), 
+ static_cast(resourceWidth)); + + return allocInfo.Detach(); } void DmlExternalGpuAllocator::Free(void* ptr) { - // This allocator should never be used to free memory; it should only be use to decode the opaque data pointer - THROW_HR(E_INVALIDARG); + Microsoft::WRL::ComPtr resource; + resource.Attach(static_cast(ptr)); } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h index 1c4d4b36628eb..9dbb87ef04aa2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h @@ -14,9 +14,12 @@ namespace Dml class DmlExternalGpuAllocator : public onnxruntime::IAllocator { public: - DmlExternalGpuAllocator(); + DmlExternalGpuAllocator(ID3D12Device* device); void* Alloc(size_t size_in_bytes) final; void Free(void* ptr) final; + + private: + Microsoft::WRL::ComPtr m_device; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index e8b198d369c01..d824aa8185705 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -232,7 +232,7 @@ namespace Dml m_context->SetAllocator(m_gpuAllocator); // CPU Allocator used to create buffers for the MemcpyFromHost, Shape and Size operators. 
m_cpuInputAllocator = std::make_shared(OrtMemType::OrtMemTypeCPUInput); - m_externalGpuAllocator = std::make_shared(); + m_externalGpuAllocator = std::make_shared(m_d3d12Device.Get()); } return std::vector{m_gpuAllocator, m_externalGpuAllocator, m_cpuInputAllocator}; From 16c9524ba865f2d16192963a227c8e1db0af01c7 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 31 Jul 2023 18:05:17 -0700 Subject: [PATCH 59/76] Add aliasing support to DmlOperatorCopy --- .../src/MLOperatorAuthorImpl.cpp | 18 ++++++- .../src/MLOperatorAuthorImpl.h | 12 +++-- .../src/Operators/DmlOperatorCopy.cpp | 49 ++++++++++++++++--- .../MLOperatorAuthorPrivate.h | 12 +++++ 4 files changed, 80 insertions(+), 11 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 86069139d7f69..320f527f7ea46 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -557,7 +557,7 @@ namespace Windows::AI::MachineLearning::Adapter const AttributeMap* defaultAttributes, gsl::span requiredConstantCpuInputs, MLOperatorTensorGetter& constantInputGetter, - const onnxruntime::OpKernelContext* kernelContext + onnxruntime::OpKernelContext* kernelContext ) : OpNodeInfoWrapper(kerneInfo, inputShapeOverrides, defaultAttributes, requiredConstantCpuInputs, constantInputGetter, kernelContext), m_inferredOutputShapes(inferredOutputShapes), @@ -1335,6 +1335,22 @@ namespace Windows::AI::MachineLearning::Adapter return m_allowOutputShapeQuery; } + HRESULT STDMETHODCALLTYPE OpKernelInfoWrapper::InputAliasesOutput( + uint32_t inputIndex, + uint32_t outputIndex, + const onnxruntime::TensorShape& outputShape, + bool* aliasing) noexcept + { + ORT_TRY + { + auto inputData = m_kernelContext->Input(inputIndex)->DataRaw(); + auto outputData = 
m_kernelContext->Output(outputIndex, outputShape)->DataRaw(); + *aliasing = inputData == outputData; + return S_OK; + } + ORT_CATCH_RETURN + } + DmlGraphOpKernelInfoWrapper::DmlGraphOpKernelInfoWrapper( const onnxruntime::OpNodeProtoHelper* protoHelper, const void* executionHandle, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index 85b6b197fe511..c378db886f690 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -176,7 +176,7 @@ class OpNodeInfoWrapper : public Base1_t, public Base2_t, public Closable const AttributeMap* defaultAttributes, gsl::span requiredConstantCpuInputs, MLOperatorTensorGetter& constantInputGetter, - const onnxruntime::OpKernelContext* kernelContext = nullptr + onnxruntime::OpKernelContext* kernelContext = nullptr ) : m_impl(impl), m_kernelContext(kernelContext), @@ -245,7 +245,7 @@ class OpNodeInfoWrapper : public Base1_t, public Base2_t, public Closable protected: // Lifetime is managed by the caller and guaranteed to outlive this class const onnxruntime::OpNodeProtoHelper* m_impl = nullptr; - const onnxruntime::OpKernelContext* m_kernelContext = nullptr; + mutable onnxruntime::OpKernelContext* m_kernelContext = nullptr; private: template @@ -362,7 +362,7 @@ class OpKernelInfoWrapper : public OpNodeInfoWrapper< const AttributeMap* defaultAttributes, gsl::span requiredConstantCpuInputs, MLOperatorTensorGetter& constantInputGetter, - const onnxruntime::OpKernelContext* kernelContext = nullptr + onnxruntime::OpKernelContext* kernelContext = nullptr ); // HasTensorShapeDescription returns false if and only if the kernel is registered using @@ -405,6 +405,12 @@ class OpKernelInfoWrapper : public OpNodeInfoWrapper< return m_winmlProvider.CopyTo(executionProvider); } + HRESULT STDMETHODCALLTYPE 
InputAliasesOutput( + uint32_t inputIndex, + uint32_t outputIndex, + const onnxruntime::TensorShape& outputShape, + bool* aliasing) noexcept override; + private: // For shape info, in addition to the info const EdgeShapes* m_inferredOutputShapes = nullptr; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp index f8ef496b74d9b..7ac944616d26b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp @@ -16,7 +16,7 @@ class DmlOperatorCopy : public DmlOperator ML_CHECK_VALID_ARGUMENT(kernelInfo.GetInputCount() >= 1); ML_CHECK_VALID_ARGUMENT(kernelInfo.GetOutputCount() == 1); - std::vector> kernelInputOutputIndices = {0}; + std::vector> kernelInputOutputIndices = {0}; Initialize(kernelInfo, kernelInputOutputIndices); @@ -29,14 +29,49 @@ class DmlOperatorCopy : public DmlOperator ComPtr contextPrivate; ORT_THROW_IF_FAILED(kernelInfo.GetInterface()->QueryInterface(contextPrivate.GetAddressOf())); - std::vector inputDescs = GetDmlInputDescs(); - std::vector outputDescs = GetDmlOutputDescs(); + // We don't need to compile any operator if the input aliases the output as it is essentially a no-op + // (e.g. squeeze/unsqueeze/reshape). An exception to this rule is when the operator is part of the graph, + // in which case we always need to compile and execute the operator (although this is something that we + // could optimize in the future). 
- DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {}; - opDesc.InputTensor = inputDescs.data(); - opDesc.OutputTensor = outputDescs.data(); + bool aliasing = false; - SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo); + if (!contextPrivate->IsDmlGraphNode()) + { + std::vector outputSizes = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); + std::vector outputSizesInt64(outputSizes.begin(), outputSizes.end()); + onnxruntime::TensorShape outputShape(outputSizesInt64); + ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputAliasesOutput(0, 0, outputShape, &aliasing)); + } + + if (!aliasing) + { + std::vector inputDescs = GetDmlInputDescs(); + std::vector outputDescs = GetDmlOutputDescs(); + + DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {}; + opDesc.InputTensor = inputDescs.data(); + opDesc.OutputTensor = outputDescs.data(); + + SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo); + } + } + + void Compute(const MLOperatorKernelContext& kernelContext) + { + MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0); + + // Reshape the output tensor. + MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0); + + // Avoid self copying. + if (inputTensor.GetByteData() != outputTensor.GetByteData()) + { + // Copy elements from input tensor to output tensor. 
+ ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor( + outputTensor.GetInterface().Get(), + inputTensor.GetInterface().Get())); + } } }; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h index 9909be1f8337f..d86fdff8ac7e1 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h @@ -10,6 +10,11 @@ struct DML_INPUT_GRAPH_EDGE_DESC; struct DML_OUTPUT_GRAPH_EDGE_DESC; struct DML_INTERMEDIATE_GRAPH_EDGE_DESC; +namespace onnxruntime +{ + class TensorShape; +} + // Either nodesAsOpDesc or nodesAsIDMLOperator is present. // 1) Operator kernels which implement operators using only a single DML operator will pass a DML_OPERATOR_DESC. // These kernels pass DML_OPERATOR_DESC, because while building Dml graph (inside FusedGraphKernel.cpp) we can change the @@ -106,6 +111,13 @@ IMLOperatorKernelCreationContextNodeWrapperPrivate : public IMLOperatorKernelCre STDMETHOD(GetExecutionProvider)( _Outptr_result_maybenull_ IUnknown** executionProvider ) const noexcept PURE; + + STDMETHOD(InputAliasesOutput)( + _In_ uint32_t inputIndex, + _In_ uint32_t outputIndex, + _In_ const onnxruntime::TensorShape& outputShape, + _Out_ bool* aliasing + ) noexcept PURE; }; //! 
\interface IMLOperatorAttributes1 From a95505f024b18d3cfe68e41e3751951bfb66aca2 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 31 Jul 2023 19:57:56 -0700 Subject: [PATCH 60/76] Use identity instead of 2 copies --- .../src/Operators/DmlOperatorCopy.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp index 7ac944616d26b..bb45cdfef25c1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp @@ -57,20 +57,15 @@ class DmlOperatorCopy : public DmlOperator } } - void Compute(const MLOperatorKernelContext& kernelContext) + void Compute(const MLOperatorKernelContext& kernelContext) final { MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0); - - // Reshape the output tensor. MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0); // Avoid self copying. if (inputTensor.GetByteData() != outputTensor.GetByteData()) { - // Copy elements from input tensor to output tensor. 
- ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor( - outputTensor.GetInterface().Get(), - inputTensor.GetInterface().Get())); + DmlOperator::Compute(kernelContext); } } }; From 759442b0f7a32a0a8ed5c81ee98193015c20428e Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 2 Aug 2023 15:17:17 -0700 Subject: [PATCH 61/76] Enable copy-less I/O binding --- onnxruntime/core/framework/utils.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index d762211f7816b..3d67e49e155d9 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -161,6 +161,14 @@ static Status BatchOrCopyMLValue(const SessionState& session_state, return Status::OK(); } +#ifdef USE_DML + // The DML EP supports binding external allocations directly, even if the memory types don't match, as long as they are on the same D3D12 device + if (copy_info.source_device.Type() == OrtDevice::GPU && copy_info.target_device.Type() == OrtDevice::GPU && copy_info.source_device.Id() == copy_info.target_device.Id()) { + target_mlvalue = source_mlvalue; + return Status::OK(); + } +#endif + auto allocator = session_state.GetAllocator(copy_info.target_device); if (!target_mlvalue.IsAllocated()) { ORT_ENFORCE(allocator != nullptr, "Failed to find allocator for device ", copy_info.target_device.ToString()); From 9f3e430aa220d8d69e25ed50333d027d728012f9 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 2 Aug 2023 15:17:55 -0700 Subject: [PATCH 62/76] Fix nonzero coordinates operator --- .../src/Operators/DmlOperatorNonZero.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp index 61623dfe2b4dd..c9d215d097b4e 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp @@ -122,11 +122,8 @@ class DmlOperatorNonZero: public DmlOperator if (!m_emptyInput && nonzeroElementCount > 0) { - std::vector outputCoordinatesStrides = {nonzeroElementCount * 2, 2}; - TensorDesc stridedOutputTensorDesc(DML_TENSOR_DATA_TYPE_UINT32, outputSizes, outputCoordinatesStrides); - // TODO: Remove this hack when DML supports native int64 for NonZero - m_zeroOperator = InitializeZeroInt64Tensor(stridedOutputTensorDesc.GetBufferSizeInBytes()); + m_zeroOperator = InitializeZeroInt64Tensor(m_rank * nonzeroElementCount * sizeof(int64_t)); ExecuteZeroInt64Tensor(m_zeroOperator.Get(), outputTensor.GetInterface().Get()); ComPtr sliceOperator = InitializeSlice(m_intermediateTensorDescs[1], nonzeroElementCount); From 2da8999b4d57d92ec949eaf6a5f10dab1e59e5af Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 2 Aug 2023 23:11:10 -0700 Subject: [PATCH 63/76] Fix If test crash --- .../src/MLOperatorAuthorImpl.cpp | 45 +++++++++++++++++++ .../src/MLOperatorAuthorImpl.h | 6 +++ .../src/Operators/DmlOperatorCopy.cpp | 35 +++++++++++---- .../MLOperatorAuthorPrivate.h | 7 +++ 4 files changed, 84 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 320f527f7ea46..0f29409df1e2b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1343,6 +1343,12 @@ namespace Windows::AI::MachineLearning::Adapter { ORT_TRY { + if (!m_kernelContext) + { + *aliasing = false; + return S_OK; + } + auto inputData = m_kernelContext->Input(inputIndex)->DataRaw(); auto outputData = m_kernelContext->Output(outputIndex, 
outputShape)->DataRaw(); *aliasing = inputData == outputData; @@ -1351,6 +1357,45 @@ namespace Windows::AI::MachineLearning::Adapter ORT_CATCH_RETURN } + HRESULT STDMETHODCALLTYPE OpKernelInfoWrapper::InputSharesOutputBuffer( + uint32_t inputIndex, + uint32_t outputIndex, + const onnxruntime::TensorShape& outputShape, + bool* sharesOutputBuffer) noexcept + { + ORT_TRY + { + if (!m_kernelContext) + { + *sharesOutputBuffer = false; + return S_OK; + } + + auto inputTensor = const_cast(m_kernelContext->Input(inputIndex)); + auto outputTensor = m_kernelContext->Output(outputIndex, outputShape); + + // Null input or output data means that the tensors are empty (i.e. one of the dimensions is 0) + if (inputTensor->DataRaw() == nullptr || outputTensor->DataRaw() == nullptr) + { + *sharesOutputBuffer = false; + return S_OK; + } + + auto inputWrapper = wil::MakeOrThrow(inputTensor, true, m_winmlProvider.Get(), true); + auto outputWrapper = wil::MakeOrThrow(outputTensor, true, m_winmlProvider.Get(), true); + + ComPtr inputResource; + inputWrapper->GetDataInterface(inputResource.GetAddressOf()); + + ComPtr outputResource; + outputWrapper->GetDataInterface(outputResource.GetAddressOf()); + + *sharesOutputBuffer = inputResource.Get() == outputResource.Get(); + return S_OK; + } + ORT_CATCH_RETURN + } + DmlGraphOpKernelInfoWrapper::DmlGraphOpKernelInfoWrapper( const onnxruntime::OpNodeProtoHelper* protoHelper, const void* executionHandle, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index f9c0afd20f7e4..e12a4435bd747 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -411,6 +411,12 @@ class OpKernelInfoWrapper : public OpNodeInfoWrapper< const onnxruntime::TensorShape& outputShape, bool* aliasing) noexcept override; + HRESULT 
STDMETHODCALLTYPE InputSharesOutputBuffer( + uint32_t inputIndex, + uint32_t outputIndex, + const onnxruntime::TensorShape& outputShape, + bool* sharesOutputBuffer) noexcept override; + private: // For shape info, in addition to the info const EdgeShapes* m_inferredOutputShapes = nullptr; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp index bb45cdfef25c1..c55cf60abb873 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp @@ -33,18 +33,16 @@ class DmlOperatorCopy : public DmlOperator // (e.g. squeeze/unsqueeze/reshape). An exception to this rule is when the operator is part of the graph, // in which case we always need to compile and execute the operator (although this is something that we // could optimize in the future). 
- - bool aliasing = false; - if (!contextPrivate->IsDmlGraphNode()) { std::vector outputSizes = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); std::vector outputSizesInt64(outputSizes.begin(), outputSizes.end()); onnxruntime::TensorShape outputShape(outputSizesInt64); - ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputAliasesOutput(0, 0, outputShape, &aliasing)); + ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputAliasesOutput(0, 0, outputShape, &m_aliasing)); + ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputSharesOutputBuffer(0, 0, outputShape, &m_inputSharesOutputBuffer)); } - if (!aliasing) + if (contextPrivate->IsDmlGraphNode() || (!m_aliasing && m_inputSharesOutputBuffer)) { std::vector inputDescs = GetDmlInputDescs(); std::vector outputDescs = GetDmlOutputDescs(); @@ -59,15 +57,34 @@ class DmlOperatorCopy : public DmlOperator void Compute(const MLOperatorKernelContext& kernelContext) final { - MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0); - MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0); + // If the input is aliasing the output, we don't need to do anything here + if (m_aliasing) + { + return; + } - // Avoid self copying. 
- if (inputTensor.GetByteData() != outputTensor.GetByteData()) + // If the input and the output share the same buffer, we need to do an identity operation + if (m_inputSharesOutputBuffer) { DmlOperator::Compute(kernelContext); + return; } + + // If the input and the output don't share the same buffer, we can do a standard copy operation instead + MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0); + MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0); + + ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor( + outputTensor.GetInterface().Get(), + inputTensor.GetInterface().Get())); } + +private: + // Aliasing means that both the input and the output start at the same exact offset in the same buffer + bool m_aliasing = false; + + // The choice of using Identity or a copy depends on whether the input and the input are located in the same buffer + bool m_inputSharesOutputBuffer = false; }; DML_OP_DEFINE_CREATION_FUNCTION(Copy, DmlOperatorCopy); diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h index d86fdff8ac7e1..5640c9f30283b 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h @@ -118,6 +118,13 @@ IMLOperatorKernelCreationContextNodeWrapperPrivate : public IMLOperatorKernelCre _In_ const onnxruntime::TensorShape& outputShape, _Out_ bool* aliasing ) noexcept PURE; + + STDMETHOD(InputSharesOutputBuffer)( + _In_ uint32_t inputIndex, + _In_ uint32_t outputIndex, + _In_ const onnxruntime::TensorShape& outputShape, + _Out_ bool* sharesBuffer + ) noexcept PURE; }; //! 
\interface IMLOperatorAttributes1 From 26a94e17f96a2ed5629426a8f03a6a8ac3a275d4 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 3 Aug 2023 10:41:36 -0700 Subject: [PATCH 64/76] Fix output binding crash --- onnxruntime/core/framework/utils.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 3d67e49e155d9..9ad0bbdac2305 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -162,8 +162,12 @@ static Status BatchOrCopyMLValue(const SessionState& session_state, } #ifdef USE_DML + const bool bothValuesOnGPU = copy_info.source_device.Type() == OrtDevice::GPU && copy_info.target_device.Type() == OrtDevice::GPU; + const bool targetIsInternalAlloc = copy_info.target_device.MemType() == OrtDevice::MemType::DEFAULT; + const bool bothValuesOnSameDevice = copy_info.source_device.Id() == copy_info.target_device.Id(); + // The DML EP supports binding external allocations directly, even if the memory types don't match, as long as they are on the same D3D12 device - if (copy_info.source_device.Type() == OrtDevice::GPU && copy_info.target_device.Type() == OrtDevice::GPU && copy_info.source_device.Id() == copy_info.target_device.Id()) { + if (bothValuesOnGPU && targetIsInternalAlloc && bothValuesOnSameDevice) { target_mlvalue = source_mlvalue; return Status::OK(); } From 31270e6fd594a03016aa184b4437848e38d743fe Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 3 Aug 2023 20:48:08 -0700 Subject: [PATCH 65/76] Fix test failures --- .../src/MLOperatorAuthorImpl.cpp | 61 ------------------- .../src/MLOperatorAuthorImpl.h | 12 ---- .../src/Operators/DmlOperatorCopy.cpp | 59 ++++++++---------- .../MLOperatorAuthorPrivate.h | 14 ----- 4 files changed, 24 insertions(+), 122 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 0f29409df1e2b..4b749acf4ae33 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1335,67 +1335,6 @@ namespace Windows::AI::MachineLearning::Adapter return m_allowOutputShapeQuery; } - HRESULT STDMETHODCALLTYPE OpKernelInfoWrapper::InputAliasesOutput( - uint32_t inputIndex, - uint32_t outputIndex, - const onnxruntime::TensorShape& outputShape, - bool* aliasing) noexcept - { - ORT_TRY - { - if (!m_kernelContext) - { - *aliasing = false; - return S_OK; - } - - auto inputData = m_kernelContext->Input(inputIndex)->DataRaw(); - auto outputData = m_kernelContext->Output(outputIndex, outputShape)->DataRaw(); - *aliasing = inputData == outputData; - return S_OK; - } - ORT_CATCH_RETURN - } - - HRESULT STDMETHODCALLTYPE OpKernelInfoWrapper::InputSharesOutputBuffer( - uint32_t inputIndex, - uint32_t outputIndex, - const onnxruntime::TensorShape& outputShape, - bool* sharesOutputBuffer) noexcept - { - ORT_TRY - { - if (!m_kernelContext) - { - *sharesOutputBuffer = false; - return S_OK; - } - - auto inputTensor = const_cast(m_kernelContext->Input(inputIndex)); - auto outputTensor = m_kernelContext->Output(outputIndex, outputShape); - - // Null input or output data means that the tensors are empty (i.e. 
one of the dimensions is 0) - if (inputTensor->DataRaw() == nullptr || outputTensor->DataRaw() == nullptr) - { - *sharesOutputBuffer = false; - return S_OK; - } - - auto inputWrapper = wil::MakeOrThrow(inputTensor, true, m_winmlProvider.Get(), true); - auto outputWrapper = wil::MakeOrThrow(outputTensor, true, m_winmlProvider.Get(), true); - - ComPtr inputResource; - inputWrapper->GetDataInterface(inputResource.GetAddressOf()); - - ComPtr outputResource; - outputWrapper->GetDataInterface(outputResource.GetAddressOf()); - - *sharesOutputBuffer = inputResource.Get() == outputResource.Get(); - return S_OK; - } - ORT_CATCH_RETURN - } - DmlGraphOpKernelInfoWrapper::DmlGraphOpKernelInfoWrapper( const onnxruntime::OpNodeProtoHelper* protoHelper, const void* executionHandle, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index e12a4435bd747..4f982c80c4c5c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -405,18 +405,6 @@ class OpKernelInfoWrapper : public OpNodeInfoWrapper< return m_winmlProvider.CopyTo(executionProvider); } - HRESULT STDMETHODCALLTYPE InputAliasesOutput( - uint32_t inputIndex, - uint32_t outputIndex, - const onnxruntime::TensorShape& outputShape, - bool* aliasing) noexcept override; - - HRESULT STDMETHODCALLTYPE InputSharesOutputBuffer( - uint32_t inputIndex, - uint32_t outputIndex, - const onnxruntime::TensorShape& outputShape, - bool* sharesOutputBuffer) noexcept override; - private: // For shape info, in addition to the info const EdgeShapes* m_inferredOutputShapes = nullptr; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp index c55cf60abb873..8fa3c74674776 100644 
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp @@ -29,54 +29,43 @@ class DmlOperatorCopy : public DmlOperator ComPtr contextPrivate; ORT_THROW_IF_FAILED(kernelInfo.GetInterface()->QueryInterface(contextPrivate.GetAddressOf())); - // We don't need to compile any operator if the input aliases the output as it is essentially a no-op - // (e.g. squeeze/unsqueeze/reshape). An exception to this rule is when the operator is part of the graph, - // in which case we always need to compile and execute the operator (although this is something that we - // could optimize in the future). - if (!contextPrivate->IsDmlGraphNode()) - { - std::vector outputSizes = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); - std::vector outputSizesInt64(outputSizes.begin(), outputSizes.end()); - onnxruntime::TensorShape outputShape(outputSizesInt64); - ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputAliasesOutput(0, 0, outputShape, &m_aliasing)); - ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputSharesOutputBuffer(0, 0, outputShape, &m_inputSharesOutputBuffer)); - } - - if (contextPrivate->IsDmlGraphNode() || (!m_aliasing && m_inputSharesOutputBuffer)) - { - std::vector inputDescs = GetDmlInputDescs(); - std::vector outputDescs = GetDmlOutputDescs(); + // Although we always compile the operator because we don't know where the memory will be allocated in the future, + // we may not always end up executing it. 
+ std::vector inputDescs = GetDmlInputDescs(); + std::vector outputDescs = GetDmlOutputDescs(); - DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {}; - opDesc.InputTensor = inputDescs.data(); - opDesc.OutputTensor = outputDescs.data(); + DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {}; + opDesc.InputTensor = inputDescs.data(); + opDesc.OutputTensor = outputDescs.data(); - SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo); - } + SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo); } void Compute(const MLOperatorKernelContext& kernelContext) final { - // If the input is aliasing the output, we don't need to do anything here - if (m_aliasing) + MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0); + MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0); + + // If the input is aliasing the output (i.e. they share the same resource at the same offset), + // we don't need to do anything. This is essentially a no-op. + if (inputTensor.GetByteData() == outputTensor.GetByteData()) { return; } - // If the input and the output share the same buffer, we need to do an identity operation - if (m_inputSharesOutputBuffer) + // If the input is not aliasing the output but shares the same resource, we have to use an Identity operation + // because the resource cannot simultaneously be in both the COPY_SOURCE and COPY_DEST states. 
+ if (inputTensor.GetDataInterface().Get() == outputTensor.GetDataInterface().Get()) { DmlOperator::Compute(kernelContext); - return; } - - // If the input and the output don't share the same buffer, we can do a standard copy operation instead - MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0); - MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0); - - ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor( - outputTensor.GetInterface().Get(), - inputTensor.GetInterface().Get())); + else + { + // The input and the output don't share the same resource, so we can do a simple copy. + ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor( + outputTensor.GetInterface().Get(), + inputTensor.GetInterface().Get())); + } } private: diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h index 5640c9f30283b..9b4536b6218b2 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h @@ -111,20 +111,6 @@ IMLOperatorKernelCreationContextNodeWrapperPrivate : public IMLOperatorKernelCre STDMETHOD(GetExecutionProvider)( _Outptr_result_maybenull_ IUnknown** executionProvider ) const noexcept PURE; - - STDMETHOD(InputAliasesOutput)( - _In_ uint32_t inputIndex, - _In_ uint32_t outputIndex, - _In_ const onnxruntime::TensorShape& outputShape, - _Out_ bool* aliasing - ) noexcept PURE; - - STDMETHOD(InputSharesOutputBuffer)( - _In_ uint32_t inputIndex, - _In_ uint32_t outputIndex, - _In_ const onnxruntime::TensorShape& outputShape, - _Out_ bool* sharesBuffer - ) noexcept PURE; }; //! 
\interface IMLOperatorAttributes1 From 738efb7dc3b5804ef43ec65faa832585cc5b4234 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Fri, 4 Aug 2023 19:07:58 -0700 Subject: [PATCH 66/76] Fix upload heap regression --- .../src/PooledUploadHeap.cpp | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp index 442b3e7ddf746..4a222d183bcfd 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp @@ -118,8 +118,23 @@ namespace Dml std::pair PooledUploadHeap::Reserve(size_t sizeInBytes) { + // Try to find a chunk with enough free space to accommodate the requested allocation size + for (Chunk& chunk : m_chunks) + { + std::optional offsetForAllocation = FindOffsetForAllocation(chunk, sizeInBytes); + if (offsetForAllocation) + { + // There's enough space in this chunk - return + return std::make_pair(&chunk, *offsetForAllocation); + } + } + + // No chunks were able to accommodate the allocation - create a new chunk and return that instead + // At least double the capacity of the pool - m_chunks.push_back(CreateChunk(m_device.Get(), sizeInBytes)); + const size_t newChunkSize = std::max({ m_totalCapacity, c_minChunkSize, sizeInBytes }); + m_chunks.push_back(CreateChunk(m_device.Get(), newChunkSize)); + m_totalCapacity += newChunkSize; // Allocate from the beginning of the new chunk return std::make_pair(&m_chunks.back(), 0); @@ -197,6 +212,13 @@ namespace Dml return c.allocations.empty(); }); m_chunks.erase(it, m_chunks.end()); + + // Re-calculate total capacity + m_totalCapacity = 0; + for (const auto& chunk : m_chunks) + { + m_totalCapacity += chunk.capacityInBytes; + } } void PooledUploadHeap::AssertInvariants() @@ -208,7 +230,7 @@ namespace Dml }; // Chunks should be sorted 
by ascending capacity - // assert(std::is_sorted(m_chunks.begin(), m_chunks.end(), chunkCapacityComparer)); + assert(std::is_sorted(m_chunks.begin(), m_chunks.end(), chunkCapacityComparer)); // Allocations in a chunk should be sorted by ascending fence value for (const auto& chunk : m_chunks) @@ -254,6 +276,14 @@ namespace Dml } } + // Validate total capacity of pool + size_t calculatedCapacity = 0; + for (const auto& chunk : m_chunks) + { + calculatedCapacity += chunk.capacityInBytes; + } + assert(calculatedCapacity == m_totalCapacity); + #endif // #ifdef _DEBUG } } // namespace Dml From f64ed2b5082ff21dfe7246f9055dded0c06a6df0 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Sun, 6 Aug 2023 14:03:09 -0700 Subject: [PATCH 67/76] Address PR comments --- onnxruntime/core/framework/utils.cc | 3 +- .../inc/DmlExecutionProvider.h | 2 +- .../src/BucketizedBufferAllocator.cpp | 14 +- .../src/BucketizedBufferAllocator.h | 4 +- .../src/DmlBfcAllocator.h | 2 +- .../DmlExecutionProvider/src/DmlBuffer.cpp | 26 +- .../dml/DmlExecutionProvider/src/DmlBuffer.h | 11 +- .../src/DmlBufferRegion.cpp | 30 +-- .../src/DmlBufferRegion.h | 19 +- .../src/DmlExternalGpuAllocator.cpp | 4 +- .../src/DmlExternalGpuAllocator.h | 2 +- .../src/DmlGpuAllocator.cpp | 12 +- .../src/DmlGpuAllocator.h | 4 +- .../src/DmlHeapAllocation.h | 2 +- .../src/DmlReservedResourceSubAllocator.cpp | 226 ++++++++---------- .../src/DmlReservedResourceSubAllocator.h | 44 ++-- .../src/DmlReservedResourceWrapper.h | 2 +- .../src/DmlTaggedPointer.cpp | 34 ++- .../src/DmlTaggedPointer.h | 25 +- 19 files changed, 220 insertions(+), 246 deletions(-) diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 9ad0bbdac2305..7e11cba608f4d 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -163,11 +163,12 @@ static Status BatchOrCopyMLValue(const SessionState& session_state, #ifdef USE_DML const bool bothValuesOnGPU = 
copy_info.source_device.Type() == OrtDevice::GPU && copy_info.target_device.Type() == OrtDevice::GPU; + const bool sourceIsDmlAlloc = copy_info.source_device.MemType() == OrtDevice::MemType::DEFAULT || copy_info.source_device.MemType() == OrtDevice::MemType::DML_EXTERNAL; const bool targetIsInternalAlloc = copy_info.target_device.MemType() == OrtDevice::MemType::DEFAULT; const bool bothValuesOnSameDevice = copy_info.source_device.Id() == copy_info.target_device.Id(); // The DML EP supports binding external allocations directly, even if the memory types don't match, as long as they are on the same D3D12 device - if (bothValuesOnGPU && targetIsInternalAlloc && bothValuesOnSameDevice) { + if (bothValuesOnGPU && sourceIsDmlAlloc && targetIsInternalAlloc && bothValuesOnSameDevice) { target_mlvalue = source_mlvalue; return Status::OK(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index 755bf60195e2e..9ecfec4139756 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -32,7 +32,7 @@ namespace Dml bool enableMetacommands, bool enableBfcAllocator); - D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t size_in_bytes); + D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t sizeInBytes); void FlushContext(onnxruntime::IExecutionProvider* provider); void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode); void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 675f17e4c28af..f8851c1b87a4f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -58,16 +58,16 @@ namespace Dml gsl::index index = static_cast(ceil(log2(size))); assert((1ull << index) >= size); // This must be true unless there were some strange rounding issues - // The smallest bucket is 2^n bytes large, where n = c_minResourceSizeExponent - index = std::max(index, c_minResourceSizeExponent); - index -= c_minResourceSizeExponent; + // The smallest bucket is 2^n bytes large, where n = MinResourceSizeExponent + index = std::max(index, MinResourceSizeExponent); + index -= MinResourceSizeExponent; return index; } /*static*/ uint64_t BucketizedBufferAllocator::GetBucketSizeFromIndex(gsl::index index) { - return (1ull << (index + c_minResourceSizeExponent)); + return (1ull << (index + MinResourceSizeExponent)); } ComPtr BucketizedBufferAllocator::AllocCommittedResource(size_t size) @@ -93,15 +93,15 @@ namespace Dml return static_cast(opaquePointer); } - D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) const + D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(void* opaquePointer, uint64_t sizeInBytes) const { auto allocationInfo = static_cast(opaquePointer); // Make sure that we are aligned to 4 bytes to satisfy DML's requirements constexpr uint64_t DML_ALIGNMENT = 4; - size_in_bytes = (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; + sizeInBytes = (1 + (sizeInBytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; - return D3D12BufferRegion(0, size_in_bytes, allocationInfo->GetD3D12Resource()); + return D3D12BufferRegion(0, sizeInBytes, allocationInfo->GetD3D12Resource()); } void* BucketizedBufferAllocator::Alloc(size_t size) diff --git 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 899c8dd44182d..d0b905c45c3c7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -37,7 +37,7 @@ namespace Dml void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode); AllocationInfo* GetAllocationInfo(void* opaquePointer); - D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) const; + D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t sizeInBytes) const; uint64_t GetUniqueId(void* opaquePointer); public: // onnxruntime::IAllocator @@ -45,7 +45,7 @@ namespace Dml void Free(void* p) final; private: - static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB + static const uint32_t MinResourceSizeExponent = 16; // 2^16 = 64KB // The pool consists of a number of buckets, and each bucket contains a number of resources of the same size. 
// The resources in each bucket are always sized as a power of two, and each bucket contains resources twice diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h index 43e093538fcb6..d8631c1e9c1d0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h @@ -21,7 +21,7 @@ namespace Dml ), m_subAllocator(std::move(subAllocator)) {} - void* Alloc(size_t size_in_bytes) final { return m_subAllocator->Alloc(size_in_bytes); } + void* Alloc(size_t sizeInBytes) final { return m_subAllocator->Alloc(sizeInBytes); } void Free(void* ptr) final { m_subAllocator->Free(ptr); } private: std::shared_ptr m_subAllocator; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp index 21b5da96ce236..298227b54d947 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp @@ -8,58 +8,58 @@ namespace Dml { -/*explicit*/ DmlBuffer::DmlBuffer(DmlGpuAllocator* allocator, uint64_t size_in_bytes) - : allocator_(allocator) +/*explicit*/ DmlBuffer::DmlBuffer(DmlGpuAllocator* allocator, uint64_t sizeInBytes) + : m_allocator(allocator) { - m_opaqueData = allocator_->Alloc(size_in_bytes); + m_opaqueData = m_allocator->Alloc(sizeInBytes); ORT_THROW_HR_IF(E_OUTOFMEMORY, m_opaqueData == nullptr); - buffer_region_ = allocator_->CreateBufferRegion(m_opaqueData, size_in_bytes); + m_bufferRegion = m_allocator->CreateBufferRegion(m_opaqueData, sizeInBytes); } DmlBuffer::~DmlBuffer() { if (m_opaqueData != nullptr) { - allocator_->Free(m_opaqueData); + m_allocator->Free(m_opaqueData); } } DmlBuffer::DmlBuffer(DmlBuffer&& other) noexcept { m_opaqueData = other.m_opaqueData; - allocator_ = 
other.allocator_; - buffer_region_ = std::move(other.buffer_region_); + m_allocator = other.m_allocator; + m_bufferRegion = std::move(other.m_bufferRegion); other.m_opaqueData = nullptr; } DmlBuffer& DmlBuffer::operator=(DmlBuffer&& other) noexcept { m_opaqueData = other.m_opaqueData; - allocator_ = other.allocator_; - buffer_region_ = std::move(other.buffer_region_); + m_allocator = other.m_allocator; + m_bufferRegion = std::move(other.m_bufferRegion); other.m_opaqueData = nullptr; return *this; } ID3D12Resource* DmlBuffer::GetD3D12Resource() const { - return buffer_region_.GetD3D12Resource(); + return m_bufferRegion.GetD3D12Resource(); } uint64_t DmlBuffer::Offset() const { - return buffer_region_ ? buffer_region_.Offset() : 0; + return m_bufferRegion ? m_bufferRegion.Offset() : 0; } uint64_t DmlBuffer::SizeInBytes() const { - return buffer_region_ ? buffer_region_.SizeInBytes() : 0; + return m_bufferRegion ? m_bufferRegion.SizeInBytes() : 0; } DML_BUFFER_BINDING DmlBuffer::GetBufferBinding() const { - return buffer_region_ ? buffer_region_.GetBufferBinding() + return m_bufferRegion ? 
m_bufferRegion.GetBufferBinding() : DML_BUFFER_BINDING{}; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h index 019d186441da5..e7b570d365a62 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h @@ -18,7 +18,7 @@ class OpKernelContext; class DmlBuffer { public: - explicit DmlBuffer(DmlGpuAllocator* allocator, uint64_t size_in_bytes); + explicit DmlBuffer(DmlGpuAllocator* allocator, uint64_t sizeInBytes); ~DmlBuffer(); // Move-only @@ -30,15 +30,14 @@ class DmlBuffer ID3D12Resource* GetD3D12Resource() const; uint64_t Offset() const; uint64_t SizeInBytes() const; - const D3D12BufferRegion& Region() const { return buffer_region_; } - + const D3D12BufferRegion& Region() const { return m_bufferRegion; } DML_BUFFER_BINDING GetBufferBinding() const; - explicit operator bool() const { return !!buffer_region_; } + explicit operator bool() const { return !!m_bufferRegion; } private: - DmlGpuAllocator* allocator_; - D3D12BufferRegion buffer_region_; + DmlGpuAllocator* m_allocator; + D3D12BufferRegion m_bufferRegion; void* m_opaqueData; }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp index 57c4d5b342bb8..627e383a17195 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp @@ -7,38 +7,38 @@ namespace Dml { - D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource) + D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t sizeInBytes, ID3D12Resource* resource) : m_resource(resource), - offset_(offset), - size_in_bytes_(size_in_bytes) + m_offset(offset), + m_sizeInBytes(sizeInBytes) { 
ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr); // Regions cannot be empty. - ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0); + ORT_THROW_HR_IF(E_INVALIDARG, m_sizeInBytes == 0); // Regions cannot extend beyond the size of the resource. - uint64_t buffer_size = m_resource->GetDesc().Width; - ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size); - ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset); + uint64_t bufferSize = m_resource->GetDesc().Width; + ORT_THROW_HR_IF(E_INVALIDARG, m_offset >= bufferSize); + ORT_THROW_HR_IF(E_INVALIDARG, m_sizeInBytes > bufferSize - offset); // All three resources, if provided, must be identical aside from state. assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER); - assert(m_resource->GetDesc().Width == buffer_size); + assert(m_resource->GetDesc().Width == bufferSize); } D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) noexcept { std::swap(this->m_resource, that.m_resource); - std::swap(this->offset_, that.offset_); - std::swap(this->size_in_bytes_, that.size_in_bytes_); + std::swap(this->m_offset, that.m_offset); + std::swap(this->m_sizeInBytes, that.m_sizeInBytes); } D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) noexcept { std::swap(this->m_resource, that.m_resource); - std::swap(this->offset_, that.offset_); - std::swap(this->size_in_bytes_, that.size_in_bytes_); + std::swap(this->m_offset, that.m_offset); + std::swap(this->m_sizeInBytes, that.m_sizeInBytes); return *this; } @@ -49,12 +49,12 @@ namespace Dml uint64_t D3D12BufferRegion::Offset() const { - return m_resource ? offset_ : 0; + return m_resource ? m_offset : 0; } uint64_t D3D12BufferRegion::SizeInBytes() const { - return m_resource ? size_in_bytes_ : 0; + return m_resource ? 
m_sizeInBytes : 0; } DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const @@ -64,7 +64,7 @@ namespace Dml return DML_BUFFER_BINDING{}; } - return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_}; + return DML_BUFFER_BINDING{m_resource, m_offset, m_sizeInBytes}; } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h index 40c41f980b011..d14ff1b51b3f9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h @@ -16,10 +16,7 @@ namespace Dml // References a region of a buffer. The respective ID3D12Resource objects // must be in the appropriate states. Each resource is optional, but if more // than one are provided they must map to the same region of memory. - D3D12BufferRegion( - uint64_t offset, - uint64_t size_in_bytes, - ID3D12Resource* resource); + D3D12BufferRegion(uint64_t offset, uint64_t sizeInBytes, ID3D12Resource* resource); // Move-only D3D12BufferRegion(const D3D12BufferRegion&) = default; @@ -37,21 +34,21 @@ namespace Dml // Creates a subregion at an offset from the start of this region. If no // size is provided the region runs to the end of the current region. - inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const + inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t sizeInBytes = 0) const { // start of subregion must be within current region - ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_); - size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes; + ORT_THROW_HR_IF(E_INVALIDARG, offset >= m_sizeInBytes); + sizeInBytes = sizeInBytes == 0 ? 
m_sizeInBytes - offset : sizeInBytes; // end of subregion must be within current region - ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset); + ORT_THROW_HR_IF(E_INVALIDARG, sizeInBytes > m_sizeInBytes - offset); - return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource); + return D3D12BufferRegion(m_offset + offset, sizeInBytes, m_resource); } private: ID3D12Resource* m_resource = nullptr; - uint64_t offset_ = 0; - uint64_t size_in_bytes_ = 0; + uint64_t m_offset = 0; + uint64_t m_sizeInBytes = 0; }; } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp index 0cb8e36581672..3882823629854 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp @@ -36,10 +36,10 @@ namespace Dml m_device = onnxruntime::DMLProviderFactoryCreator::CreateD3D12Device(device_id, false); } - void* DmlExternalGpuAllocator::Alloc(size_t size_in_bytes) + void* DmlExternalGpuAllocator::Alloc(size_t sizeInBytes) { Microsoft::WRL::ComPtr resource; - auto buffer = CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + auto buffer = CD3DX12_RESOURCE_DESC::Buffer(sizeInBytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); auto props = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT); ORT_THROW_IF_FAILED(m_device->CreateCommittedResource( &props, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h index 7ac1cc9510b10..3d61bee211949 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h @@ -17,7 +17,7 @@ 
namespace Dml DmlExternalGpuAllocator(ID3D12Device* device); DmlExternalGpuAllocator(int device_id); - void* Alloc(size_t size_in_bytes) final; + void* Alloc(size_t sizeInBytes) final; void Free(void* ptr) final; private: diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp index 881478c3e874f..b0ddbd2b155ff 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp @@ -31,14 +31,14 @@ namespace Dml m_bfcSubAllocator(bfcSubAllocator), m_activeAllocator(activeAllocator) {} - void* DmlGpuAllocator::Alloc(size_t size_in_bytes) + void* DmlGpuAllocator::Alloc(size_t sizeInBytes) { switch(m_activeAllocator) { case ActiveAllocator::BfcAllocator: - return m_bfcAllocator->Alloc(size_in_bytes); + return m_bfcAllocator->Alloc(sizeInBytes); case ActiveAllocator::BucketizedBufferAllocator: - return m_bucketizedBufferAllocator->Alloc(size_in_bytes); + return m_bucketizedBufferAllocator->Alloc(sizeInBytes); default: ORT_THROW_HR(E_UNEXPECTED); } @@ -57,14 +57,14 @@ namespace Dml } } - D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) + D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(void* opaquePointer, uint64_t sizeInBytes) { switch(m_activeAllocator) { case ActiveAllocator::BfcAllocator: - return m_bfcSubAllocator->CreateBufferRegion(opaquePointer, size_in_bytes); + return m_bfcSubAllocator->CreateBufferRegion(opaquePointer, sizeInBytes); case ActiveAllocator::BucketizedBufferAllocator: - return m_bucketizedBufferAllocator->CreateBufferRegion(opaquePointer, size_in_bytes); + return m_bucketizedBufferAllocator->CreateBufferRegion(opaquePointer, sizeInBytes); default: ORT_THROW_HR(E_UNEXPECTED); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h index e8b020a85767b..dda5f1984da69 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h @@ -29,9 +29,9 @@ namespace Dml std::shared_ptr bfcSubAllocator, ActiveAllocator activeAllocator); - void* Alloc(size_t size_in_bytes) final; + void* Alloc(size_t sizeInBytes) final; void Free(void* ptr) final; - D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes); + D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t sizeInBytes); AllocationInfo* GetAllocationInfo(void* opaquePointer); void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode); DmlBuffer AllocateDefaultBuffer(uint64_t num_bytes); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h index ab75b7d322120..5ecf135a9ee43 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h @@ -13,6 +13,6 @@ namespace Dml // an allocation may comprise multiple heaps. If tiling is not supported // an allocation will only have a single heap. 
std::vector> heaps; - Microsoft::WRL::ComPtr resource_uav_state; + Microsoft::WRL::ComPtr resourceUavState; }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp index 4fdd6411d555a..cb58c30283e95 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp @@ -33,16 +33,16 @@ namespace Dml gsl::index index = static_cast(ceil(log2(size))); assert((1ull << index) >= size); // This must be true unless there were some strange rounding issues - // The smallest bucket is 2^n bytes large, where n = c_minResourceSizeExponent - index = std::max(index, c_minResourceSizeExponent); - index -= c_minResourceSizeExponent; + // The smallest bucket is 2^n bytes large, where n = MinResourceSizeExponent + index = std::max(index, MinResourceSizeExponent); + index -= MinResourceSizeExponent; return index; } /*static*/ uint64_t DmlReservedResourceSubAllocator::GetBucketSizeFromIndex(gsl::index index) { - return (1ull << (index + c_minResourceSizeExponent)); + return (1ull << (index + MinResourceSizeExponent)); } void DmlReservedResourceSubAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) @@ -53,10 +53,7 @@ namespace Dml static bool GetTilingEnabled(ID3D12Device* device) { D3D12_FEATURE_DATA_D3D12_OPTIONS options = {}; - if (SUCCEEDED(device->CheckFeatureSupport( - D3D12_FEATURE_D3D12_OPTIONS, - &options, - sizeof(options)))) + if (SUCCEEDED(device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS, &options, sizeof(options)))) { return options.TiledResourcesTier >= D3D12_TILED_RESOURCES_TIER_1; } @@ -66,91 +63,86 @@ namespace Dml static uint64_t GetMaxHeapSizeInTiles() { - return DmlReservedResourceSubAllocator::kDefaultMaxHeapSizeInTiles; + return 
DmlReservedResourceSubAllocator::DefaultMaxHeapSizeInTiles; } DmlReservedResourceSubAllocator::DmlReservedResourceSubAllocator( ID3D12Device* device, std::shared_ptr context, ID3D12CommandQueue* queue, - const D3D12_HEAP_PROPERTIES& heap_props, - D3D12_HEAP_FLAGS heap_flags, - D3D12_RESOURCE_FLAGS resource_flags, - D3D12_RESOURCE_STATES initial_state) + const D3D12_HEAP_PROPERTIES& heapProps, + D3D12_HEAP_FLAGS heapFlags, + D3D12_RESOURCE_FLAGS resourceFlags, + D3D12_RESOURCE_STATES initialState) : m_device(device), m_context(context), - queue_(queue), - heap_properties_(heap_props), - heap_flags_(heap_flags), - resource_flags_(resource_flags), - initial_state_(initial_state), - tiling_enabled_(GetTilingEnabled(device)), - max_heap_size_in_tiles_(GetMaxHeapSizeInTiles()) + m_queue(queue), + m_heapProperties(heapProps), + m_heapFlags(heapFlags), + m_resourceFlags(resourceFlags), + m_initialState(initialState), + m_tilingEnabled(GetTilingEnabled(device)), + m_maxHeapSizeInTiles(GetMaxHeapSizeInTiles()) { } - absl::optional DmlReservedResourceSubAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes) + absl::optional DmlReservedResourceSubAllocator::TryCreateTiledAllocation(uint64_t sizeInBytes) { DmlHeapAllocation allocation = {}; // The allocation may be larger than the requested size to ensure a whole // number of tiles. 
- const uint64_t resource_size_in_tiles = 1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - const uint64_t resource_size_in_bytes = resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - auto resource_desc = CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_); + const uint64_t resourceSizeInTiles = 1 + (sizeInBytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + const uint64_t resourceSizeInBytes = resourceSizeInTiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + auto resourceDesc = CD3DX12_RESOURCE_DESC::Buffer(resourceSizeInBytes, m_resourceFlags); - HRESULT create_resource_hr = m_device->CreateReservedResource( - &resource_desc, - initial_state_, + HRESULT createResourceHr = m_device->CreateReservedResource( + &resourceDesc, + m_initialState, nullptr, - IID_PPV_ARGS(&allocation.resource_uav_state)); + IID_PPV_ARGS(&allocation.resourceUavState)); - if (create_resource_hr == E_OUTOFMEMORY) + if (createResourceHr == E_OUTOFMEMORY) { return absl::nullopt; } - ORT_THROW_IF_FAILED(create_resource_hr); + ORT_THROW_IF_FAILED(createResourceHr); // Reserve enough heaps to store all tiles in the resource. - const uint64_t heap_count = 1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_; - allocation.heaps.resize(heap_count); + const uint64_t heapCount = 1 + (resourceSizeInTiles - 1) / m_maxHeapSizeInTiles; + allocation.heaps.resize(heapCount); // Create heaps and map them to the primary reserved resource. - D3D12_TILED_RESOURCE_COORDINATE resource_region_start_coordinates = {}; - uint64_t unmapped_resource_tiles = resource_size_in_tiles; - for (uint64_t i = 0; i < heap_count; i++) + D3D12_TILED_RESOURCE_COORDINATE resourceRegionStartCoordinates = {}; + uint64_t unmappedResourceTiles = resourceSizeInTiles; + for (uint64_t i = 0; i < heapCount; i++) { // Create heap. The last heap of the allocation may have fewer tiles to // avoid wasting space. 
- uint64_t heap_size_in_tiles = std::min( - unmapped_resource_tiles, - max_heap_size_in_tiles_); - uint64_t heap_size_in_bytes = - heap_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + uint64_t heapSizeInTiles = std::min(unmappedResourceTiles, m_maxHeapSizeInTiles); + uint64_t heapSizeInBytes = heapSizeInTiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; auto heap_desc = CD3DX12_HEAP_DESC( - heap_size_in_bytes, - heap_properties_, + heapSizeInBytes, + m_heapProperties, 0, - heap_flags_); + m_heapFlags); - HRESULT create_heap_hr = - m_device->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i])); - if (create_heap_hr == E_OUTOFMEMORY) + HRESULT createHeapHr = m_device->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i])); + if (createHeapHr == E_OUTOFMEMORY) { return absl::nullopt; } - ORT_THROW_IF_FAILED(create_heap_hr); + ORT_THROW_IF_FAILED(createHeapHr); // Source region in the resource to map. - D3D12_TILE_REGION_SIZE resource_region_size = {}; - resource_region_size.NumTiles = static_cast(heap_size_in_tiles); + D3D12_TILE_REGION_SIZE resourceRegionSize = {}; + resourceRegionSize.NumTiles = static_cast(heapSizeInTiles); // Target range in the current heap to map. - const D3D12_TILE_RANGE_FLAGS tile_range_flags = - D3D12_TILE_RANGE_FLAG_NONE; - const uint32_t heap_range_tile_count = static_cast(heap_size_in_tiles); + constexpr D3D12_TILE_RANGE_FLAGS tileRangeFlags = D3D12_TILE_RANGE_FLAG_NONE; + const uint32_t heapRangeTileCount = static_cast(heapSizeInTiles); - constexpr uint32_t heap_range_start_offset = 0; + constexpr uint32_t heapRangeStartOffset = 0; constexpr uint32_t numResourceRegions = 1; constexpr uint32_t numHeapRanges = 1; @@ -158,88 +150,83 @@ namespace Dml // guaranteed to be set (on the GPU timeline) by the time any code can // reference the returned resource. We only execute operations on a // single hardware queue so there is no need to wait or signal. 
- queue_->UpdateTileMappings( - allocation.resource_uav_state.Get(), + m_queue->UpdateTileMappings( + allocation.resourceUavState.Get(), numResourceRegions, - &resource_region_start_coordinates, - &resource_region_size, + &resourceRegionStartCoordinates, + &resourceRegionSize, allocation.heaps[i].Get(), numHeapRanges, - &tile_range_flags, - &heap_range_start_offset, - &heap_range_tile_count, + &tileRangeFlags, + &heapRangeStartOffset, + &heapRangeTileCount, D3D12_TILE_MAPPING_FLAG_NONE); - resource_region_start_coordinates.X += static_cast(heap_size_in_tiles); - unmapped_resource_tiles -= heap_size_in_tiles; + resourceRegionStartCoordinates.X += static_cast(heapSizeInTiles); + unmappedResourceTiles -= heapSizeInTiles; } - assert(unmapped_resource_tiles == 0); + assert(unmappedResourceTiles == 0); return allocation; } - absl::optional DmlReservedResourceSubAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes) + absl::optional DmlReservedResourceSubAllocator::TryCreateUntiledAllocation(uint64_t sizeInBytes) { DmlHeapAllocation allocation = {}; // Create the allocation's sole heap. The allocation may be larger than the // requested size to ensure a whole number of tiles. allocation.heaps.resize(1); - D3D12_HEAP_DESC heap_desc = - CD3DX12_HEAP_DESC(size_in_bytes, heap_properties_, 0, heap_flags_); - HRESULT create_heap_hr = m_device->CreateHeap( - &heap_desc, - IID_PPV_ARGS(&allocation.heaps.front())); - if (create_heap_hr == E_OUTOFMEMORY) + D3D12_HEAP_DESC heap_desc = CD3DX12_HEAP_DESC(sizeInBytes, m_heapProperties, 0, m_heapFlags); + HRESULT createHeapHr = m_device->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps.front())); + if (createHeapHr == E_OUTOFMEMORY) { return absl::nullopt; } + ORT_THROW_IF_FAILED(createHeapHr); // Create large placed resource that spans the heap. 
- D3D12_RESOURCE_DESC resource_desc = CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_); + D3D12_RESOURCE_DESC resourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeInBytes, m_resourceFlags); - HRESULT create_resource_hr = m_device->CreatePlacedResource( + HRESULT createResourceHr = m_device->CreatePlacedResource( allocation.heaps.front().Get(), 0, - &resource_desc, - initial_state_, + &resourceDesc, + m_initialState, nullptr, - IID_PPV_ARGS(&allocation.resource_uav_state)); - if (create_resource_hr == E_OUTOFMEMORY) + IID_PPV_ARGS(&allocation.resourceUavState)); + if (createResourceHr == E_OUTOFMEMORY) { return absl::nullopt; } - ORT_THROW_IF_FAILED(create_resource_hr); + ORT_THROW_IF_FAILED(createResourceHr); return allocation; } uint64_t DmlReservedResourceSubAllocator::ComputeRequiredSize(size_t size) { - const uint64_t resource_size_in_tiles = - 1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - const uint64_t resource_size_in_bytes = - resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - - return resource_size_in_bytes; + const uint64_t resourceSizeInTiles = 1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + const uint64_t resourceSizeInBytes = resourceSizeInTiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + return resourceSizeInBytes; } - void* DmlReservedResourceSubAllocator::Alloc(size_t size_in_bytes) + void* DmlReservedResourceSubAllocator::Alloc(size_t sizeInBytes) { // For some reason lotus likes requesting 0 bytes of memory - size_in_bytes = std::max(1, size_in_bytes); + sizeInBytes = std::max(1, sizeInBytes); // The D3D12 device is thread-safe so we don't need to hold the lock while // creating an allocation. absl::optional allocation = - tiling_enabled_ ? TryCreateTiledAllocation(size_in_bytes) - : TryCreateUntiledAllocation(size_in_bytes); + m_tilingEnabled ? 
TryCreateTiledAllocation(sizeInBytes) + : TryCreateUntiledAllocation(sizeInBytes); ORT_THROW_HR_IF(E_INVALIDARG, !allocation); // We need to access (mutable) state after this point, so we need to lock - std::unique_lock lock(mutex_); + std::unique_lock lock(m_mutex); absl::optional allocationId = TryReserveAllocationID(); ORT_THROW_HR_IF(E_INVALIDARG, !allocationId); @@ -247,13 +234,13 @@ namespace Dml auto resourceWrapper = wil::MakeOrThrow(std::move(*allocation)); ComPtr allocInfo = wil::MakeOrThrow( this, - ++m_currentAllocationId, + ++m_currentUniqueAllocationId, 0, resourceWrapper.Get(), - size_in_bytes + sizeInBytes ); - allocations_by_id_.emplace(*allocationId, allocInfo); + m_allocationsById.emplace(*allocationId, allocInfo); lock.unlock(); @@ -262,28 +249,28 @@ namespace Dml #endif // DML only has a single device in ORT at the moment - constexpr uint64_t device_id = 0; + constexpr uint64_t deviceId = 0; constexpr uint64_t offset = 0; - return TaggedPointer::Pack(device_id, *allocationId, offset); + return TaggedPointer::Pack(deviceId, *allocationId, offset); } void DmlReservedResourceSubAllocator::Free(void* ptr) { ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr); - TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr); - ORT_THROW_HR_IF(E_INVALIDARG, tagged_ptr.offset != 0); + TaggedPointer taggedPtr = TaggedPointer::Unpack(ptr); + ORT_THROW_HR_IF(E_INVALIDARG, taggedPtr.offset != 0); // We need to access (mutable) state after this point, so we need to lock - std::unique_lock lock(mutex_); + std::unique_lock lock(m_mutex); - auto it = allocations_by_id_.find(tagged_ptr.allocation_id); - ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end()); + auto it = m_allocationsById.find(taggedPtr.allocationId); + ORT_THROW_HR_IF(E_INVALIDARG, it == m_allocationsById.end()); - ReleaseAllocationID(tagged_ptr.allocation_id); + ReleaseAllocationID(taggedPtr.allocationId); // Frees the ID3D12Heap - allocations_by_id_.erase(it); + m_allocationsById.erase(it); } 
uint64_t DmlReservedResourceSubAllocator::GetUniqueId(void* opaquePointer) @@ -313,60 +300,59 @@ namespace Dml absl::optional DmlReservedResourceSubAllocator::TryReserveAllocationID() { // The mutex must already be held - assert(!mutex_.try_lock()); + assert(!m_mutex.try_lock()); - if (!free_allocation_ids_.empty()) + if (!m_freeAllocationIds.empty()) { // Return a free ID from the pool - uint32_t id = free_allocation_ids_.back(); - free_allocation_ids_.pop_back(); + uint32_t id = m_freeAllocationIds.back(); + m_freeAllocationIds.pop_back(); return id; } - static constexpr uint32_t kMaxAllocationID = - (1 << TaggedPointer::kAllocationIDBits) - 1; - if (current_allocation_id_ == kMaxAllocationID) + static constexpr uint32_t maxAllocationID = (1 << TaggedPointer::AllocationIDBits) - 1; + if (m_currentAllocationId == maxAllocationID) { // We've reached the maximum number of allocations! return absl::nullopt; } - ++current_allocation_id_; - return current_allocation_id_; + ++m_currentAllocationId; + return m_currentAllocationId; } void DmlReservedResourceSubAllocator::ReleaseAllocationID(uint32_t id) { // The mutex must already be held - assert(!mutex_.try_lock()); + assert(!m_mutex.try_lock()); // Add it to the pool of free IDs - free_allocation_ids_.push_back(id); + m_freeAllocationIds.push_back(id); } D3D12BufferRegion DmlReservedResourceSubAllocator::CreateBufferRegion( void* opaquePointer, - uint64_t size_in_bytes) + uint64_t sizeInBytes) { auto taggedPointer = TaggedPointer::Unpack(opaquePointer); // We need to access (mutable) state after this point, so we need to lock - std::unique_lock lock(mutex_); + std::unique_lock lock(m_mutex); // Find the allocation corresponding to this pointer - auto it = allocations_by_id_.find(taggedPointer.allocation_id); - ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end()); + auto it = m_allocationsById.find(taggedPointer.allocationId); + ORT_THROW_HR_IF(E_INVALIDARG, it == m_allocationsById.end()); // Make sure that we 
are aligned to 4 bytes to satisfy DML's requirements constexpr uint64_t DML_ALIGNMENT = 4; - size_in_bytes = (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; + sizeInBytes = (1 + (sizeInBytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; // Make sure the region we're trying to create fits entirely in the resource - assert(it->second->GetD3D12Resource()->GetDesc().Width >= taggedPointer.offset + size_in_bytes); + assert(it->second->GetD3D12Resource()->GetDesc().Width >= taggedPointer.offset + sizeInBytes); return D3D12BufferRegion( taggedPointer.offset, - size_in_bytes, + sizeInBytes, it->second->GetD3D12Resource()); } @@ -375,10 +361,10 @@ namespace Dml auto taggedPointer = TaggedPointer::Unpack(opaquePointer); // We need to access (mutable) state after this point, so we need to lock - std::unique_lock lock(mutex_); + std::unique_lock lock(m_mutex); // Find the allocation corresponding to this pointer - auto it = allocations_by_id_.find(taggedPointer.allocation_id); + auto it = m_allocationsById.find(taggedPointer.allocationId); return it->second.Get(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index 3f2f1c9210c64..62b1f5b113ae4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -43,16 +43,16 @@ namespace Dml // Maximum size of a heap (in tiles) when allocations are tiled. Each tile // is 64KB. A default size of 512 tiles (32MB) does a good job of handling // local video memory fragmentation without requiring lots of heaps. 
- static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512; + static constexpr uint64_t DefaultMaxHeapSizeInTiles = 512; DmlReservedResourceSubAllocator( ID3D12Device* device, std::shared_ptr context, ID3D12CommandQueue* queue, - const D3D12_HEAP_PROPERTIES& heap_props, - D3D12_HEAP_FLAGS heap_flags, - D3D12_RESOURCE_FLAGS resource_flags, - D3D12_RESOURCE_STATES initial_state); + const D3D12_HEAP_PROPERTIES& heapProps, + D3D12_HEAP_FLAGS heapFlags, + D3D12_RESOURCE_FLAGS resourceFlags, + D3D12_RESOURCE_STATES initialState); // Creates a reserved or placed resource buffer over the given memory range. // The physical D3D12 resource may be larger than the requested size, so @@ -61,13 +61,13 @@ namespace Dml // the ID3D12Resource is cached, so this call typically has a lower cost // than a call to ID3D12Device::CreatePlacedResource or // CreateReservedResource. - D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes); + D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t sizeInBytes); AllocationInfo* GetAllocationInfo(void* opaquePointer); void FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) final; uint64_t ComputeRequiredSize(size_t size); - bool TilingEnabled() const { return tiling_enabled_; }; + bool TilingEnabled() const { return m_tilingEnabled; }; uint64_t GetUniqueId(void* opaquePointer); ~DmlReservedResourceSubAllocator(); @@ -84,7 +84,7 @@ namespace Dml void Free(void* p); private: - static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB + static constexpr uint32_t MinResourceSizeExponent = 16; // 2^16 = 64KB // The pool consists of a number of buckets, and each bucket contains a number of resources of the same size. 
// The resources in each bucket are always sized as a power of two, and each bucket contains resources twice @@ -106,7 +106,7 @@ namespace Dml friend class AllocationInfo; std::vector m_pool; - size_t m_currentAllocationId = 0; + size_t m_currentUniqueAllocationId = 0; uint64_t m_currentResourceId = 0; std::unique_ptr m_subAllocator; @@ -115,35 +115,35 @@ namespace Dml std::map m_outstandingAllocationsById; #endif - std::mutex mutex_; + std::mutex m_mutex; Microsoft::WRL::ComPtr m_device; std::shared_ptr m_context; - Microsoft::WRL::ComPtr queue_; - const D3D12_HEAP_PROPERTIES heap_properties_; - const D3D12_HEAP_FLAGS heap_flags_; - const D3D12_RESOURCE_FLAGS resource_flags_; - const D3D12_RESOURCE_STATES initial_state_; - bool tiling_enabled_; - uint64_t max_heap_size_in_tiles_; + Microsoft::WRL::ComPtr m_queue; + const D3D12_HEAP_PROPERTIES m_heapProperties; + const D3D12_HEAP_FLAGS m_heapFlags; + const D3D12_RESOURCE_FLAGS m_resourceFlags; + const D3D12_RESOURCE_STATES m_initialState; + bool m_tilingEnabled; + uint64_t m_maxHeapSizeInTiles; // The largest allocation ID we've returned so far (or 0 if we've never done // so). Note that our allocation IDs start at 1 (not 0) to ensure that it // isn't possible for a valid allocation to have a pointer value of // 0x00000000. - uint32_t current_allocation_id_ = 0; + uint32_t m_currentAllocationId = 0; // A list of unused allocation IDs. This is for re-use of IDs once they get // freed. We only bump the max_allocation_id_ once there are no more free // IDs. 
- std::vector free_allocation_ids_; + std::vector m_freeAllocationIds; - absl::optional TryCreateTiledAllocation(uint64_t size_in_bytes); - absl::optional TryCreateUntiledAllocation(uint64_t size_in_bytes); + absl::optional TryCreateTiledAllocation(uint64_t sizeInBytes); + absl::optional TryCreateUntiledAllocation(uint64_t sizeInBytes); friend class D3D12BufferRegion; - absl::flat_hash_map> allocations_by_id_; + absl::flat_hash_map> m_allocationsById; // Retrieves a free allocation ID, or nullopt if no more IDs are available. absl::optional TryReserveAllocationID(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h index de42157645bba..e278ecbeb7415 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h @@ -17,7 +17,7 @@ namespace Dml { } - ID3D12Resource* GetD3D12Resource() const final { return m_allocation.resource_uav_state.Get(); } + ID3D12Resource* GetD3D12Resource() const final { return m_allocation.resourceUavState.Get(); } private: DmlHeapAllocation m_allocation; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp index 8f503566768a1..f823d05c45382 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp @@ -8,40 +8,36 @@ namespace Dml { /*static*/ TaggedPointer TaggedPointer::Unpack(const void* ptr) { - uint64_t ptr_val = reinterpret_cast(ptr); + uint64_t ptrVal = reinterpret_cast(ptr); - static constexpr uint64_t kAllocationIDMask = - (1ull << kAllocationIDBits) - 1; - static constexpr uint64_t kOffsetMask = (1ull << kOffsetBits) - 1; + static constexpr uint64_t 
allocationIDMask = (1ull << AllocationIDBits) - 1; + static constexpr uint64_t offsetMask = (1ull << OffsetBits) - 1; - TaggedPointer tagged_ptr; - tagged_ptr.device_id = (ptr_val >> (kAllocationIDBits + kOffsetBits)); - tagged_ptr.allocation_id = (ptr_val >> kOffsetBits) & kAllocationIDMask; - tagged_ptr.offset = (ptr_val & kOffsetMask); + TaggedPointer taggedPtr; + taggedPtr.deviceId = (ptrVal >> (AllocationIDBits + OffsetBits)); + taggedPtr.allocationId = (ptrVal >> OffsetBits) & allocationIDMask; + taggedPtr.offset = (ptrVal & offsetMask); - return tagged_ptr; + return taggedPtr; } -/*static*/ void* TaggedPointer::Pack( - uint32_t device_id, - uint32_t allocation_id, - uint64_t offset) +/*static*/ void* TaggedPointer::Pack(uint32_t deviceId, uint32_t allocationId, uint64_t offset) { - assert(device_id < (1ull << kDeviceIDBits)); - assert(allocation_id < (1ull << kAllocationIDBits)); - assert(offset < (1ull << kOffsetBits)); + assert(deviceId < (1ull << DeviceIDBits)); + assert(allocationId < (1ull << AllocationIDBits)); + assert(offset < (1ull << OffsetBits)); // Store the device ID in the upper bits of the pointer, followed by the // allocation id and the offset in the lower bits - uint64_t ptr = ((uint64_t)device_id << (kAllocationIDBits + kOffsetBits)) | - ((uint64_t)allocation_id << kOffsetBits) | offset; + uint64_t ptr = ((uint64_t)deviceId << (AllocationIDBits + OffsetBits)) | + ((uint64_t)allocationId << OffsetBits) | offset; return reinterpret_cast(ptr); } uint64_t TaggedPointer::GetUniqueId() const { - return reinterpret_cast(TaggedPointer::Pack(device_id, allocation_id, offset)); + return reinterpret_cast(TaggedPointer::Pack(deviceId, allocationId, offset)); } } // namespace tfdml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h index ee58e23a6396f..d49e9d92eeb82 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h @@ -16,18 +16,15 @@ namespace Dml // must be done using masks and shifts. struct TaggedPointer { - static constexpr uint64_t kDeviceIDBits = 4; - static constexpr uint64_t kAllocationIDBits = 20; - static constexpr uint64_t kOffsetBits = 40; - - uint64_t device_id : kDeviceIDBits; - uint64_t allocation_id : kAllocationIDBits; - uint64_t offset : kOffsetBits; - - static void* Pack( - uint32_t device_id, - uint32_t allocation_id, - uint64_t offset); + static constexpr uint64_t DeviceIDBits = 4; + static constexpr uint64_t AllocationIDBits = 20; + static constexpr uint64_t OffsetBits = 40; + + uint64_t deviceId : DeviceIDBits; + uint64_t allocationId : AllocationIDBits; + uint64_t offset : OffsetBits; + + static void* Pack(uint32_t deviceId, uint32_t allocationId, uint64_t offset); static TaggedPointer Unpack(const void* ptr); uint64_t GetUniqueId() const; }; @@ -36,9 +33,7 @@ static_assert( sizeof(TaggedPointer) == sizeof(void*), "DML requires a 64-bit architecture"); static_assert( - TaggedPointer::kDeviceIDBits + TaggedPointer::kAllocationIDBits + - TaggedPointer::kOffsetBits == - sizeof(void*) * CHAR_BIT, + TaggedPointer::DeviceIDBits + TaggedPointer::AllocationIDBits + TaggedPointer::OffsetBits == sizeof(void*) * CHAR_BIT, "DML requires a 64-bit architecture"); } // namespace tfdml From 216fc395c045bf1f38430fc860aae0bb1efe3202 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Sun, 6 Aug 2023 14:04:52 -0700 Subject: [PATCH 68/76] Fix indentation --- .../src/ExecutionContext.cpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index 86f964651b638..9a8ad4b4e6745 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -46,16 +46,17 @@ namespace Dml D3D12_HEAP_PROPERTIES heapProperties = { D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; - D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER, - 0, - byteCount, - 1, - 1, - 1, - DXGI_FORMAT_UNKNOWN, - {1, 0}, - D3D12_TEXTURE_LAYOUT_ROW_MAJOR, - D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS}; + D3D12_RESOURCE_DESC resourceDesc = { + D3D12_RESOURCE_DIMENSION_BUFFER, + 0, + byteCount, + 1, + 1, + 1, + DXGI_FORMAT_UNKNOWN, + {1, 0}, + D3D12_TEXTURE_LAYOUT_ROW_MAJOR, + D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS}; ComPtr intermediateBuffer; ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommittedResource( From a25b40cfe7adf67e73d68d3b6711e51f2ceb3e94 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 7 Aug 2023 17:09:58 -0700 Subject: [PATCH 69/76] WIP --- .../core/providers/dml/dml_provider_factory.h | 12 -- .../inc/DmlExecutionProvider.h | 1 - .../src/ExecutionProvider.cpp | 6 - .../providers/dml/dml_provider_factory.cc | 35 ----- .../Api.Image/TensorToVideoFrameConverter.cpp | 109 ++++++------- .../Api.Image/VideoFrameToTensorConverter.cpp | 53 +++---- .../inc/TensorToVideoFrameConverter.h | 19 +-- .../inc/VideoFrameToTensorConverter.h | 5 +- winml/lib/Api.Ort/OnnxruntimeEngine.cpp | 26 ++- winml/lib/Api.Ort/OnnxruntimeEngine.h | 2 +- winml/lib/Api/ImageFeatureValue.cpp | 148 +++++++++--------- winml/lib/Api/ImageFeatureValue.h | 2 +- winml/lib/Api/impl/TensorBase.h | 86 +++++----- winml/lib/Common/inc/iengine.h | 2 +- winml/test/common/SqueezeNetValidator.cpp | 34 ++-- .../cppwinrt/scenariotestscppwinrt.cpp | 1 - 16 files changed, 223 insertions(+), 318 deletions(-) diff --git a/include/onnxruntime/core/providers/dml/dml_provider_factory.h b/include/onnxruntime/core/providers/dml/dml_provider_factory.h index 
2ec3a10b08aed..0782d2d9ed760 100644 --- a/include/onnxruntime/core/providers/dml/dml_provider_factory.h +++ b/include/onnxruntime/core/providers/dml/dml_provider_factory.h @@ -99,18 +99,6 @@ struct OrtDmlApi { * This API gets the D3D12 resource when an OrtValue has been allocated by the DML EP. */ ORT_API2_STATUS(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* provider, _In_ void* dml_resource, _Out_ ID3D12Resource** d3d_resource); - - /** - * GetD3D12ResourceRegionFromAllocation - * This API gets the region of a D3D12 resource at a given offset when an OrtValue has been allocated by the DML EP. - * Note: Only the subregion of the resource delimited by `offset` and `offset + size_in_bytes` should be accessed - */ - ORT_API2_STATUS(GetD3D12ResourceRegionFromAllocation, - _In_ OrtAllocator* provider, - _In_ void* dml_resource, - _In_ uint64_t size_in_bytes, - _Out_ ID3D12Resource** d3d_resource, - _Out_ uint64_t* offset); }; #ifdef __cplusplus diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index 9ecfec4139756..decf15b194d64 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -32,7 +32,6 @@ namespace Dml bool enableMetacommands, bool enableBfcAllocator); - D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t sizeInBytes); void FlushContext(onnxruntime::IExecutionProvider* provider); void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode); void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 
d824aa8185705..5c5f8ebf2c3d1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -1137,12 +1137,6 @@ namespace Dml return std::make_unique(dmlDevice, commandQueue, enableMetacommands, enableBfcAllocator); } - D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t sizeInBytes) - { - Dml::DmlGpuAllocator* gpuAllocator = static_cast(allocator); - return gpuAllocator->CreateBufferRegion(opaquePointer, sizeInBytes); - } - void FlushContext(onnxruntime::IExecutionProvider* provider) { ExecutionProvider* dmlexecutionprovider = static_cast(provider); diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index c5f12558e2f63..a3e1b9b040e6e 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -252,47 +252,12 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc API_IMPL_END } -ORT_API_STATUS_IMPL(GetD3D12ResourceRegionFromAllocation, - _In_ OrtAllocator* ort_allocator, - _In_ void* allocation, - _In_ uint64_t size_in_bytes, - _Out_ ID3D12Resource** d3d_resource, - _Out_ uint64_t* offset) { - API_IMPL_BEGIN -#ifdef USE_DML - auto wrapping_allocator = static_cast(ort_allocator); - auto allocator = wrapping_allocator->GetWrappedIAllocator(); - if (!allocator) { - return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available"); - } - - if (wrapping_allocator->Info()->device.MemType() == OrtDevice::MemType::DML_EXTERNAL) { - *d3d_resource = static_cast(allocation)->GetD3D12Resource(); - *offset = 0; - } else { - ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT); - auto bufferRegion = 
Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), allocation, size_in_bytes); - *offset = bufferRegion.Offset(); - *d3d_resource = bufferRegion.GetD3D12Resource(); - } - - (*d3d_resource)->AddRef(); - -#else - *d3d_resource = nullptr; - *offset = 0; -#endif // USE_DML - return nullptr; - API_IMPL_END -} - static constexpr OrtDmlApi ort_dml_api_10_to_x = { &OrtSessionOptionsAppendExecutionProvider_DML, &OrtSessionOptionsAppendExecutionProviderEx_DML, &CreateGPUAllocationFromD3DResource, &FreeGPUAllocation, &GetD3D12ResourceFromAllocation, - &GetD3D12ResourceRegionFromAllocation, }; const OrtDmlApi* GetOrtDmlApi(_In_ uint32_t /*version*/) NO_EXCEPTION { diff --git a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp index 6978687f226bf..2654885d6bee8 100644 --- a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp +++ b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. 
#include "lib/Api.Image/pch.h" @@ -128,7 +128,6 @@ class ConvertCPUTensorToVideoFrameWithSoftwareBitmapTelemetryEvent { }; void TensorToVideoFrameConverter::DX12TensorToVideoFrame( - _In_ uint64_t inputTensorOffset, _In_ UINT32 batchIdx, _In_ winml::LearningModelSession& session, _In_ ID3D12Resource* pInputTensor, @@ -144,20 +143,16 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( wgi::SoftwareBitmap softwareBitmap = destVideoFrame.SoftwareBitmap(); if (softwareBitmap) { - ConvertGPUTensorToSoftwareBitmap( - inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, softwareBitmap - ); + ConvertGPUTensorToSoftwareBitmap(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, softwareBitmap); } else if (spDestDirect3DSurface) { bool isUAVSupportedFormat = _winmli::FormatSupportedForUAV( pDeviceCache->GetD3D12Device(), _winmli::GetDXGIFormatFromDirectXPixelFormat(spDestDirect3DSurface.Description().Format) ); - // UAV support for formats is device dependent + // UAV support for formats is device dependent if (!isUAVSupportedFormat) { - ConvertDX12TensorToUnsupportedVideoFrameFormat( - inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, destVideoFrame - ); + ConvertDX12TensorToUnsupportedVideoFrameFormat(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, destVideoFrame); } else { ComPtr spVideoFrameTexture = _winmli::GetTextureFromDirect3DSurface(destVideoFrame.Direct3DSurface()); @@ -167,7 +162,7 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( wgi::BitmapBounds bounds = {0, 0, videoFrameTextureDesc.Width, videoFrameTextureDesc.Height}; if (_winmli::TextureIsOnDevice(spVideoFrameTexture.Get(), pDeviceCache->GetD3D11Device())) { - // The texture is on our device, so we can just create own texture, share it and cache it + // The texture is on our device, so we can just create own texture, share it and cache it if (!output_resource_) { output_resource_ = CreateShareableD3D12Texture(videoFrameTextureDesc, 
pDeviceCache->GetD3D12Device()); D3D11_cached_texture_ = ShareD3D12Texture(output_resource_.Get(), pDeviceCache->GetD3D11Device()); @@ -177,24 +172,22 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( if (cachedTextureDesc.Width != videoFrameTextureDesc.Width || cachedTextureDesc.Height != videoFrameTextureDesc.Height || cachedTextureDesc.Format != videoFrameTextureDesc.Format) { - // The dimensions or format don't match, so we need to re-create our texture + // The dimensions or format don't match, so we need to re-create our texture output_resource_ = CreateShareableD3D12Texture(videoFrameTextureDesc, pDeviceCache->GetD3D12Device()); D3D11_cached_texture_ = ShareD3D12Texture(output_resource_.Get(), pDeviceCache->GetD3D11Device()); } } - // Detensorize - ConvertGPUTensorToDX12Texture( - inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get() - ); + // Detensorize + ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get()); - // Make sure that detensorization is done + // Make sure that detensorization is done SyncD3D12ToD3D11(*pDeviceCache, D3D11_cached_texture_.Get()); - // Finally, copy the detensorized texture to the user's device + // Finally, copy the detensorized texture to the user's device CopyTextureIntoTexture(D3D11_cached_texture_.Get(), bounds, spVideoFrameTexture.Get()); } else { - // We are not on the same device, so we can't rely on our own cached texture + // We are not on the same device, so we can't rely on our own cached texture ComPtr spTextureDevice; spVideoFrameTexture->GetDevice(&spTextureDevice); @@ -209,11 +202,11 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( !spSharedD3D11Texture.Get()) || (FAILED(spVideoFrameTexture->GetPrivateData(_handleGUID, &handleSize, &sharedHandle)) || sharedHandle != shared_handle_)) { - // Create a new shared texture that we cache on the video frame texture + // Create a new shared texture that we cache 
on the video frame texture output_resource_ = CreateShareableD3D12Texture(videoFrameTextureDesc, pDeviceCache->GetD3D12Device()); spSharedD3D11Texture = ShareD3D12Texture(output_resource_.Get(), spTextureDevice.Get()); - // Cache the shared texture on the video frame texture in order to tie their lifetime together + // Cache the shared texture on the video frame texture in order to tie their lifetime together WINML_THROW_IF_FAILED( spVideoFrameTexture->SetPrivateDataInterface(_d3d11TextureGUID, spSharedD3D11Texture.Get()) ); @@ -222,20 +215,18 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( ); } - // Detensorize - ConvertGPUTensorToDX12Texture( - inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get() - ); + // Detensorize + ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get()); - // Make sure that detensorization is done + // Make sure that detensorization is done SyncD3D12ToD3D11(*pDeviceCache, spSharedD3D11Texture.Get()); - // Finally, copy the detensorized texture to the user's device + // Finally, copy the detensorized texture to the user's device CopyTextureIntoTexture(spSharedD3D11Texture.Get(), bounds, spVideoFrameTexture.Get()); } } } else { - // Invalid video frame + // Invalid video frame WINML_THROW_HR(E_INVALIDARG); } } @@ -266,7 +257,6 @@ ComPtr TensorToVideoFrameConverter::CreateShareableD3D12Texture( } void TensorToVideoFrameConverter::ConvertDX12TensorToUnsupportedVideoFrameFormat( - _In_ uint64_t input_tensor_offset, _In_ UINT32 batchIdx, _In_ ID3D12Resource* pInputTensor, _In_ _winml::D3DDeviceCache& device_cache, @@ -275,7 +265,7 @@ void TensorToVideoFrameConverter::ConvertDX12TensorToUnsupportedVideoFrameFormat ) { assert(pInputTensor != nullptr); - // Find the first supported format and convert to it + // Find the first supported format and convert to it auto supportedFormatIter = std::find_if( _winmli::supportedWinMLFormats.begin(), 
_winmli::supportedWinMLFormats.end(), @@ -321,15 +311,13 @@ void TensorToVideoFrameConverter::ConvertDX12TensorToUnsupportedVideoFrameFormat )); converted_video_frame_ = wm::VideoFrame::CreateWithDirect3D11Surface(surface); - // Detensorize - ConvertGPUTensorToDX12Texture( - input_tensor_offset, batchIdx, pInputTensor, device_cache, tensorDesc, output_resource_.Get() - ); + // Detensorize + ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, device_cache, tensorDesc, output_resource_.Get()); - // Wait for the D3D12 work to complete before using the resource + // Wait for the D3D12 work to complete before using the resource SyncD3D12ToD3D11(device_cache, spSharedD3D11Texture.Get()); - // Finally, convert and copy the texture to the destination video frame + // Finally, convert and copy the texture to the destination video frame converted_video_frame_.CopyToAsync(unsupportedVideoFrame).get(); } @@ -371,13 +359,13 @@ void TensorToVideoFrameConverter::SoftwareTensorToVideoFrame( UINT32 tensorHeight = static_cast(tensorDesc.sizes[2]); UINT32 tensorWidth = static_cast(tensorDesc.sizes[3]); - // create a bitmap bounds for the whole image/tensor + // create a bitmap bounds for the whole image/tensor wgi::BitmapBounds inputBounds = {0, 0, tensorWidth, tensorHeight}; wgi::SoftwareBitmap spOutputSoftwareBitmap = pDestVideoFrame.SoftwareBitmap(); wgdx::Direct3D11::IDirect3DSurface spOutputSurface = pDestVideoFrame.Direct3DSurface(); - // only one of softwarebitmap or direct3Dsurface should be non-null + // only one of softwarebitmap or direct3Dsurface should be non-null if ((spOutputSoftwareBitmap == nullptr && spOutputSurface == nullptr) || (spOutputSoftwareBitmap != nullptr && spOutputSurface != nullptr)) { WINML_THROW_HR(E_INVALIDARG); } @@ -416,7 +404,6 @@ void TensorToVideoFrameConverter::SoftwareTensorToVideoFrame( } void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( - _In_ uint64_t inputTensorOffset, _In_ UINT32 batchIdx, _In_ ID3D12Resource* 
pInputResource, _In_ _winml::D3DDeviceCache& device_cache, @@ -433,7 +420,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( CD3DX12_RECT scissorRect(0, 0, (LONG)outputDesc.Width, outputDesc.Height); ComPtr spDx12Device = device_cache.GetD3D12Device(); - // we're inside a lock from the caller of this function, so it's ok to use this static + // we're inside a lock from the caller of this function, so it's ok to use this static static EventTimer eventTimer; std::optional telemetryLogger; if (eventTimer.Start()) { @@ -448,7 +435,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( outputDesc.Format ); - // Validate input description + // Validate input description WINML_THROW_HR_IF_FALSE_MSG( E_INVALIDARG, inputDesc.Height != 0, "Invalid input image height provided. Height is set to zero." ); @@ -456,7 +443,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( E_INVALIDARG, inputDesc.Width != 0, "Invalid input image height provided. Height is set to zero." ); - // Validate output description + // Validate output description WINML_THROW_HR_IF_FALSE_MSG( E_INVALIDARG, outputDesc.Height != 0, "Invalid input image height provided. Height is set to zero." ); @@ -464,7 +451,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( E_INVALIDARG, outputDesc.Width != 0, "Invalid input image height provided. Height is set to zero." 
); - // Validate Tensor description + // Validate Tensor description WINML_THROW_HR_IF_FALSE_MSG( E_INVALIDARG, tensorDesc.dataType == kImageTensorDataTypeFloat32 || tensorDesc.dataType == kImageTensorDataTypeFloat16, @@ -504,10 +491,10 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( (UINT)outputDesc.Width ); - // Create descriptor heaps + // Create descriptor heaps UINT srvUavDescriptorSize = spDx12Device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); - // Create a UAV resource for the shader + // Create a UAV resource for the shader D3D12_RESOURCE_DESC outputResourceDesc = output_resource_->GetDesc(); outputResourceDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; @@ -524,7 +511,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( } if (descriptor_heap_ == nullptr) { - // Describe and create a shader resource view (SRV) and unordered access view (UAV) descriptor heap. + // Describe and create a shader resource view (SRV) and unordered access view (UAV) descriptor heap. 
D3D12_DESCRIPTOR_HEAP_DESC srvUavHeapDesc = {}; srvUavHeapDesc.NumDescriptors = DescriptorCount; srvUavHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; @@ -533,9 +520,9 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( descriptor_heap_->SetName(L"Detensorize Descriptor Heap"); } - // Create SRV and UAV for input and output respectively + // Create SRV and UAV for input and output respectively { - D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = CreateSRVDescriptor(inputTensorOffset, batchIdx, inputDesc, tensorDesc); + D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = CreateSRVDescriptor(batchIdx, inputDesc, tensorDesc); CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle( descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), SrvBufferIdx, srvUavDescriptorSize ); @@ -550,15 +537,15 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( spDx12Device->CreateUnorderedAccessView(UAV_resource_.Get(), nullptr, &uavDesc, uavHandle); } - // - // Pipeline setup for shader operation - // + // + // Pipeline setup for shader operation + // PipelineStateCacheType type = PipelineStateCacheType::kFloat32; if (tensorDesc.dataType == kImageTensorDataTypeFloat16) { type = PipelineStateCacheType::kFloat16; } - // Set the origin format + // Set the origin format PipelineStateCacheFormat formatFrom = PipelineStateCacheFormat::kBGR8; if (tensorDesc.channelType == kImageTensorChannelTypeRGB8) { formatFrom = PipelineStateCacheFormat::kRGB8; @@ -566,7 +553,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( formatFrom = PipelineStateCacheFormat::kGRAY8; } - // Set the destination format + // Set the destination format PipelineStateCacheFormat formatTo = PipelineStateCacheFormat::kBGR8; if (outputDesc.Format == DXGI_FORMAT_R8G8B8A8_UNORM) { formatTo = PipelineStateCacheFormat::kRGB8; @@ -580,7 +567,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( ResetCommandList(device_cache); - // Write compute commands into the command list and put it 
into the queue. + // Write compute commands into the command list and put it into the queue. { command_list_->SetComputeRootSignature(root_signature_.Get()); @@ -647,7 +634,6 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture( } void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap( - _In_ uint64_t inputTensorOffset, _In_ UINT32 batchIdx, _In_ ID3D12Resource* pInputTensor, _In_ _winml::D3DDeviceCache& device_cache, @@ -664,9 +650,9 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap( telemetryLogger.emplace(tensorDesc); } - uint64_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2; - uint64_t singleVideoFramebufferSize = - tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize; + uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2; + uint32_t singleVideoFramebufferSize = + static_cast(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize); // TODO: Make an allocator for readback heaps if (!readback_heap_ || readback_heap_->GetDesc().Width < singleVideoFramebufferSize) { @@ -691,7 +677,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap( readback_heap_.Get(), 0, pInputTensor, - inputTensorOffset + singleVideoFramebufferSize * static_cast(batchIdx), + static_cast(singleVideoFramebufferSize) * batchIdx, singleVideoFramebufferSize ); @@ -766,10 +752,7 @@ void TensorToVideoFrameConverter::ConvertBatchedDX12TensorToBuffers( } D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor( - uint64_t offset, - const UINT32 batchIdx, - const D3D12_RESOURCE_DESC& resourceDesc, - const _winml::ImageTensorDescription& desc + const UINT32 batchIdx, const D3D12_RESOURCE_DESC& resourceDesc, const _winml::ImageTensorDescription& desc ) { UINT uiTensorElementSize = desc.dataType == kImageTensorDataTypeFloat32 ? 
sizeof(UINT) : sizeof(uint16_t); @@ -777,7 +760,7 @@ D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; srvDesc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER; UINT singleImageSize = static_cast(desc.sizes[1] * desc.sizes[2] * desc.sizes[3]); - srvDesc.Buffer.FirstElement = offset + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3]; + srvDesc.Buffer.FirstElement = batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3]; srvDesc.Buffer.NumElements = singleImageSize; srvDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_NONE; diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp index 5ed73a477b32b..b856c6bdbfeca 100644 --- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp +++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. 
#include "lib/Api.Image/pch.h" @@ -137,7 +137,7 @@ void VideoFrameToTensorConverter::VideoFrameToSoftwareTensor( wgi::SoftwareBitmap spInputSoftwareBitmap = inputVideoFrame.SoftwareBitmap(); wgdx::Direct3D11::IDirect3DSurface spInputSurface = inputVideoFrame.Direct3DSurface(); - // only one of softwarebitmap or direct3Dsurface should be non-null + // only one of softwarebitmap or direct3Dsurface should be non-null if ((spInputSoftwareBitmap == nullptr && spInputSurface == nullptr) || (spInputSoftwareBitmap != nullptr && spInputSurface != nullptr)) { WINML_THROW_IF_FAILED(E_INVALIDARG); } @@ -151,7 +151,7 @@ void VideoFrameToTensorConverter::VideoFrameToSoftwareTensor( ); } - // Resize the input VideoFrame to converted_video_frame_ + // Resize the input VideoFrame to converted_video_frame_ _winmli::ConvertVideoFrameToVideoFrame( inputVideoFrame, inputBounds, tensorWidth, tensorHeight, converted_video_frame_ ); @@ -190,7 +190,6 @@ ComPtr VideoFrameToTensorConverter::ShareD3D11Texture( } void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( - _In_ uint64_t outputTensorOffset, _In_ const UINT32 batchIdx, _In_ winml::LearningModelSession& session, _In_ const wm::IVideoFrame& inputVideoFrame, @@ -198,7 +197,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( _In_ const ImageTensorDescription& tensorDesc, _Inout_ ID3D12Resource* pOutputTensor ) { - // Validate Tensor description + // Validate Tensor description WINML_THROW_HR_IF_FALSE_MSG( E_INVALIDARG, tensorDesc.dataType == kImageTensorDataTypeFloat32 || tensorDesc.dataType == kImageTensorDataTypeFloat16, @@ -230,9 +229,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( wgdx::Direct3D11::IDirect3DSurface spDirect3DSurface = inputVideoFrame.Direct3DSurface(); if (inputVideoFrame.SoftwareBitmap()) { - ConvertSoftwareBitmapToGPUTensor( - batchIdx, inputVideoFrame, *pDeviceCache, inputBounds, tensorDesc, outputTensorOffset, pOutputTensor - ); + ConvertSoftwareBitmapToGPUTensor(batchIdx, 
inputVideoFrame, *pDeviceCache, inputBounds, tensorDesc, pOutputTensor); } else if (spDirect3DSurface) { ComPtr spVideoFrameTexture; wgi::BitmapBounds scaledBounds = inputBounds; @@ -320,9 +317,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( // We cropped the texture, shared it and converted it to a known color format, so it's time to tensorize // TODO: merge all videoframes to a single DX12Texture Resource before call ConvertDX12TextureToGPUTensor. - ConvertDX12TextureToGPUTensor( - outputTensorOffset, batchIdx, input_D3D12_resource_.Get(), *pDeviceCache, tensorDesc, pOutputTensor - ); + ConvertDX12TextureToGPUTensor(batchIdx, input_D3D12_resource_.Get(), *pDeviceCache, tensorDesc, pOutputTensor); } else { // Invalid video frame WINML_THROW_IF_FAILED(E_INVALIDARG); @@ -330,7 +325,6 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( } void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( - _In_ uint64_t output_resource_offset, _In_ UINT32 batchIdx, _In_ ID3D12Resource* pInputResource, _In_ _winml::D3DDeviceCache& device_cache, @@ -412,6 +406,11 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( // Validate Tensor Resource { + D3D12_HEAP_PROPERTIES outputHeapProperties; + D3D12_HEAP_FLAGS outputHeapFlags; + + WINML_THROW_IF_FAILED(pOutputResource->GetHeapProperties(&outputHeapProperties, &outputHeapFlags)); + UINT64 ullNumElementsTensor = 1; for (UINT uiIdx = 0; uiIdx < kImageTensorDimensionCountMax; uiIdx++) { WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, tensorDesc.sizes[uiIdx], &ullNumElementsTensor)); @@ -423,10 +422,10 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( UINT64 ullTensorSize = 0; WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, uiTensorElementSize, &ullTensorSize)); - if (outputDesc.Width < output_resource_offset + ullTensorSize || - outputDesc.Height != 1 || - outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || - !(outputDesc.Flags & 
D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)) { + if (outputDesc.Width < ullTensorSize || outputDesc.Height != 1 || + outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || + !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS) || + outputHeapProperties.Type != D3D12_HEAP_TYPE_DEFAULT) { WINML_THROW_IF_FAILED(E_INVALIDARG); } } @@ -467,7 +466,7 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( ); spDx12Device->CreateShaderResourceView(pInputResource, &srvDesc, srvHandle); - D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = CreateUAVDescription(output_resource_offset, batchIdx, tensorDesc); + D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = CreateUAVDescription(batchIdx, outputDesc, tensorDesc); CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle( descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), UavBufferIdx, srvUavDescriptorSize ); @@ -550,7 +549,6 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( _In_ _winml::D3DDeviceCache& device_cache, _In_ const wgi::BitmapBounds& inputBounds, _In_ const ImageTensorDescription& tensorDesc, - _In_ uint64_t outputResourceOffset, _Inout_ ID3D12Resource* pOutputResource ) { assert(pOutputResource != nullptr); @@ -593,8 +591,11 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( assert(convertedSoftwareBitmap != nullptr); - uint64_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2; - uint64_t bufferSize = tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize; + D3D12_RESOURCE_DESC outputDesc = pOutputResource->GetDesc(); + + uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 
4 : 2; + uint32_t bufferSize = + static_cast(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize); // TODO: Make an allocator for upload heaps if (!upload_heap_ || upload_heap_->GetDesc().Width < bufferSize) { @@ -625,13 +626,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( ); command_list_->ResourceBarrier(1, &barrier); - command_list_->CopyBufferRegion( - pOutputResource, - bufferSize * static_cast(batchIdx) + outputResourceOffset, - upload_heap_.Get(), - 0, - bufferSize - ); + command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx, upload_heap_.Get(), 0, bufferSize); WINML_THROW_IF_FAILED(command_list_->Close()); ID3D12CommandList* ppCommandLists[] = {command_list_.Get()}; @@ -689,14 +684,14 @@ void VideoFrameToTensorConverter::ConvertBuffersToBatchedGPUTensor( } D3D12_UNORDERED_ACCESS_VIEW_DESC VideoFrameToTensorConverter::CreateUAVDescription( - uint64_t offset, const UINT32 batchIdx, const _winml::ImageTensorDescription& desc + const UINT32 batchIdx, const D3D12_RESOURCE_DESC& resourceDesc, const _winml::ImageTensorDescription& desc ) { UINT uiTensorElementSize = desc.dataType == kImageTensorDataTypeFloat32 ? 
sizeof(UINT) : sizeof(uint16_t); D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {}; uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER; UINT singleImageSize = static_cast(desc.sizes[1] * desc.sizes[2] * desc.sizes[3]); - uavDesc.Buffer.FirstElement = offset / uiTensorElementSize + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3]; + uavDesc.Buffer.FirstElement = batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3]; uavDesc.Buffer.NumElements = singleImageSize; uavDesc.Buffer.CounterOffsetInBytes = 0; uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE; diff --git a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h index b82fc8e7a5133..12f676459293b 100644 --- a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h +++ b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. #pragma once @@ -12,10 +12,9 @@ class TensorToVideoFrameConverter : public ImageConverter { public: TensorToVideoFrameConverter() : shared_handle_(nullptr) {} - // Function takes in a tensor DX12 Resource all compute ops should be completed - // converts it to a VideoFrame backed by either a SoftwareBitmap or D3DSurface + // Function takes in a tensor DX12 Resource all compute ops should be completed + // converts it to a VideoFrame backed by either a SoftwareBitmap or D3DSurface void DX12TensorToVideoFrame( - _In_ uint64_t inputTensorOffset, _In_ UINT32 batch_index, _In_ winml::LearningModelSession& session, _In_ ID3D12Resource* input_tensor, @@ -23,8 +22,8 @@ class TensorToVideoFrameConverter : public ImageConverter { _Inout_ wm::VideoFrame& destination_video_frame ); - // Function takes in a byte pointer to a CPUTensor - // converts it to VideoFrame backed by either a SoftwareBitmap or D3DSurface, + // Function takes in a byte pointer to a CPUTensor + // converts it to VideoFrame 
backed by either a SoftwareBitmap or D3DSurface, void SoftwareTensorToVideoFrame( _In_ winml::LearningModelSession& session, _In_ BYTE* CPU_tensor_to_convert, @@ -58,7 +57,6 @@ class TensorToVideoFrameConverter : public ImageConverter { Microsoft::WRL::ComPtr ShareD3D12Texture(ID3D12Resource* pResource, ID3D11Device* pDevice); void ConvertGPUTensorToSoftwareBitmap( - _In_ uint64_t inputTensorOffset, _In_ UINT32 batch_index, _In_ ID3D12Resource* input_tensor, _In_ _winml::D3DDeviceCache& device_cache, @@ -67,7 +65,6 @@ class TensorToVideoFrameConverter : public ImageConverter { ); void ConvertGPUTensorToDX12Texture( - _In_ uint64_t inputTensorOffset, _In_ UINT32 batch_index, _In_ ID3D12Resource* input_resource, _In_ _winml::D3DDeviceCache& device_cache, @@ -76,7 +73,6 @@ class TensorToVideoFrameConverter : public ImageConverter { ); void ConvertDX12TensorToUnsupportedVideoFrameFormat( - _In_ uint64_t input_tensor_offset, _In_ UINT32 batch_index, _In_ ID3D12Resource* input_tensor, _In_ _winml::D3DDeviceCache& device_cache, @@ -85,10 +81,7 @@ class TensorToVideoFrameConverter : public ImageConverter { ); static D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor( - uint64_t offset, - const UINT32 batch_index, - const D3D12_RESOURCE_DESC& resource_description, - const ImageTensorDescription& description + const UINT32 batch_index, const D3D12_RESOURCE_DESC& resource_description, const ImageTensorDescription& description ); static void ConvertCPUTensorToSoftwareBitmap( diff --git a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h index 4433dfaab299d..e34030bbd6833 100644 --- a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h +++ b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h @@ -21,7 +21,6 @@ class VideoFrameToTensorConverter : public ImageConverter { // {upperleft X, upperleft Y, width, height} to be turned into a tensor. 
// If the region of interest is the entire VideoFrame, the input BitmapBounds should describe the entire image. void VideoFrameToDX12Tensor( - _In_ uint64_t output_tensor_offset, _In_ const UINT32 batch_index, _In_ winml::LearningModelSession& session, _In_ const wm::IVideoFrame& input_video_frame, @@ -72,12 +71,10 @@ class VideoFrameToTensorConverter : public ImageConverter { _In_ _winml::D3DDeviceCache& device_cache, _In_ const wgi::BitmapBounds& input_bounds, _In_ const ImageTensorDescription& tensor_description, - _In_ uint64_t outputResourceOffset, _Inout_ ID3D12Resource* pOutputResource ); void ConvertDX12TextureToGPUTensor( - _In_ uint64_t output_resource_offset, _In_ const UINT32 batch_index, _In_ ID3D12Resource* pInputResource, _In_ _winml::D3DDeviceCache& device_cache, @@ -86,7 +83,7 @@ class VideoFrameToTensorConverter : public ImageConverter { ); static D3D12_UNORDERED_ACCESS_VIEW_DESC CreateUAVDescription( - uint64_t offset, const UINT32 batch_index, const ImageTensorDescription& description + const UINT32 batch_index, const D3D12_RESOURCE_DESC& resource_description, const ImageTensorDescription& description ); static void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor( diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp index efaa3685d45cf..4d0915ab13af8 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. 
#include "lib/Api.Ort/pch.h" #include "OnnxruntimeEngine.h" @@ -108,12 +108,12 @@ HRESULT OnnxruntimeValue::IsCpu(bool* out) { } static uint64_t ShapeSize(const int64_t* shape, size_t count) { - // for each dim + // for each dim int64_t size = 1; for (size_t i = 0; i < count; i++) { - // find out it's total size + // find out it's total size size *= shape[i]; - // make sure there are no invalid dimensions (-1 or any invalid shape) + // make sure there are no invalid dimensions (-1 or any invalid shape) THROW_HR_IF(E_INVALIDARG, shape[i] <= 0); } return size; @@ -134,7 +134,7 @@ static auto GetStrings( } auto length = ShapeSize(shape.data(), shape.size()); - // make a big buffer to hold all the string data + // make a big buffer to hold all the string data size_t buffer_length; THROW_IF_NOT_OK_MSG(ort_api->GetStringTensorDataLength(ort_value, &buffer_length), ort_api); @@ -146,10 +146,10 @@ static auto GetStrings( ort_api->GetStringTensorContent(ort_value, buffer.get(), buffer_length, offsets.data(), offsets.size()), ort_api ); - // now go build all the strings + // now go build all the strings for (size_t i = 0; i < length; ++i) { size_t str_len = 0; - // are we on the last one? + // are we on the last one? 
if (i == (length - 1)) { str_len = buffer_length - offsets[i]; } else { @@ -161,7 +161,7 @@ static auto GetStrings( return std::make_shared>(std::move(strings), std::move(buffer)); } -HRESULT OnnxruntimeValue::GetResource(uint64_t size_in_bytes, _winml::Resource& out, uint64_t& offset) { +HRESULT OnnxruntimeValue::GetResource(_winml::Resource& out) { auto ort_api = engine_->GetEngineFactory()->UseOrtApi(); void* mutable_data = nullptr; @@ -185,10 +185,7 @@ HRESULT OnnxruntimeValue::GetResource(uint64_t size_in_bytes, _winml::Resource& winrt::com_ptr resource; RETURN_HR_IF_NOT_OK_MSG( - ort_dml_api->GetD3D12ResourceRegionFromAllocation( - allocator.get(), mutable_data, size_in_bytes, resource.put(), &offset - ), - ort_api + ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), mutable_data, resource.put()), ort_api ); out = _winml::Resource(resource.get(), [](void*) { /*do nothing, as this pointer is actually a com pointer! */ }); } else { @@ -1406,11 +1403,10 @@ HRESULT OnnxruntimeEngine::FillFromMapValue( std::vector keys_shape; keys_value->GetTensorShape(keys_shape); - uint64_t offset = 0; _winml::Resource keys_data; - RETURN_IF_FAILED(keys_value->GetResource(0, keys_data, offset)); + RETURN_IF_FAILED(keys_value->GetResource(keys_data)); _winml::Resource values_data; - RETURN_IF_FAILED(values_value->GetResource(0, values_data, offset)); + RETURN_IF_FAILED(values_value->GetResource(values_data)); auto num_elements = static_cast(ShapeSize(keys_shape.data(), keys_shape.size())); GetAbiMapFiller(key_kind, value_kind)(map, num_elements, keys_data.get(), values_data.get()); diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.h b/winml/lib/Api.Ort/OnnxruntimeEngine.h index 0fb4aa73a1a96..5974d46b82c4f 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngine.h +++ b/winml/lib/Api.Ort/OnnxruntimeEngine.h @@ -31,7 +31,7 @@ class OnnxruntimeValue STDMETHOD(IsCpu) (bool* out) override; STDMETHOD(GetResource) - (uint64_t size_in_bytes, _winml::Resource& resource, uint64_t& 
offset) override; + (_winml::Resource& resource) override; STDMETHOD(IsTensor) (bool* out) override; STDMETHOD(IsOfTensorType) diff --git a/winml/lib/Api/ImageFeatureValue.cpp b/winml/lib/Api/ImageFeatureValue.cpp index 4f824c072e8a2..3e36092ad5ebe 100644 --- a/winml/lib/Api/ImageFeatureValue.cpp +++ b/winml/lib/Api/ImageFeatureValue.cpp @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. #include "lib/Api/pch/pch.h" #include "ImageFeatureValue.h" @@ -47,7 +47,7 @@ WINML_CATCH_ALL void ImageFeatureValue::Initialize() { m_batchSize = m_videoFrames.Size(); for (auto videoFrame : m_videoFrames) { - // TODO: Check all videoFrames come from either CPU or GPU. + // TODO: Check all videoFrames come from either CPU or GPU. if (auto surface = videoFrame.Direct3DSurface()) { wgdx::Direct3D11::Direct3DSurfaceDescription description = surface.Description(); m_widths.emplace_back(description.Width); @@ -148,15 +148,15 @@ wgi::BitmapBounds ImageFeatureValue::CenterAndCropBounds(uint32_t idx, uint32_t wgi::BitmapBounds bounds = {}; float RequiredAspectRatio = static_cast(desiredWidth) / static_cast(desiredHeight); - // crop to center while maintaining size + // crop to center while maintaining size if (RequiredAspectRatio * m_heights[idx] < m_widths[idx]) { - // actual width is too wide. Cut off left and right of image + // actual width is too wide. Cut off left and right of image bounds.Width = std::min((UINT)(RequiredAspectRatio * m_heights[idx] + 0.5f), m_widths[idx]); bounds.Height = m_heights[idx]; bounds.X = (m_widths[idx] - bounds.Width) / 2; bounds.Y = 0; } else { - // actual height is too long. Cut off top and bottom + // actual height is too long. 
Cut off top and bottom bounds.Width = m_widths[idx]; bounds.Height = std::min((UINT)(m_widths[idx] / RequiredAspectRatio + 0.5f), m_heights[idx]); bounds.X = 0; @@ -249,14 +249,14 @@ static void CPUTensorize( auto pooledConverter = _winml::PoolObjectWrapper::Create(spDevice->TensorizerStore()->Fetch(descriptor)); - //apply tensorization + //apply tensorization pooledConverter->Get()->Tensorizer->VideoFrameToSoftwareTensor( videoFrame, bounds, tensorDescriptor, reinterpret_cast(pResource) ); - // Software tensorization doesnt need to hold onto any resources beyond its scope, so we can - // return the converter to the pool on tensorization completion. - // (This happens automatically in the destruction of PoolObjectWrapper) + // Software tensorization doesnt need to hold onto any resources beyond its scope, so we can + // return the converter to the pool on tensorization completion. + // (This happens automatically in the destruction of PoolObjectWrapper) } static void CPUTensorize( @@ -267,7 +267,7 @@ static void CPUTensorize( BYTE* resource, unsigned int singleFrameBufferSize ) { - // Tensorize video frames one by one without extra copy. + // Tensorize video frames one by one without extra copy. for (uint32_t batchIdx = 0; batchIdx < videoFrames.Size(); ++batchIdx) { CPUTensorize(videoFrames.GetAt(batchIdx), bounds[batchIdx], tensorDescriptor, spSession, resource); resource += singleFrameBufferSize; @@ -280,7 +280,6 @@ static void GPUTensorize( _winml::ImageTensorDescription tensorDescriptor, com_ptr spSession, ID3D12Resource* d3dResource, - uint64_t resourceOffset, _winml::BindingContext& context ) { auto spDevice = spSession->Device().as(); @@ -291,24 +290,24 @@ static void GPUTensorize( descriptor.height = static_cast(tensorDescriptor.sizes[2]); descriptor.luid = spDevice->GetD3DDevice()->GetAdapterLuid(); // Converted image on GPU - // Tensorize video frames one by one without extra copy. + // Tensorize video frames one by one without extra copy. 
for (uint32_t batchIdx = 0; batchIdx < videoFrames.Size(); ++batchIdx) { auto pooledConverter = _winml::PoolObjectWrapper::Create(spDevice->TensorizerStore()->Fetch(descriptor)); { - // Apply tensorization + // Apply tensorization auto session = spSession.as(); pooledConverter->Get()->Tensorizer->VideoFrameToDX12Tensor( - resourceOffset, batchIdx, session, videoFrames.GetAt(batchIdx), bounds[batchIdx], tensorDescriptor, d3dResource + batchIdx, session, videoFrames.GetAt(batchIdx), bounds[batchIdx], tensorDescriptor, d3dResource ); - // Tensorization to a GPU tensor will run asynchronously and associated resources - // need to be kept alive until the gpu resources have been used in the queue. - // - // The PoolObjectWrapper needs to stay alive so that the underlying resources are - // not released to the cache. - // - // This object will be returned to the cache when evaluate has completed. So we cache this - // on the binding context. + // Tensorization to a GPU tensor will run asynchronously and associated resources + // need to be kept alive until the gpu resources have been used in the queue. + // + // The PoolObjectWrapper needs to stay alive so that the underlying resources are + // not released to the cache. + // + // This object will be returned to the cache when evaluate has completed. So we cache this + // on the binding context. context.converter = pooledConverter; } } @@ -324,13 +323,13 @@ std::optional ImageFeatureValue::GetIn auto spImageDescriptor = context.descriptor.try_as(); auto spTensorDescriptor = context.descriptor.try_as(); - // Set up descriptorWidth and descriptorHeight + // Set up descriptorWidth and descriptorHeight if (spImageDescriptor) { - // If model expects free dimensions the descritpr will have MAXUINT32, and we use the supplied image + // If model expects free dimensions the descritpr will have MAXUINT32, and we use the supplied image - // If the width or height in model metadata is -1, which means free dimension. 
- // The the widths and heights of input data must be the same. Or the - // tensorDescriptor cannot describ the shape of the inputs. + // If the width or height in model metadata is -1, which means free dimension. + // The the widths and heights of input data must be the same. Or the + // tensorDescriptor cannot describ the shape of the inputs. if (spImageDescriptor->Width() == MAXUINT32 && !(std::adjacent_find(m_widths.begin(), m_widths.end(), std::not_equal_to()) == m_widths.end())) { THROW_HR(E_INVALIDARG); @@ -344,7 +343,7 @@ std::optional ImageFeatureValue::GetIn descriptorHeight = (spImageDescriptor->Height() == MAXUINT32) ? m_heights[0] : spImageDescriptor->Height(); tensorKind = spImageDescriptor->TensorKind(); } else if (spTensorDescriptor) { - // If model expects a tensor, use its shape + // If model expects a tensor, use its shape auto shape = spTensorDescriptor->Shape(); if (shape.Size() != 4) { @@ -370,28 +369,28 @@ std::optional ImageFeatureValue::GetIn return {}; } - // Set up BitmapBounds - // For batch of images with different sizes, like { {1, 3, 1080, 1080}, {1, 3, 720, 720} }, - // a vector of bounds is to record the result after cropped. + // Set up BitmapBounds + // For batch of images with different sizes, like { {1, 3, 1080, 1080}, {1, 3, 720, 720} }, + // a vector of bounds is to record the result after cropped. 
std::vector bounds = {}; for (uint32_t i = 0; i < m_batchSize; ++i) { auto tempBounds = GetBoundsFromMetadata(context.properties); if (!tempBounds.has_value()) { - // If the user has not specified bounds, we need to infer the bounds - // from the combination of descriptor, and input value or output value + // If the user has not specified bounds, we need to infer the bounds + // from the combination of descriptor, and input value or output value if (context.type == _winml::BindingType::kInput) { - // If unspecified output, get the crop with correct aspect ratio + // If unspecified output, get the crop with correct aspect ratio tempBounds = CenterAndCropBounds(i, descriptorWidth, descriptorHeight); } else { - // If given an unspecified output region, write into the top left portion of the output image. + // If given an unspecified output region, write into the top left portion of the output image. tempBounds = wgi::BitmapBounds{0, 0, m_widths[i], m_heights[i]}; } } bounds.emplace_back(tempBounds.value()); } - // TODO: Validate Bounds + // TODO: Validate Bounds - // Set up BitmapPixelFormat + // Set up BitmapPixelFormat auto pixelFormat = std::optional{}; pixelFormat = GetBitmapPixelFormatFromMetadata(context.properties); if (!pixelFormat.has_value() && spImageDescriptor) { @@ -400,23 +399,23 @@ std::optional ImageFeatureValue::GetIn auto shape = spTensorDescriptor->Shape(); int channelCount = static_cast(shape.GetAt(1)); if (channelCount == 1) { - // Assume Gray if no image descriptor is given and channelcount 1 + // Assume Gray if no image descriptor is given and channelcount 1 pixelFormat = wgi::BitmapPixelFormat::Gray8; } else if (channelCount == 3) { - // Assume Bgra8 if no image descriptor is given + // Assume Bgra8 if no image descriptor is given pixelFormat = wgi::BitmapPixelFormat::Bgra8; } else { THROW_HR(WINML_ERR_SIZE_MISMATCH); } } - // Set up LearningModelPixelRange + // Set up LearningModelPixelRange auto pixelRange = std::optional{}; pixelRange = 
GetBitmapPixelRangeFromMetadata(context.properties); if (pixelRange.has_value()) { - // The pixel range was set by the bind properties, skip all checks and honor - // the user provided normalization property. Do nothing. + // The pixel range was set by the bind properties, skip all checks and honor + // the user provided normalization property. Do nothing. } else if (!pixelRange.has_value() && spImageDescriptor) { pixelRange = spImageDescriptor->PixelRange(); } else if (!pixelRange.has_value() && spTensorDescriptor) { @@ -437,17 +436,17 @@ HRESULT ImageFeatureValue::GetValue(_winml::BindingContext& context, _winml::IVa FAIL_FAST_IF(!(std::all_of(m_widths.begin(), m_widths.end(), [](int i) { return i != 0; }))); FAIL_FAST_IF(!(std::all_of(m_heights.begin(), m_heights.end(), [](int i) { return i != 0; }))); - // Get image metadata from the binding context + // Get image metadata from the binding context auto metadata = GetInputMetadata(context); RETURN_HR_IF(E_INVALIDARG, !metadata); ImageResourceMetadata resourceMetadata = metadata.value(); - // Get the session + // Get the session auto spSession = context.session.as(); auto spDevice = spSession->Device().as(); auto engine = spSession->GetEngine(); - // create the OrtValue + // create the OrtValue winrt::com_ptr<_winml::IValue> value; RETURN_IF_FAILED(engine->CreateTensorValue( resourceMetadata.TensorDescriptor.sizes, @@ -458,21 +457,19 @@ HRESULT ImageFeatureValue::GetValue(_winml::BindingContext& context, _winml::IVa value.put() )); - auto bufferSize = std::accumulate( - std::begin(resourceMetadata.TensorDescriptor.sizes), - std::end(resourceMetadata.TensorDescriptor.sizes), - static_cast(1), - std::multiplies() - ); - auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize; - - // Get the tensor raw data + // Get the tensor raw data _winml::Resource void_resource; - uint64_t offset = 0; - RETURN_IF_FAILED(value->GetResource(bufferByteSize, void_resource, offset)); 
+ RETURN_IF_FAILED(value->GetResource(void_resource)); if (context.type == _winml::BindingType::kInput) { - // Only tensorize inputs + // Only tensorize inputs + auto bufferSize = std::accumulate( + std::begin(resourceMetadata.TensorDescriptor.sizes), + std::end(resourceMetadata.TensorDescriptor.sizes), + static_cast(1), + std::multiplies() + ); + auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize; auto singleFrameBufferSize = bufferByteSize / m_batchSize; if (spDevice->IsCpuDevice()) { auto resource = reinterpret_cast(void_resource.get()); @@ -487,7 +484,7 @@ HRESULT ImageFeatureValue::GetValue(_winml::BindingContext& context, _winml::IVa } else { auto resource = reinterpret_cast(void_resource.get()); GPUTensorize( - m_videoFrames, resourceMetadata.Bounds, resourceMetadata.TensorDescriptor, spSession, resource, offset, context + m_videoFrames, resourceMetadata.Bounds, resourceMetadata.TensorDescriptor, spSession, resource, context ); } } @@ -504,28 +501,18 @@ HRESULT ImageFeatureValue::IsPlaceholder(bool* pIsPlaceHolder) { } HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& context, _winml::IValue* value) try { - // Get the device + // Get the device auto spSession = context.session.as(); auto spDevice = spSession->Device().as(); - // Get the run context + // Get the output tensor raw data + _winml::Resource void_resource; + RETURN_IF_FAILED(value->GetResource(void_resource)); + + // Get the run context auto metadata = GetInputMetadata(context); ImageResourceMetadata resourceMetadata = metadata.value(); - auto bufferSize = std::accumulate( - std::begin(resourceMetadata.TensorDescriptor.sizes), - std::end(resourceMetadata.TensorDescriptor.sizes), - static_cast(1), - std::multiplies() - ); - auto bufferByteSize = - GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize / m_batchSize; - - // Get the output tensor raw data - _winml::Resource void_resource; - 
uint64_t offset = 0; - RETURN_IF_FAILED(value->GetResource(bufferByteSize, void_resource, offset)); - _winml::ConverterResourceDescription descriptor = {}; descriptor.width = static_cast(resourceMetadata.TensorDescriptor.sizes[3]); descriptor.height = static_cast(resourceMetadata.TensorDescriptor.sizes[2]); @@ -537,9 +524,18 @@ HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& cont auto pooledConverter = _winml::PoolObjectWrapper::Create(spDevice->DetensorizerStore()->Fetch(descriptor)); + auto bufferSize = std::accumulate( + std::begin(resourceMetadata.TensorDescriptor.sizes), + std::end(resourceMetadata.TensorDescriptor.sizes), + static_cast(1), + std::multiplies() + ); + auto bufferByteSize = + GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize / m_batchSize; + BYTE* resource = reinterpret_cast(void_resource.get()); for (uint32_t batchIdx = 0; batchIdx < m_batchSize; ++batchIdx) { - // Convert Software Tensor to VideoFrame one by one based on the buffer size. + // Convert Software Tensor to VideoFrame one by one based on the buffer size. auto videoFrame = m_videoFrames.GetAt(batchIdx); pooledConverter->Get()->Detensorizer->SoftwareTensorToVideoFrame( context.session, resource, resourceMetadata.TensorDescriptor, videoFrame @@ -557,7 +553,7 @@ HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& cont for (uint32_t batchIdx = 0; batchIdx < m_batchSize; ++batchIdx) { auto videoFrame = m_videoFrames.GetAt(batchIdx); pooledConverter->Get()->Detensorizer->DX12TensorToVideoFrame( - offset, batchIdx, context.session, d3dResource, resourceMetadata.TensorDescriptor, videoFrame + batchIdx, context.session, d3dResource, resourceMetadata.TensorDescriptor, videoFrame ); // Reset the Allocator before return to the Cache. Must Sync this background thread to that completion before we do. 
diff --git a/winml/lib/Api/ImageFeatureValue.h b/winml/lib/Api/ImageFeatureValue.h index 83a21c8679cf3..92f3cab43b432 100644 --- a/winml/lib/Api/ImageFeatureValue.h +++ b/winml/lib/Api/ImageFeatureValue.h @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #pragma once diff --git a/winml/lib/Api/impl/TensorBase.h b/winml/lib/Api/impl/TensorBase.h index b8cdf6f66a587..c9299a00ddaa2 100644 --- a/winml/lib/Api/impl/TensorBase.h +++ b/winml/lib/Api/impl/TensorBase.h @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. #pragma once @@ -26,27 +26,27 @@ namespace _winml { - // TensorBase - // - // This is the base class for all data based Tensor types. It exposes array and IVectorView - // based getter and setters. - // - // Look in FeatureValue.h to see where all of them actually get created with CREATE_TENSOR() - // - // Supported derived classes: - // Float, Int8, UInt8, UInt16, Int16, Int32, Int64, Boolean, Double, UInt32, UInt64 - // - // Unsupported types - // Float16 and String have different access patterns and Int8, Complex64, Complex128 are unsupported - // +// TensorBase +// +// This is the base class for all data based Tensor types. It exposes array and IVectorView +// based getter and setters. 
+// +// Look in FeatureValue.h to see where all of them actually get created with CREATE_TENSOR() +// +// Supported derived classes: +// Float, Int8, UInt8, UInt16, Int16, Int32, Int64, Boolean, Double, UInt32, UInt64 +// +// Unsupported types +// Float16 and String have different access patterns and Int8, Complex64, Complex128 are unsupported +// template struct TensorBase : TBase { template static void ASSERT_TEMPLATE_PARAMETERS() { - // This adds compile time checks that ensure that the API can only be called when: - // 1) the first template parameter matches the internal type (T), - // since the api attempts copy the tensor memory of type T into a vector of type ElementType. - // 2) the second template parameter matches the return type + // This adds compile time checks that ensure that the API can only be called when: + // 1) the first template parameter matches the internal type (T), + // since the api attempts copy the tensor memory of type T into a vector of type ElementType. + // 2) the second template parameter matches the return type static_assert( std::is_same::value, "This API can only be called with template parameters that match its internal data type T." @@ -59,9 +59,9 @@ struct TensorBase : TBase { template static void ASSERT_TEMPLATE_PARAMETERS_EXACT() { - // This adds compile time checks that ensure that the API can only be called when: - // 1) the conditions of ASSERT_TEMPLATE_PARAMETERS() are met. - // 2) the ABI type (ViewT) matches the internal type (t). + // This adds compile time checks that ensure that the API can only be called when: + // 1) the conditions of ASSERT_TEMPLATE_PARAMETERS() are met. + // 2) the ABI type (ViewT) matches the internal type (t). ASSERT_TEMPLATE_PARAMETERS(); static_assert( @@ -70,18 +70,18 @@ struct TensorBase : TBase { ); } - /// On creation, tensors can either: - /// 1) act as a placeholder without any backing memory (output tensors, chained values). 
In this case we - /// create the backing memory when the buffer is accessed. The buffer is allocated one of there scenarios: - /// GPUTensorize during binding (used to create DML resources for chaining) - /// UpdateSourceResourceData after eval (used for output placeholder tensors or unbound outputs) - /// GetBuffer when accessed by users - /// a) TensorBase() - /// 2) allocate backing cpu memory (when a shape is provided) - /// a) TensorBase(std::vector const& shape) - /// b) TensorBase(winrt::Windows::Foundation::Collections::IIterable const& shape) - /// 3) use provided backing gpu memory - /// a) TensorBase(std::vector const& shape, ID3D12Resource* pResource) + /// On creation, tensors can either: + /// 1) act as a placeholder without any backing memory (output tensors, chained values). In this case we + /// create the backing memory when the buffer is accessed. The buffer is allocated one of there scenarios: + /// GPUTensorize during binding (used to create DML resources for chaining) + /// UpdateSourceResourceData after eval (used for output placeholder tensors or unbound outputs) + /// GetBuffer when accessed by users + /// a) TensorBase() + /// 2) allocate backing cpu memory (when a shape is provided) + /// a) TensorBase(std::vector const& shape) + /// b) TensorBase(winrt::Windows::Foundation::Collections::IIterable const& shape) + /// 3) use provided backing gpu memory + /// a) TensorBase(std::vector const& shape, ID3D12Resource* pResource) TensorBase() : resources_(std::make_shared>()) {} TensorBase(wfc::IIterable const& shape) @@ -97,7 +97,7 @@ struct TensorBase : TBase { TensorBase(std::vector const& shape, ID3D12Resource* resource) : shape_(shape), resources_(std::make_shared>()) { - // This Api is not supported for TensorString + // This Api is not supported for TensorString WINML_THROW_HR_IF_TRUE_MSG( E_ILLEGAL_METHOD_CALL, (std::is_same::value), @@ -132,7 +132,7 @@ struct TensorBase : TBase { return CreateTensorValueFromExternalBuffer(engine, 
should_sync_buffer, out); } - // If there is no matching cpu resource, then fallback to a gpu resource + // If there is no matching cpu resource, then fallback to a gpu resource if (GpuTensor() != nullptr) { return CreateGPUMLValue(GpuTensor().get(), context, out); } @@ -145,18 +145,18 @@ struct TensorBase : TBase { return CreateGPUMLValue(GpuTensor().get(), context, out); } - // Get engine + // Get engine auto session = context.session.as(); auto device = session->Device().as(); auto engine = session->GetEngine(); auto should_sync_buffer = context.type == _winml::BindingType::kInput; - // If there is no matching gpu resource, then fallback to a cpu resource + // If there is no matching gpu resource, then fallback to a cpu resource if (CpuTensor() != nullptr) { auto num_backing_buffers = CpuTensor()->num_buffers(); if (num_backing_buffers == 1) { - // If we have a single backing cpu buffer, there is no need to create GPU resources. + // If we have a single backing cpu buffer, there is no need to create GPU resources. // The engine will use the buffer provided, and perform the needed copies into the GPU context as needed. return CreateTensorValueFromExternalBuffer(engine, should_sync_buffer, out); } else { @@ -374,13 +374,11 @@ struct TensorBase : TBase { "The tensor has been closed and its resources have been detached during evaluation!" 
); + _winml::Resource updated_resource; + RETURN_IF_FAILED(value->GetResource(updated_resource)); + // get the shape RETURN_IF_FAILED_MSG(value->GetTensorShape(shape_), "Failed to get the tensor shape from resource!"); - auto buffer_size_in_bytes = static_cast(ShapeSize(shape_)) * sizeof(T); - - _winml::Resource updated_resource; - uint64_t offset = 0; - RETURN_IF_FAILED(value->GetResource(buffer_size_in_bytes, updated_resource, offset)); bool is_cpu; bool isCpuOutput = SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu; @@ -424,6 +422,8 @@ struct TensorBase : TBase { ); RETURN_IF_FAILED(engine->CopyValueAcrossDevices(value, dest.get())); } else { + auto buffer_size_in_bytes = static_cast(ShapeSize(shape_)) * sizeof(T); + _winml::ConverterResourceDescription descriptor = {}; descriptor.pixel_format = static_cast(wgdx::DirectXPixelFormat::Unknown); descriptor.luid = device->GetD3DDevice()->GetAdapterLuid(); // Converted image on GPU diff --git a/winml/lib/Common/inc/iengine.h b/winml/lib/Common/inc/iengine.h index 0a944315a1dc4..1aa857383a3b5 100644 --- a/winml/lib/Common/inc/iengine.h +++ b/winml/lib/Common/inc/iengine.h @@ -21,7 +21,7 @@ IValue : IUnknown { (bool* out) PURE; STDMETHOD(GetResource) - (uint64_t size_in_bytes, _winml::Resource & resource, uint64_t& offset) PURE; + (_winml::Resource & resource) PURE; STDMETHOD(IsTensor) (bool* out) PURE; diff --git a/winml/test/common/SqueezeNetValidator.cpp b/winml/test/common/SqueezeNetValidator.cpp index d0c43c1c9775e..2a6b3843c423c 100644 --- a/winml/test/common/SqueezeNetValidator.cpp +++ b/winml/test/common/SqueezeNetValidator.cpp @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. 
#include "SqueezeNetValidator.h" #include "protobufHelpers.h" @@ -104,7 +104,7 @@ void ModelValidator::FnsCandy16( float dataTolerance ) { ORT_UNUSED_PARAMETER(dataTolerance); - // file name strings + // file name strings static wchar_t* modelFileName = L"winmlperf_coreml_FNS-Candy_prerelease_fp16.onnx"; static wchar_t* inputDataImageFileName = L"fish_720.png"; static wchar_t* outputDataFileName = L"output.png"; @@ -115,7 +115,7 @@ void ModelValidator::FnsCandy16( auto fullModelPath = modulePath + modelFileName; auto outputFileName = modulePath + outputDataFileName; - // WinML model creation + // WinML model creation LearningModel model = nullptr; model = LearningModel::LoadFromFilePath(fullModelPath); @@ -126,7 +126,7 @@ void ModelValidator::FnsCandy16( auto fullImagePath = modulePath + inputDataImageFileName; BindImage(modelBinding, inputBindingName, fullImagePath.c_str(), bindInputsAsIInspectable); - // create the tensor for the actual output + // create the tensor for the actual output auto output = model.OutputFeatures().First().Current(); if (output.Kind() != LearningModelFeatureKind::Tensor) { throw winrt::hresult_invalid_argument(L"Model output kind is not type Tensor"); @@ -135,16 +135,16 @@ void ModelValidator::FnsCandy16( auto shape = winrt::single_threaded_vector(std::vector{1, 1}); auto outputTensor = BindImageOutput(outputBindingStrategy, modelBinding, outputDataBindingName); - // Evaluate the model + // Evaluate the model std::cout << "Calling EvaluateSync on instance" << instance << "\n"; LearningModelEvaluationResult result = nullptr; result = modelSession.Evaluate(modelBinding, {}); - // Get results + // Get results if (outputBindingStrategy == OutputBindingStrategy::Unbound) { - // When output binding strategy is unbound, the output tensor was not set on bind. - // Therefore, we need to retrieve it from the LearnignModelEvaluationResult - // TODO: is this right? outputTensorT is unused... 
+ // When output binding strategy is unbound, the output tensor was not set on bind. + // Therefore, we need to retrieve it from the LearnignModelEvaluationResult + // TODO: is this right? outputTensorT is unused... /*auto outputTensorT = */ result.Outputs().Lookup(outputDataBindingName).as(); } else { if (result.Outputs().Lookup(outputDataBindingName) != outputTensor) { @@ -171,7 +171,7 @@ void ModelValidator::SqueezeNet( OutputBindingStrategy outputBindingStrategy, bool bindInputsAsIInspectable ) { - // file name strings + // file name strings static wchar_t* modelFileName = L"model.onnx"; static wchar_t* inputDataFileName = L"test_data_0_input.pb"; static wchar_t* outputDataFileName = L"test_data_0_output.pb"; @@ -183,7 +183,7 @@ void ModelValidator::SqueezeNet( auto fullModelPath = modulePath + modelFileName; auto outputFileName = modulePath + outputDataFileName; - // WinML model creation + // WinML model creation LearningModel model = nullptr; model = LearningModel::LoadFromFilePath(fullModelPath); @@ -201,13 +201,13 @@ void ModelValidator::SqueezeNet( BindTensor(modelBinding, inputBindingName, inputTensor, bindInputsAsIInspectable); } - // load up the expected output + // load up the expected output auto expectedResultsTensor = ProtobufHelpers::LoadTensorFromProtobufFile(outputFileName, false); if (expectedResultsTensor == nullptr) { throw winrt::hresult_invalid_argument(L"Expected Results from protobuf file are null."); } - // create the tensor for the actual output + // create the tensor for the actual output auto output = model.OutputFeatures().First().Current(); if (output.Kind() != LearningModelFeatureKind::Tensor) { throw winrt::hresult_invalid_argument(L"Expected output feature kind of model to be Tensor"); @@ -216,15 +216,15 @@ void ModelValidator::SqueezeNet( auto outputTensor = BindOutput(outputBindingStrategy, modelBinding, outputDataBindingName, expectedResultsTensor.Shape()); - // Evaluate the model + // Evaluate the model std::cout << "Calling 
EvaluateSync on instance " << instance << "\n"; LearningModelEvaluationResult result = nullptr; result = modelSession.Evaluate(modelBinding, {}); - // Get results + // Get results if (outputBindingStrategy == OutputBindingStrategy::Unbound) { - // When output binding strategy is unbound, the output tensor was not set on bind. - // Therefore, we need to retrieve it from the LearnignModelEvaluationResult + // When output binding strategy is unbound, the output tensor was not set on bind. + // Therefore, we need to retrieve it from the LearnignModelEvaluationResult outputTensor = result.Outputs().Lookup(outputDataBindingName).as(); } else { if (result.Outputs().Lookup(outputDataBindingName) != outputTensor) { diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp index 3a0a91fb7e220..9b389d014c953 100644 --- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp +++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp @@ -1140,7 +1140,6 @@ static void MsftQuantizedModels() { // load a model std::wstring filePath = FileHelpers::GetModulePath() + L"coreml_Resnet50_ImageNet-dq.onnx"; LearningModel model = LearningModel::LoadFromFilePath(filePath); - LearningModelSession session(model, LearningModelDevice(LearningModelDeviceKind::DirectX)); // create a binding set LearningModelBinding binding(session); From 1a0eaa663e636ff45c518cfe90ca75efa83323e6 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 7 Aug 2023 19:33:47 -0700 Subject: [PATCH 70/76] WIP --- winml/lib/Api/ImageFeatureValue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/winml/lib/Api/ImageFeatureValue.h b/winml/lib/Api/ImageFeatureValue.h index 92f3cab43b432..83a21c8679cf3 100644 --- a/winml/lib/Api/ImageFeatureValue.h +++ b/winml/lib/Api/ImageFeatureValue.h @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. 
// Licensed under the MIT License. #pragma once From f98f2af797623a12686295b17a2e82331dc16ffb Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 8 Aug 2023 01:18:23 -0700 Subject: [PATCH 71/76] WIP --- .../onnxruntime/core/framework/execution_provider.h | 7 +++++++ .../dml/DmlExecutionProvider/src/ExecutionProvider.h | 10 ++++++++++ winml/adapter/winml_adapter_execution_provider.cpp | 6 ++++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index ea4f52f99649d..1a7e77cddee28 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -320,6 +320,13 @@ class IExecutionProvider { return default_device_; }; + /** + * Return the appropriate OrtDevice object given OrtMemType that can be used directly by external callers. + */ + virtual OrtDevice GetExternalOrtDeviceByMemType(OrtMemType mem_type) const { + return GetOrtDeviceByMemType(mem_type); + }; + /** * Create Preferred allocators for the current Execution Provider * This function is a stateless function which creates new instances of Allocator, without storing them in EP. 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 74f56acb345ed..6ee1efc5df556 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -314,6 +314,16 @@ namespace Dml return m_impl->CreatePreferredAllocators(); } + virtual OrtDevice GetExternalOrtDeviceByMemType(OrtMemType mem_type) const final + { + if (mem_type == OrtMemType::OrtMemTypeDefault) + { + return OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, 0); + } + + return GetOrtDeviceByMemType(mem_type); + } + private: ComPtr m_impl; }; diff --git a/winml/adapter/winml_adapter_execution_provider.cpp b/winml/adapter/winml_adapter_execution_provider.cpp index 52dbf9710abc7..0d3ae2f0d5ac4 100644 --- a/winml/adapter/winml_adapter_execution_provider.cpp +++ b/winml/adapter/winml_adapter_execution_provider.cpp @@ -51,7 +51,9 @@ ORT_API_STATUS_IMPL( auto inference_session = reinterpret_cast<::onnxruntime::InferenceSession*>(session); const auto execution_provider = reinterpret_cast(provider); OrtMemoryInfo mem_info( - "", OrtAllocatorType::OrtDeviceAllocator, execution_provider->GetOrtDeviceByMemType(::OrtMemType::OrtMemTypeDefault) + "", + OrtAllocatorType::OrtDeviceAllocator, + execution_provider->GetExternalOrtDeviceByMemType(::OrtMemType::OrtMemTypeDefault) ); auto allocator_ptr = inference_session->GetAllocator(mem_info); *allocator = new (std::nothrow) OrtAllocatorWrapper(allocator_ptr); @@ -66,7 +68,7 @@ ORT_API_STATUS_IMPL(winmla::GetProviderMemoryInfo, _In_ OrtExecutionProvider* pr API_IMPL_BEGIN const auto execution_provider = reinterpret_cast(provider); - auto device = execution_provider->GetOrtDeviceByMemType(::OrtMemType::OrtMemTypeDefault); + auto device = execution_provider->GetExternalOrtDeviceByMemType(::OrtMemType::OrtMemTypeDefault); *memory_info = new 
(std::nothrow) OrtMemoryInfo("", ::OrtAllocatorType::OrtDeviceAllocator, device); if (*memory_info == nullptr) { return OrtApis::CreateStatus(ORT_FAIL, "Out of memory"); From c54b29547874f3d3fd747fcc8d082472e77610b7 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 8 Aug 2023 10:53:02 -0700 Subject: [PATCH 72/76] Address PR comments --- .../src/ExecutionProvider.cpp | 32 +-- .../src/Operators/DmlOperatorCopy.cpp | 7 - .../DmlExecutionProvider/src/ReadbackHeap.cpp | 11 +- .../DmlExecutionProvider/src/ReadbackHeap.h | 3 +- winml/lib/Common/inc/iengine.h | 2 +- winml/test/adapter/AdapterDmlEpTest.cpp | 201 ++++++++++-------- winml/test/adapter/AdapterSessionTest.cpp | 16 +- 7 files changed, 139 insertions(+), 133 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 5c5f8ebf2c3d1..3fdf031cbc0c2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -515,8 +515,7 @@ namespace Dml ORT_THROW_HR_IF(E_INVALIDARG, dst.size() != src.size()); // Source and destination for batched GPU -> CPU copies - std::vector srcDatas; - std::vector srcOffsets; + std::vector srcBufferRegions; std::vector dstDatas; std::vector dataSizesInBytes; @@ -545,19 +544,12 @@ namespace Dml ORT_THROW_HR_IF(E_INVALIDARG, dataSizesInBytes.back() != ComputeByteSizeFromTensor(*src[i])); // Tensors must be the same size dstDatas.push_back(dst[i]->GetData()); - - auto srcBufferRegion = GetBufferForTensor(src[i]); - - ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource(); - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - - srcDatas.push_back(srcData); - srcOffsets.push_back(srcBufferRegion.Offset()); + srcBufferRegions.push_back(GetBufferForTensor(src[i])); } // Performs a blocking call to synchronize and read back data from 
the GPU into the destination buffer const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcBufferRegions, srcState); return S_OK; } @@ -924,11 +916,8 @@ namespace Dml Status ExecutionProviderImpl::CopyTensors(const std::vector& src_dst_pairs) const { // Source and destination for batched GPU -> CPU copies - std::vector srcDatas; - srcDatas.reserve(src_dst_pairs.size()); - - std::vector srcOffsets; - srcOffsets.reserve(src_dst_pairs.size()); + std::vector srcBufferRegions; + srcBufferRegions.reserve(src_dst_pairs.size()); std::vector dstDatas; dstDatas.reserve(src_dst_pairs.size()); @@ -973,19 +962,12 @@ namespace Dml ORT_THROW_HR_IF(E_INVALIDARG, dataSizesInBytes[i] != ComputeByteSizeFromTensor(srcWrapper)); // Tensors must be the same size dstDatas.push_back(dstWrapper.GetData()); - - auto srcBufferRegion = GetBufferForTensor(&srcWrapper); - - ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource(); - const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - - srcDatas.push_back(srcData); - srcOffsets.push_back(srcBufferRegion.Offset()); + srcBufferRegions.push_back(GetBufferForTensor(&srcWrapper)); } // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState); + m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcBufferRegions, srcState); return onnxruntime::common::Status::OK(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp index 8fa3c74674776..96fec218ed87e 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp @@ -67,13 +67,6 @@ class DmlOperatorCopy : public DmlOperator inputTensor.GetInterface().Get())); } } - -private: - // Aliasing means that both the input and the output start at the same exact offset in the same buffer - bool m_aliasing = false; - - // The choice of using Identity or a copy depends on whether the input and the input are located in the same buffer - bool m_inputSharesOutputBuffer = false; }; DML_OP_DEFINE_CREATION_FUNCTION(Copy, DmlOperatorCopy); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp index 5bb04ba4d30b5..268ad9a2b7a86 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp @@ -104,12 +104,11 @@ namespace Dml void ReadbackHeap::ReadbackFromGpu( gsl::span dst, gsl::span dstSizes, - gsl::span src, - gsl::span srcOffsets, + gsl::span srcBufferRegions, D3D12_RESOURCE_STATES srcState) { - assert(dst.size() == src.size()); - assert(dstSizes.size() == src.size()); + assert(dst.size() == srcBufferRegions.size()); + assert(dstSizes.size() == srcBufferRegions.size()); if (dst.empty()) { @@ -132,8 +131,8 @@ namespace Dml m_readbackHeap.Get(), offset, D3D12_RESOURCE_STATE_COPY_DEST, - src[i], - srcOffsets[i], + srcBufferRegions[i].GetD3D12Resource(), + srcBufferRegions[i].Offset(), srcState, dstSizes[i]); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h index 4a65ce899d791..bbc46cd0e0cb9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h @@ 
-26,8 +26,7 @@ namespace Dml void ReadbackFromGpu( gsl::span dst, gsl::span dstSizes, - gsl::span src, - gsl::span srcOffsets, + gsl::span srcBufferRegions, D3D12_RESOURCE_STATES srcState); private: diff --git a/winml/lib/Common/inc/iengine.h b/winml/lib/Common/inc/iengine.h index 1aa857383a3b5..4451382114905 100644 --- a/winml/lib/Common/inc/iengine.h +++ b/winml/lib/Common/inc/iengine.h @@ -12,7 +12,7 @@ interface IEngineFactory; using Resource = std::unique_ptr>; // clang-format off -MIDL_INTERFACE("31f39226-cfe8-4758-af38-3d01b2a33ee1") +MIDL_INTERFACE("8ac0b6b9-4561-492b-b63d-a07bdd8292c6") IValue : IUnknown { STDMETHOD(IsEmpty) (bool* out) PURE; diff --git a/winml/test/adapter/AdapterDmlEpTest.cpp b/winml/test/adapter/AdapterDmlEpTest.cpp index 2b701d51aa73b..d8d5c708f3fb1 100644 --- a/winml/test/adapter/AdapterDmlEpTest.cpp +++ b/winml/test/adapter/AdapterDmlEpTest.cpp @@ -65,7 +65,7 @@ UniqueOrtSession CreateUniqueOrtSession( return UniqueOrtSession(session, ort_api->ReleaseSession); } -UniqueOrtSession CreateDmlSession() { +UniqueOrtSession CreateDmlSession(bool bfc_allocator_enabled) { const auto session_options = CreateUniqueOrtSessionOptions(); THROW_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api); @@ -79,9 +79,10 @@ UniqueOrtSession CreateDmlSession() { command_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; WINML_EXPECT_HRESULT_SUCCEEDED(device->CreateCommandQueue(&command_queue_desc, IID_PPV_ARGS(queue.put()))); + constexpr bool metacommands_enabled = false; THROW_IF_NOT_OK_MSG( winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML( - session_options.get(), device.get(), queue.get(), false, true + session_options.get(), device.get(), queue.get(), metacommands_enabled, bfc_allocator_enabled ), ort_api ); @@ -95,26 +96,35 @@ UniqueOrtSession CreateCpuSession() { void DmlExecutionProviderSetDefaultRoundingMode() { GPUTEST; - auto session = CreateDmlSession(); - OrtExecutionProvider* ort_provider; - 
THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); - THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderSetDefaultRoundingMode(ort_provider, false), ort_api); + for (bool bfc_allocator_enabled : {false, true}) + { + auto session = CreateDmlSession(bfc_allocator_enabled); + OrtExecutionProvider* ort_provider; + THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); + THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderSetDefaultRoundingMode(ort_provider, false), ort_api); + } } void DmlExecutionProviderFlushContext() { GPUTEST; - auto session = CreateDmlSession(); - OrtExecutionProvider* ort_provider; - THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); - THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderFlushContext(ort_provider), ort_api); + for (bool bfc_allocator_enabled : {false, true}) + { + auto session = CreateDmlSession(bfc_allocator_enabled); + OrtExecutionProvider* ort_provider; + THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); + THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderFlushContext(ort_provider), ort_api); + } } void DmlExecutionProviderReleaseCompletedReferences() { GPUTEST; - auto session = CreateDmlSession(); - OrtExecutionProvider* ort_provider; - THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); - THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderReleaseCompletedReferences(ort_provider), ort_api); + for (bool bfc_allocator_enabled : {false, true}) + { + auto session = CreateDmlSession(bfc_allocator_enabled); + OrtExecutionProvider* ort_provider; + THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); + 
THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderReleaseCompletedReferences(ort_provider), ort_api); + } } constexpr std::array dimensions{1, 3, 720, 720}; @@ -175,29 +185,32 @@ void DmlGetD3D12ResourceFromAllocation() { void* gpu_allocation; THROW_IF_NOT_OK_MSG(ort_dml_api->CreateGPUAllocationFromD3DResource(d3d12_resource.get(), &gpu_allocation), ort_api); - auto session = CreateDmlSession(); - - OrtMemoryInfo* ort_memory_info; - THROW_IF_NOT_OK_MSG( - ort_api->CreateMemoryInfo( - "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info - ), - ort_api - ); - - OrtAllocator* ort_allocator; - THROW_IF_NOT_OK_MSG(ort_api->CreateAllocator(session.get(), ort_memory_info, &ort_allocator), ort_api); - auto allocator = UniqueOrtAllocator(ort_allocator, ort_api->ReleaseAllocator); - - winrt::com_ptr d3d12_resource_from_allocation; - THROW_IF_NOT_OK_MSG( - ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), gpu_allocation, d3d12_resource_from_allocation.put()), - ort_api - ); - // Ensure resource is the same - WINML_EXPECT_EQUAL(d3d12_resource, d3d12_resource_from_allocation); - - THROW_IF_NOT_OK_MSG(ort_dml_api->FreeGPUAllocation(gpu_allocation), ort_api); + for (bool bfc_allocator_enabled : {false, true}) + { + auto session = CreateDmlSession(bfc_allocator_enabled); + + OrtMemoryInfo* ort_memory_info; + THROW_IF_NOT_OK_MSG( + ort_api->CreateMemoryInfo( + "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info + ), + ort_api + ); + + OrtAllocator* ort_allocator; + THROW_IF_NOT_OK_MSG(ort_api->CreateAllocator(session.get(), ort_memory_info, &ort_allocator), ort_api); + auto allocator = UniqueOrtAllocator(ort_allocator, ort_api->ReleaseAllocator); + + winrt::com_ptr d3d12_resource_from_allocation; + THROW_IF_NOT_OK_MSG( + ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), gpu_allocation, d3d12_resource_from_allocation.put()), + ort_api + ); + // Ensure resource is the 
same + WINML_EXPECT_EQUAL(d3d12_resource, d3d12_resource_from_allocation); + + THROW_IF_NOT_OK_MSG(ort_dml_api->FreeGPUAllocation(gpu_allocation), ort_api); + } } UniqueOrtValue CreateTensorFromMemoryInfo(const OrtMemoryInfo* memory_info) { @@ -219,28 +232,34 @@ UniqueOrtValue CreateTensorFromMemoryInfo(const OrtMemoryInfo* memory_info) { void GetTensorMemoryInfo() { GPUTEST; - auto session = CreateDmlSession(); - - OrtMemoryInfo* ort_memory_info; - THROW_IF_NOT_OK_MSG( - ort_api->CreateMemoryInfo( - "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info - ), - ort_api - ); - auto tensor = CreateTensorFromMemoryInfo(ort_memory_info); - - const OrtMemoryInfo* value_memory_info; - THROW_IF_NOT_OK_MSG(ort_api->GetTensorMemoryInfo(tensor.get(), &value_memory_info), ort_api); - CreateTensorFromMemoryInfo(value_memory_info); + for (bool bfc_allocator_enabled : {false, true}) + { + auto session = CreateDmlSession(bfc_allocator_enabled); + + OrtMemoryInfo* ort_memory_info; + THROW_IF_NOT_OK_MSG( + ort_api->CreateMemoryInfo( + "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info + ), + ort_api + ); + auto tensor = CreateTensorFromMemoryInfo(ort_memory_info); + + const OrtMemoryInfo* value_memory_info; + THROW_IF_NOT_OK_MSG(ort_api->GetTensorMemoryInfo(tensor.get(), &value_memory_info), ort_api); + CreateTensorFromMemoryInfo(value_memory_info); + } } void ExecutionProviderSync() { GPUTEST; - auto session = CreateDmlSession(); - OrtExecutionProvider* ort_provider; - THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); - THROW_IF_NOT_OK_MSG(winml_adapter_api->ExecutionProviderSync(ort_provider), ort_api); + for (bool bfc_allocator_enabled : {false, true}) + { + auto session = CreateDmlSession(bfc_allocator_enabled); + OrtExecutionProvider* ort_provider; + THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 
0, &ort_provider), ort_api); + THROW_IF_NOT_OK_MSG(winml_adapter_api->ExecutionProviderSync(ort_provider), ort_api); + } } void DmlCopyTensor() { @@ -258,9 +277,11 @@ void DmlCopyTensor() { command_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; WINML_EXPECT_HRESULT_SUCCEEDED(device->CreateCommandQueue(&command_queue_desc, IID_PPV_ARGS(queue.put()))); + constexpr bool metacommands_enabled = false; + constexpr bool bfc_allocator_enabled = true; THROW_IF_NOT_OK_MSG( winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML( - session_options.get(), device.get(), queue.get(), false, true + session_options.get(), device.get(), queue.get(), metacommands_enabled, bfc_allocator_enabled ), ort_api ); @@ -322,41 +343,47 @@ void CreateCustomRegistry() { void ValueGetDeviceId() { GPUTEST; - auto session = CreateDmlSession(); - - OrtMemoryInfo* ort_memory_info; - THROW_IF_NOT_OK_MSG( - ort_api->CreateMemoryInfo( - "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info - ), - ort_api - ); - auto gpu_tensor = CreateTensorFromMemoryInfo(ort_memory_info); - - int16_t device_id; - THROW_IF_NOT_OK_MSG(winml_adapter_api->ValueGetDeviceId(gpu_tensor.get(), &device_id), ort_api); - - OrtMemoryInfo* cpu_memory_info; - THROW_IF_NOT_OK_MSG(ort_api->CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpu_memory_info), ort_api); - auto unique_cpu_memory_info = UniqueOrtMemoryInfo(cpu_memory_info, ort_api->ReleaseMemoryInfo); - auto cpu_tensor = CreateTensorFromMemoryInfo(unique_cpu_memory_info.get()); - THROW_IF_NOT_OK_MSG(winml_adapter_api->ValueGetDeviceId(cpu_tensor.get(), &device_id), ort_api); - WINML_EXPECT_EQUAL(0, device_id); + for (bool bfc_allocator_enabled : {false, true}) + { + auto session = CreateDmlSession(bfc_allocator_enabled); + + OrtMemoryInfo* ort_memory_info; + THROW_IF_NOT_OK_MSG( + ort_api->CreateMemoryInfo( + "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info + ), + 
ort_api + ); + auto gpu_tensor = CreateTensorFromMemoryInfo(ort_memory_info); + + int16_t device_id; + THROW_IF_NOT_OK_MSG(winml_adapter_api->ValueGetDeviceId(gpu_tensor.get(), &device_id), ort_api); + + OrtMemoryInfo* cpu_memory_info; + THROW_IF_NOT_OK_MSG(ort_api->CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpu_memory_info), ort_api); + auto unique_cpu_memory_info = UniqueOrtMemoryInfo(cpu_memory_info, ort_api->ReleaseMemoryInfo); + auto cpu_tensor = CreateTensorFromMemoryInfo(unique_cpu_memory_info.get()); + THROW_IF_NOT_OK_MSG(winml_adapter_api->ValueGetDeviceId(cpu_tensor.get(), &device_id), ort_api); + WINML_EXPECT_EQUAL(0, device_id); + } } void SessionGetInputRequiredDeviceId() { GPUTEST; - auto session = CreateDmlSession(); - int16_t device_id; - THROW_IF_NOT_OK_MSG( - winml_adapter_api->SessionGetInputRequiredDeviceId(session.get(), "inputImage", &device_id), ort_api - ); - - auto cpu_session = CreateCpuSession(); - THROW_IF_NOT_OK_MSG( - winml_adapter_api->SessionGetInputRequiredDeviceId(cpu_session.get(), "inputImage", &device_id), ort_api - ); - WINML_EXPECT_EQUAL(0, device_id); + for (bool bfc_allocator_enabled : {false, true}) + { + auto session = CreateDmlSession(bfc_allocator_enabled); + int16_t device_id; + THROW_IF_NOT_OK_MSG( + winml_adapter_api->SessionGetInputRequiredDeviceId(session.get(), "inputImage", &device_id), ort_api + ); + + auto cpu_session = CreateCpuSession(); + THROW_IF_NOT_OK_MSG( + winml_adapter_api->SessionGetInputRequiredDeviceId(cpu_session.get(), "inputImage", &device_id), ort_api + ); + WINML_EXPECT_EQUAL(0, device_id); + } } }// namespace diff --git a/winml/test/adapter/AdapterSessionTest.cpp b/winml/test/adapter/AdapterSessionTest.cpp index eb62c30fdeb8e..aaeb8a0b711d0 100644 --- a/winml/test/adapter/AdapterSessionTest.cpp +++ b/winml/test/adapter/AdapterSessionTest.cpp @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. 
+// Licensed under the MIT License. #include "testPch.h" @@ -103,9 +103,11 @@ void AppendExecutionProvider_DML() { const auto device = CreateD3DDevice(); const auto queue = CreateD3DQueue(device.get()); + constexpr bool metacommands_enabled = true; + constexpr bool bfc_allocator_enabled = true; THROW_IF_NOT_OK_MSG( winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML( - session_options.get(), device.get(), queue.get(), true, true + session_options.get(), device.get(), queue.get(), metacommands_enabled, bfc_allocator_enabled ), ort_api ); @@ -130,9 +132,11 @@ void GetExecutionProvider_DML() { THROW_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api); const auto device = CreateD3DDevice(); const auto queue = CreateD3DQueue(device.get()); + constexpr bool metacommands_enabled = true; + constexpr bool bfc_allocator_enabled = true; THROW_IF_NOT_OK_MSG( winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML( - session_options.get(), device.get(), queue.get(), true, true + session_options.get(), device.get(), queue.get(), metacommands_enabled, bfc_allocator_enabled ), ort_api ); @@ -142,7 +146,7 @@ void GetExecutionProvider_DML() { OrtExecutionProvider* ort_provider; THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); - // Test if DML EP method can be called + // Test if DML EP method can be called THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderFlushContext(ort_provider), ort_api); } @@ -290,9 +294,11 @@ void CopyInputAcrossDevices_DML() { THROW_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api); const auto device = CreateD3DDevice(); const auto queue = CreateD3DQueue(device.get()); + constexpr bool metacommands_enabled = true; + constexpr bool bfc_allocator_enabled = true; THROW_IF_NOT_OK_MSG( winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML( - session_options.get(), device.get(), queue.get(), true, true + 
session_options.get(), device.get(), queue.get(), metacommands_enabled, bfc_allocator_enabled ), ort_api ); From 26b4e7e81cc61c073340be0a83d4a7348d6c5911 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 8 Aug 2023 17:21:32 -0700 Subject: [PATCH 73/76] Move allocation free outside of loop --- winml/test/adapter/AdapterDmlEpTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/winml/test/adapter/AdapterDmlEpTest.cpp b/winml/test/adapter/AdapterDmlEpTest.cpp index d8d5c708f3fb1..3b6888c3db576 100644 --- a/winml/test/adapter/AdapterDmlEpTest.cpp +++ b/winml/test/adapter/AdapterDmlEpTest.cpp @@ -208,9 +208,9 @@ void DmlGetD3D12ResourceFromAllocation() { ); // Ensure resource is the same WINML_EXPECT_EQUAL(d3d12_resource, d3d12_resource_from_allocation); - - THROW_IF_NOT_OK_MSG(ort_dml_api->FreeGPUAllocation(gpu_allocation), ort_api); } + + THROW_IF_NOT_OK_MSG(ort_dml_api->FreeGPUAllocation(gpu_allocation), ort_api); } UniqueOrtValue CreateTensorFromMemoryInfo(const OrtMemoryInfo* memory_info) { From 163fe5b38d100ae490851d95a5bdf55d2253adfa Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 9 Aug 2023 11:31:21 -0700 Subject: [PATCH 74/76] Fix linting errors --- winml/test/adapter/AdapterDmlEpTest.cpp | 36 +++++++++++-------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/winml/test/adapter/AdapterDmlEpTest.cpp b/winml/test/adapter/AdapterDmlEpTest.cpp index 3b6888c3db576..6903e9f1eaca8 100644 --- a/winml/test/adapter/AdapterDmlEpTest.cpp +++ b/winml/test/adapter/AdapterDmlEpTest.cpp @@ -1,5 +1,5 @@ -// // Copyright (c) Microsoft Corporation. All rights reserved. - // // Licensed under the MIT License. +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
#include "testPch.h" @@ -96,8 +96,7 @@ UniqueOrtSession CreateCpuSession() { void DmlExecutionProviderSetDefaultRoundingMode() { GPUTEST; - for (bool bfc_allocator_enabled : {false, true}) - { + for (bool bfc_allocator_enabled : {false, true}) { auto session = CreateDmlSession(bfc_allocator_enabled); OrtExecutionProvider* ort_provider; THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); @@ -107,8 +106,7 @@ void DmlExecutionProviderSetDefaultRoundingMode() { void DmlExecutionProviderFlushContext() { GPUTEST; - for (bool bfc_allocator_enabled : {false, true}) - { + for (bool bfc_allocator_enabled : {false, true}) { auto session = CreateDmlSession(bfc_allocator_enabled); OrtExecutionProvider* ort_provider; THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); @@ -118,8 +116,7 @@ void DmlExecutionProviderFlushContext() { void DmlExecutionProviderReleaseCompletedReferences() { GPUTEST; - for (bool bfc_allocator_enabled : {false, true}) - { + for (bool bfc_allocator_enabled : {false, true}) { auto session = CreateDmlSession(bfc_allocator_enabled); OrtExecutionProvider* ort_provider; THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); @@ -185,8 +182,7 @@ void DmlGetD3D12ResourceFromAllocation() { void* gpu_allocation; THROW_IF_NOT_OK_MSG(ort_dml_api->CreateGPUAllocationFromD3DResource(d3d12_resource.get(), &gpu_allocation), ort_api); - for (bool bfc_allocator_enabled : {false, true}) - { + for (bool bfc_allocator_enabled : {false, true}) { auto session = CreateDmlSession(bfc_allocator_enabled); OrtMemoryInfo* ort_memory_info; @@ -203,7 +199,9 @@ void DmlGetD3D12ResourceFromAllocation() { winrt::com_ptr d3d12_resource_from_allocation; THROW_IF_NOT_OK_MSG( - ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), gpu_allocation, d3d12_resource_from_allocation.put()), + 
ort_dml_api->GetD3D12ResourceFromAllocation( + allocator.get(), gpu_allocation, d3d12_resource_from_allocation.put() + ), ort_api ); // Ensure resource is the same @@ -232,8 +230,7 @@ UniqueOrtValue CreateTensorFromMemoryInfo(const OrtMemoryInfo* memory_info) { void GetTensorMemoryInfo() { GPUTEST; - for (bool bfc_allocator_enabled : {false, true}) - { + for (bool bfc_allocator_enabled : {false, true}) { auto session = CreateDmlSession(bfc_allocator_enabled); OrtMemoryInfo* ort_memory_info; @@ -253,8 +250,7 @@ void GetTensorMemoryInfo() { void ExecutionProviderSync() { GPUTEST; - for (bool bfc_allocator_enabled : {false, true}) - { + for (bool bfc_allocator_enabled : {false, true}) { auto session = CreateDmlSession(bfc_allocator_enabled); OrtExecutionProvider* ort_provider; THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api); @@ -290,7 +286,7 @@ void DmlCopyTensor() { OrtExecutionProvider* dml_provider; THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &dml_provider), ort_api); - // CPU to CPU is not supported + // CPU to CPU is not supported OrtMemoryInfo* cpu_memory_info; THROW_IF_NOT_OK_MSG(ort_api->CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpu_memory_info), ort_api); auto cpu_tensor = CreateTensorFromMemoryInfo(cpu_memory_info); @@ -299,7 +295,7 @@ void DmlCopyTensor() { nullptr, winml_adapter_api->DmlCopyTensor(dml_provider, cpu_tensor.get(), dst_cpu_tensor.get()) ); - // GPU to CPU + // GPU to CPU OrtMemoryInfo* ort_memory_info; THROW_IF_NOT_OK_MSG( ort_api->CreateMemoryInfo( @@ -343,8 +339,7 @@ void CreateCustomRegistry() { void ValueGetDeviceId() { GPUTEST; - for (bool bfc_allocator_enabled : {false, true}) - { + for (bool bfc_allocator_enabled : {false, true}) { auto session = CreateDmlSession(bfc_allocator_enabled); OrtMemoryInfo* ort_memory_info; @@ -370,8 +365,7 @@ void ValueGetDeviceId() { void SessionGetInputRequiredDeviceId() { GPUTEST; - 
for (bool bfc_allocator_enabled : {false, true}) - { + for (bool bfc_allocator_enabled : {false, true}) { auto session = CreateDmlSession(bfc_allocator_enabled); int16_t device_id; THROW_IF_NOT_OK_MSG( From e6ae0587905801833d855ef855428a2ef19f2292 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 15 Aug 2023 21:43:15 -0700 Subject: [PATCH 75/76] Address PR comments --- onnxruntime/core/framework/bfc_arena.cc | 3 +-- .../src/DmlCommandRecorder.cpp | 1 - ...h => DmlReservedResourceAllocatorWrapper.h} | 4 ++-- .../src/DmlReservedResourceSubAllocator.h | 4 ---- .../src/ExecutionProvider.cpp | 18 ++---------------- .../src/ExecutionProvider.h | 3 --- .../src/IExecutionProvider.h | 2 -- .../src/MLOperatorAuthorImpl.cpp | 4 +++- .../src/MLOperatorAuthorImpl.h | 8 +++----- .../MLOperatorAuthorPrivate.h | 5 ----- .../core/providers/dml/dml_provider_factory.cc | 1 - 11 files changed, 11 insertions(+), 42 deletions(-) rename onnxruntime/core/providers/dml/DmlExecutionProvider/src/{DmlBfcAllocator.h => DmlReservedResourceAllocatorWrapper.h} (81%) diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc index 99288e6167ca7..9d58bf52de3e6 100644 --- a/onnxruntime/core/framework/bfc_arena.cc +++ b/onnxruntime/core/framework/bfc_arena.cc @@ -41,8 +41,7 @@ BFCArena::BFCArena(std::unique_ptr resource_allocator, memory_limit_ = total_memory; stats_.bytes_limit = static_cast(total_memory); - arena_extend_strategy_ = arena_extend_strategy; - UpdateFirstAllocationShrinkageLogic(); + SetArenaExtendStrategy(arena_extend_strategy); // Create a bunch of bins of various good sizes. 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index 862884c22b08c..0d2e5d1740bcc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -26,7 +26,6 @@ void DmlCommandRecorder::SetAllocator(std::weak_ptr allocator) m_allocator = allocator; } - void DmlCommandRecorder::InitializeOperator( IDMLCompiledOperator* op, const DML_BINDING_DESC& persistentResourceBinding, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceAllocatorWrapper.h similarity index 81% rename from onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h rename to onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceAllocatorWrapper.h index d8631c1e9c1d0..e92740e9ce907 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceAllocatorWrapper.h @@ -8,10 +8,10 @@ namespace Dml { - class DmlBfcAllocator : public onnxruntime::IAllocator + class DmlReservedResourceAllocatorWrapper : public onnxruntime::IAllocator { public: - DmlBfcAllocator(std::shared_ptr subAllocator) + DmlReservedResourceAllocatorWrapper(std::shared_ptr subAllocator) : onnxruntime::IAllocator( OrtMemoryInfo( onnxruntime::DML, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h index f1c71c6313dac..f705b1d3ca4b8 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h @@ -10,10 +10,6 @@ namespace Dml { - class DmlReservedResourceSubAllocator; - class DmlReservedResourceSubAllocator; - struct TaggedPointer; - // An allocator that makes logically contiguous allocations backed by D3D heaps. // // Heaps must fit entirely in either local or non-local memory. Larger heaps diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 3e701832e3ca6..f94a03add7a2f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -22,7 +22,7 @@ #include "core/framework/bfc_arena.h" #include "DmlCommittedResourceWrapper.h" #include "DmlBufferRegion.h" -#include "DmlBfcAllocator.h" +#include "DmlReservedResourceAllocatorWrapper.h" #include "DmlGpuAllocator.h" #include "DmlBuffer.h" #include "DmlTaggedPointer.h" @@ -127,18 +127,6 @@ namespace Dml return tensorWrapper->GetBufferRegion(); } - ID3D12Resource* __stdcall ExecutionProviderImpl::DecodeResource(IMLOperatorTensor* tensor) const noexcept - { - ORT_TRY - { - return GetBufferForTensor(tensor).GetD3D12Resource(); - } - ORT_CATCH_GENERIC - { - return nullptr; - } - } - // ORT release pipelines agent pools do not have 19H1 SDK installed which defines D3D_FEATURE_LEVEL_1_0_CORE. 
// Once ORT/WinML github project can be built with VS2019, we can update these pools to use install the 19H1 SDK // using the command line installer tool with VS2019 @@ -199,10 +187,8 @@ namespace Dml static std::shared_ptr CreateBfcAllocator(std::shared_ptr subAllocator) { - auto device_allocator = std::make_unique(subAllocator); - auto bfcArena = std::make_unique( - std::move(device_allocator), + std::make_unique(subAllocator), onnxruntime::BFCArena::DEFAULT_MAX_MEM, onnxruntime::ArenaExtendStrategy::kSameAsRequested, onnxruntime::BFCArena::DEFAULT_INITIAL_CHUNK_SIZE_BYTES, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index ba825a9efa919..fb91a2ce44693 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -30,7 +30,6 @@ namespace Dml class PooledUploadHeap; class ReadbackHeap; class ExecutionContext; - class DmlReservedResourceSubAllocator; class BucketizedBufferAllocator; class DmlCpuAllocator; class ExecutionProvider; @@ -137,8 +136,6 @@ namespace Dml // Allocate a resource from pools. Releasing the returned buffer returns it to the pool. 
DmlBuffer ExecutionProviderImpl::AllocatePooledResource(size_t size, AllocatorRoundingMode roundingMode) const; - STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept final; - std::shared_ptr GetKernelRegistry() const { return m_kernelRegistry; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h index 967d0cb8e6ed6..8f44694dcf7e6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h @@ -69,8 +69,6 @@ namespace Dml STDMETHOD_(D3D12_COMMAND_LIST_TYPE, GetCommandListTypeForQueue)() const noexcept = 0; STDMETHOD_(void, Flush)() const noexcept = 0; - STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept = 0; - STDMETHOD_(bool, IsMcdmDevice)() const noexcept = 0; STDMETHOD_(bool, MetacommandsEnabled)() const noexcept = 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 4b749acf4ae33..0ed3cf4005aa6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -557,7 +557,7 @@ namespace Windows::AI::MachineLearning::Adapter const AttributeMap* defaultAttributes, gsl::span requiredConstantCpuInputs, MLOperatorTensorGetter& constantInputGetter, - onnxruntime::OpKernelContext* kernelContext + const onnxruntime::OpKernelContext* kernelContext ) : OpNodeInfoWrapper(kerneInfo, inputShapeOverrides, defaultAttributes, requiredConstantCpuInputs, constantInputGetter, kernelContext), m_inferredOutputShapes(inferredOutputShapes), @@ -1806,6 +1806,8 @@ namespace Windows::AI::MachineLearning::Adapter { 
m_winmlProvider->GetABIExecutionInterfaceAndInvalidateState(isInternalOperator, m_abiExecutionObject.ReleaseAndGetAddressOf()); } + + TransitionResourcesForOperatorIfRequired(true); } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index 4f982c80c4c5c..b382a42b39c42 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -176,7 +176,7 @@ class OpNodeInfoWrapper : public Base1_t, public Base2_t, public Closable const AttributeMap* defaultAttributes, gsl::span requiredConstantCpuInputs, MLOperatorTensorGetter& constantInputGetter, - onnxruntime::OpKernelContext* kernelContext = nullptr + const onnxruntime::OpKernelContext* kernelContext = nullptr ) : m_impl(impl), m_kernelContext(kernelContext), @@ -245,7 +245,7 @@ class OpNodeInfoWrapper : public Base1_t, public Base2_t, public Closable protected: // Lifetime is managed by the caller and guaranteed to outlive this class const onnxruntime::OpNodeProtoHelper* m_impl = nullptr; - mutable onnxruntime::OpKernelContext* m_kernelContext = nullptr; + const onnxruntime::OpKernelContext* m_kernelContext = nullptr; private: template @@ -304,8 +304,6 @@ class TensorWrapper : public WRL::Base, public Closable void* m_tensorData = nullptr; bool m_isDataInterface = false; - - ID3D12Resource* m_abiDataInterface; }; class OnnxTensorWrapper : public WRL::Base, public Closable @@ -362,7 +360,7 @@ class OpKernelInfoWrapper : public OpNodeInfoWrapper< const AttributeMap* defaultAttributes, gsl::span requiredConstantCpuInputs, MLOperatorTensorGetter& constantInputGetter, - onnxruntime::OpKernelContext* kernelContext = nullptr + const onnxruntime::OpKernelContext* kernelContext = nullptr ); // HasTensorShapeDescription returns false if and only if the kernel is registered using diff --git 
a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h index 9b4536b6218b2..9909be1f8337f 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h @@ -10,11 +10,6 @@ struct DML_INPUT_GRAPH_EDGE_DESC; struct DML_OUTPUT_GRAPH_EDGE_DESC; struct DML_INTERMEDIATE_GRAPH_EDGE_DESC; -namespace onnxruntime -{ - class TensorShape; -} - // Either nodesAsOpDesc or nodesAsIDMLOperator is present. // 1) Operator kernels which implement operators using only a single DML operator will pass a DML_OPERATOR_DESC. // These kernels pass DML_OPERATOR_DESC, because while building Dml graph (inside FusedGraphKernel.cpp) we can change the diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index 1f30d7be5cf27..e4fdbdcb858c7 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -90,7 +90,6 @@ void DmlConfigureProviderFactoryBfcAllocatorEnabled(IExecutionProviderFactory* f dml_provider_factory->SetBfcAllocatorEnabled(bfc_allocator_enabled); } - bool IsSoftwareAdapter(IDXGIAdapter1* adapter) { DXGI_ADAPTER_DESC1 desc; adapter->GetDesc1(&desc); From 01d9bd27dd7b368edc17012de4fe2262b9f08127 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 16 Aug 2023 07:02:39 -0700 Subject: [PATCH 76/76] Fix lint issues --- winml/adapter/winml_adapter_dml.cpp | 2 +- .../Api.Ort/OnnxruntimeDmlSessionBuilder.cpp | 4 +-- .../Api.Ort/OnnxruntimeDmlSessionBuilder.h | 2 +- .../lib/Api.Ort/OnnxruntimeEngineBuilder.cpp | 2 +- winml/lib/Api/LearningModelSession.cpp | 28 +++++++++---------- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/winml/adapter/winml_adapter_dml.cpp b/winml/adapter/winml_adapter_dml.cpp index 
18efff94c60c8..0c4c451f4ed39 100644 --- a/winml/adapter/winml_adapter_dml.cpp +++ b/winml/adapter/winml_adapter_dml.cpp @@ -70,7 +70,7 @@ Microsoft::WRL::ComPtr CreateDmlDevice(ID3D12Device* d3d12Device) { namespace onnxruntime { void DmlConfigureProviderFactoryMetacommandsEnabled(IExecutionProviderFactory* factory, bool metacommandsEnabled); void DmlConfigureProviderFactoryBfcAllocatorEnabled(IExecutionProviderFactory* factory, bool bfc_allocator_enabled); -} // namespace onnxruntime +} // namespace onnxruntime #endif // USE_DML diff --git a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp index d9f8880349755..9de5585e4ba78 100644 --- a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. #include "lib/Api.Ort/pch.h" @@ -64,7 +64,7 @@ OnnxruntimeDmlSessionBuilder::CreateSessionOptions(OrtSessionOptions** options) winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_CPU(session_options.get(), use_arena), ort_api ); - // call release() so the underlying OrtSessionOptions object isn't freed + // call release() so the underlying OrtSessionOptions object isn't freed *options = session_options.release(); return S_OK; diff --git a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h index 659f936cbcfff..3b1ade796d80f 100644 --- a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h +++ b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. 
#pragma once diff --git a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp index e21f6836e7c4a..a055b1b02ef64 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. #include "lib/Api.Ort/pch.h" diff --git a/winml/lib/Api/LearningModelSession.cpp b/winml/lib/Api/LearningModelSession.cpp index f362dbcded26f..922420d997f6e 100644 --- a/winml/lib/Api/LearningModelSession.cpp +++ b/winml/lib/Api/LearningModelSession.cpp @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. - // Licensed under the MIT License. +// Licensed under the MIT License. #include "lib/Api/pch/pch.h" @@ -20,8 +20,8 @@ static const auto c_enable_debug_output = L"EnableDebugOutput"; namespace guid_details { - // This GUID is to be used for delimiting ML-related categories of capturable work. - // {D113B493-BBA2-4993-8608-D706A73B91CE} +// This GUID is to be used for delimiting ML-related categories of capturable work. 
+// {D113B493-BBA2-4993-8608-D706A73B91CE} struct __declspec(uuid("D113B493-BBA2-4993-8608-D706A73B91CE")) __declspec(novtable ) WINML_PIX_EVAL_CAPTURABLE_WORK_GUID {}; } // namespace guid_details @@ -61,7 +61,7 @@ LearningModelSession::LearningModelSession( WINML_CATCH_ALL _winml::IModel* LearningModelSession::GetOptimizedModel() { - // Get the model proto + // Get the model proto auto should_close_model = session_options_ != nullptr && session_options_.CloseModelOnSessionCreation(); @@ -72,18 +72,18 @@ _winml::IModel* LearningModelSession::GetOptimizedModel(bool should_close_model) com_ptr<_winml::IModel> model; { - // Lock the model detach/copy since multiple threads can access concurrently + // Lock the model detach/copy since multiple threads can access concurrently CWinMLAutoLock lock(&session_creation_lock_); - // Throw if the model has been disposed and is not capable of creating - // new sessions. + // Throw if the model has been disposed and is not capable of creating + // new sessions. auto model_impl = model_.as(); WINML_THROW_HR_IF_TRUE_MSG(E_INVALIDARG, model_impl->IsDisposed(), "The model has been disposed."); model.attach(should_close_model ? 
model_impl->DetachModel() : model_impl->CloneModel()); } - // Ensure that the model is runnable on the device + // Ensure that the model is runnable on the device auto isFloat16Supported = device_.as()->GetD3DDeviceCache()->IsFloat16Supported(); if (!isFloat16Supported) { WINML_THROW_IF_FAILED(model->ModelEnsureNoFloat16()); @@ -92,13 +92,13 @@ _winml::IModel* LearningModelSession::GetOptimizedModel(bool should_close_model) } void LearningModelSession::Initialize() { - // Begin recording session creation telemetry + // Begin recording session creation telemetry _winmlt::TelemetryEvent session_creation_event(_winmlt::EventCategory::kSessionCreation); - // Get the optimized model proto from the learning model + // Get the optimized model proto from the learning model com_ptr<_winml::IModel> model; model.attach(GetOptimizedModel()); - // Create the session builder + // Create the session builder auto device_impl = device_.as(); auto model_impl = model_.as(); @@ -121,7 +121,7 @@ void LearningModelSession::Initialize() { auto num_intra_op_threads = device_impl->NumberOfIntraOpThreads(); auto allow_spinning = device_impl->AllowSpinning(); - // Make onnxruntime apply the batch size override, if any + // Make onnxruntime apply the batch size override, if any if (session_options_) { if (session_options_.BatchSizeOverride() != 0) { WINML_THROW_IF_FAILED(engine_builder->SetBatchSizeOverride(session_options_.BatchSizeOverride())); @@ -130,7 +130,7 @@ void LearningModelSession::Initialize() { com_ptr session_options_impl = session_options_.as(); - // Make onnxruntime apply named dimension overrides, if any + // Make onnxruntime apply named dimension overrides, if any if (session_options_impl && session_options_impl->NamedDimensionOverrides().Size() > 0) { WINML_THROW_IF_FAILED(engine_builder->SetNamedDimensionOverrides(session_options_impl->NamedDimensionOverrides()) ); @@ -164,7 +164,7 @@ void LearningModelSession::Initialize() { com_ptr<_winml::IEngine> engine; 
WINML_THROW_IF_FAILED(engine_builder->CreateEngine(engine.put())); - // Register the custom operator registry + // Register the custom operator registry operator_registry_ = MLOperatorRegistry(model_impl->GetOperatorRegistry(), [](auto registry) { registry->Release(); }); WINML_THROW_IF_FAILED(engine->RegisterCustomRegistry(operator_registry_.get()));