From f5a87a4ab02bc15dbd07c366d916c7350a9cad29 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 17 Jan 2023 20:00:29 -0800
Subject: [PATCH 01/76] WIP

---
 .../src/BucketizedBufferAllocator.cpp         |  36 +-
 .../src/BucketizedBufferAllocator.h           |  22 +-
 .../src/DmlBufferRegion.cc                    | 120 +++++++
 .../src/DmlBufferRegion.h                     |  82 +++++
 .../src/DmlCommandRecorder.cpp                |  28 +-
 .../src/DmlCommittedResourceWrapper.h         |   4 +-
 .../src/DmlHeapAllocator.cpp                  | 317 ++++++++++++++++++
 .../src/DmlHeapAllocator.h                    | 134 ++++++++
 .../src/DmlReservedResourceWrapper.h          |  21 ++
 .../src/DmlResourceWrapper.h                  |   4 +-
 .../src/DmlTaggedPointer.cpp                  |  33 ++
 .../src/DmlTaggedPointer.h                    |  34 ++
 .../src/ExecutionProvider.cpp                 |  98 ++++--
 .../DmlExecutionProvider/src/ReadbackHeap.cpp |  10 +-
 .../DmlExecutionProvider/src/ReadbackHeap.h   |   4 +-
 15 files changed, 878 insertions(+), 69 deletions(-)
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 588c4ac391023..08bffae6e8a5b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -37,10 +37,6 @@ namespace Dml
     BucketizedBufferAllocator::BucketizedBufferAllocator(
         ID3D12Device* device,
         std::shared_ptr<ExecutionContext> context,
-        const D3D12_HEAP_PROPERTIES& heapProps,
-        D3D12_HEAP_FLAGS heapFlags,
-        D3D12_RESOURCE_FLAGS resourceFlags,
-        D3D12_RESOURCE_STATES initialState,
         std::unique_ptr<DmlSubAllocator>&& subAllocator
         )
         : onnxruntime::IAllocator(
@@ -51,10 +47,6 @@ namespace Dml
             )
         ),
         m_device(device),
-        m_heapProperties(heapProps),
-        m_heapFlags(heapFlags),
-        m_resourceFlags(resourceFlags),
-        m_initialState(initialState),
         m_context(context),
         m_subAllocator(std::move(subAllocator))
     {
@@ -133,7 +125,7 @@ namespace Dml
             resourceId = ++m_currentResourceId;
         }
 
-        assert(resourceWrapper->GetD3D12Resource()->GetDesc().Width == bucketSize);
+        assert(resourceWrapper->GetResourceInUavState()->GetDesc().Width == bucketSize);
         assert(resourceWrapper != nullptr);
 
         ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
@@ -174,7 +166,7 @@ namespace Dml
 
         // Free the resource to the pool if its size matches a bucket size
         gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize());
-        if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResource()->GetDesc().Width)
+        if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResourceInUavState()->GetDesc().Width)
         {
             assert(gsl::narrow_cast<gsl::index>(m_pool.size()) > bucketIndex);
 
@@ -188,9 +180,29 @@ namespace Dml
         {
             // Free the underlying allocation once queued work has completed.
 #ifdef _GAMING_XBOX
-            m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResource()).Get());
+            m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInUavState()).Get());
+
+            if (allocInfo->GetResourceInCopySrcState() != nullptr)
+            {
+                m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInCopySrcState()).Get());
+            }
+
+            if (allocInfo->GetResourceInCopyDstState() != nullptr)
+            {
+                m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInCopyDstState()).Get());
+            }
 #else
-            m_context->QueueReference(allocInfo->GetResource());
+            m_context->QueueReference(allocInfo->GetResourceInUavState());
+
+            if (allocInfo->GetResourceInCopySrcState() != nullptr)
+            {
+                m_context->QueueReference(allocInfo->GetResourceInCopySrcState());
+            }
+
+            if (allocInfo->GetResourceInCopyDstState() != nullptr)
+            {
+                m_context->QueueReference(allocInfo->GetResourceInCopyDstState());
+            }
 #endif
             allocInfo->DetachResourceWrapper();
         }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index 7e3471e276c0d..3d95bd029aad8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -46,9 +46,19 @@ namespace Dml
             return m_owner;
         }
 
-        ID3D12Resource* GetResource() const
+        ID3D12Resource* GetResourceInUavState() const
         {
-            return m_resourceWrapper->GetD3D12Resource();
+            return m_resourceWrapper->GetResourceInUavState();
+        }
+
+        ID3D12Resource* GetResourceInCopySrcState() const
+        {
+            return m_resourceWrapper->GetResourceInCopySrcState();
+        }
+
+        ID3D12Resource* GetResourceInCopyDstState() const
+        {
+            return m_resourceWrapper->GetResourceInCopyDstState();
         }
 
         ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
@@ -95,10 +105,6 @@ namespace Dml
         BucketizedBufferAllocator(
             ID3D12Device* device,
             std::shared_ptr<ExecutionContext> context,
-            const D3D12_HEAP_PROPERTIES& heapProps,
-            D3D12_HEAP_FLAGS heapFlags,
-            D3D12_RESOURCE_FLAGS resourceFlags,
-            D3D12_RESOURCE_STATES initialState,
             std::unique_ptr<DmlSubAllocator>&& subAllocator);
 
         // Returns the information associated with an opaque allocation handle returned by IAllocator::Alloc.
@@ -141,10 +147,6 @@ namespace Dml
         void FreeResource(void* p, uint64_t resourceId);
 
         ComPtr<ID3D12Device> m_device;
-        D3D12_HEAP_PROPERTIES m_heapProperties;
-        D3D12_HEAP_FLAGS m_heapFlags;
-        D3D12_RESOURCE_FLAGS m_resourceFlags;
-        D3D12_RESOURCE_STATES m_initialState;
 
         std::vector<Bucket> m_pool;
         size_t m_currentAllocationId = 0;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc
new file mode 100644
index 0000000000000..8d6fbd0551083
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc
@@ -0,0 +1,120 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "precomp.h"
+#include "DmlBufferRegion.h"
+
+namespace Dml
+{
+
+D3D12BufferRegion::D3D12BufferRegion(
+    uint64_t offset,
+    uint64_t size_in_bytes,
+    ID3D12Resource* resource_uav_state,
+    ID3D12Resource* resource_copy_src_state,
+    ID3D12Resource* resource_copy_dst_state)
+    : resource_uav_state_(resource_uav_state),
+      resource_copy_src_state_(resource_copy_src_state),
+      resource_copy_dst_state_(resource_copy_dst_state),
+      offset_(offset),
+      size_in_bytes_(size_in_bytes)
+{
+    // Pick the first non-null resource (UAV, then copy-src, then copy-dst) to
+    // validate the region against. At least one resource must be provided.
+    first_valid_resource_ = resource_uav_state_;
+    if (!first_valid_resource_)
+    {
+        first_valid_resource_ = resource_copy_src_state_;
+    }
+    if (!first_valid_resource_)
+    {
+        first_valid_resource_ = resource_copy_dst_state_;
+    }
+    ORT_THROW_HR_IF(E_UNEXPECTED, first_valid_resource_ == nullptr);
+
+    // Regions cannot be empty.
+    ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes_ == 0);
+
+    // Regions cannot extend beyond the size of the resource.
+    uint64_t buffer_size = first_valid_resource_->GetDesc().Width;
+    ORT_THROW_HR_IF(E_UNEXPECTED, offset_ >= buffer_size);
+    ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes_ > buffer_size - offset);  // subtraction form cannot wrap: offset < buffer_size
+
+    // Debug-only: every provided resource must be a buffer of that same width.
+    assert(
+        first_valid_resource_->GetDesc().Dimension ==
+        D3D12_RESOURCE_DIMENSION_BUFFER);
+    assert(
+        !resource_uav_state ||
+        (resource_uav_state->GetDesc().Dimension ==
+             D3D12_RESOURCE_DIMENSION_BUFFER &&
+         resource_uav_state->GetDesc().Width == buffer_size));
+    assert(
+        !resource_copy_src_state_ ||
+        (resource_copy_src_state_->GetDesc().Dimension ==
+             D3D12_RESOURCE_DIMENSION_BUFFER &&
+         resource_copy_src_state_->GetDesc().Width == buffer_size));
+    assert(
+        !resource_copy_dst_state_ ||
+        (resource_copy_dst_state_->GetDesc().Dimension ==
+             D3D12_RESOURCE_DIMENSION_BUFFER &&
+         resource_copy_dst_state_->GetDesc().Width == buffer_size));
+}
+
+D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that)
+{
+    std::swap(first_valid_resource_, that.first_valid_resource_);  // *this starts at its in-class defaults, so `that` ends up empty
+    std::swap(resource_uav_state_, that.resource_uav_state_);
+    std::swap(resource_copy_src_state_, that.resource_copy_src_state_);
+    std::swap(resource_copy_dst_state_, that.resource_copy_dst_state_);
+    std::swap(size_in_bytes_, that.size_in_bytes_);
+    std::swap(offset_, that.offset_);
+}
+
+D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that)
+{
+    std::swap(first_valid_resource_, that.first_valid_resource_);  // state is exchanged with `that`, not cleared
+    std::swap(resource_uav_state_, that.resource_uav_state_);
+    std::swap(resource_copy_src_state_, that.resource_copy_src_state_);
+    std::swap(resource_copy_dst_state_, that.resource_copy_dst_state_);
+    std::swap(size_in_bytes_, that.size_in_bytes_);
+    std::swap(offset_, that.offset_);
+    return *this;
+}
+
+ID3D12Resource* D3D12BufferRegion::GetResourceInUavState() const
+{
+    return resource_uav_state_;  // raw pointer; null if no UAV-state resource was supplied
+}
+
+ID3D12Resource* D3D12BufferRegion::GetResourceInCopySrcState() const
+{
+    return resource_copy_src_state_;  // raw pointer; null if no copy-source resource was supplied
+}
+
+ID3D12Resource* D3D12BufferRegion::GetResourceInCopyDstState() const
+{
+    return resource_copy_dst_state_;  // raw pointer; null if no copy-destination resource was supplied
+}
+
+uint64_t D3D12BufferRegion::Offset() const
+{
+    return (first_valid_resource_ == nullptr) ? 0 : offset_;  // a region with no resource reports 0
+}
+
+uint64_t D3D12BufferRegion::SizeInBytes() const
+{
+    return (first_valid_resource_ == nullptr) ? 0 : size_in_bytes_;  // a region with no resource reports 0
+}
+
+DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const
+{
+    // Only the UAV-state resource is bound; without one the binding is empty.
+    const bool has_uav_resource = (resource_uav_state_ != nullptr);
+
+    return has_uav_resource
+        ? DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_}
+        : DML_BUFFER_BINDING{};
+}
+
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
new file mode 100644
index 0000000000000..f8c1033261c56
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
@@ -0,0 +1,82 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace Dml
+{
+
+class D3D12HeapAllocator;
+
+// Represents a region of a D3D12 buffer resource. A buffer region has an
+// underlying ID3D12Resource* (of D3D12_RESOURCE_DIMENSION_BUFFER), an offset in
+// bytes from the beginning of that buffer, and a size in bytes of the region.
+class D3D12BufferRegion
+{
+  public:
+    D3D12BufferRegion() = default;  // empty region: no resources, offset 0, size 0
+
+    // References a region of a buffer. The respective ID3D12Resource objects
+    // must be in the appropriate states. Each resource is optional, but if more
+    // than one are provided they must map to the same region of memory.
+    D3D12BufferRegion(
+        uint64_t offset,
+        uint64_t size_in_bytes,
+        ID3D12Resource* resource_uav_state,
+        ID3D12Resource* resource_copy_src_state,
+        ID3D12Resource* resource_copy_dst_state);
+
+    // Move-only. Both move operations are implemented as member-wise swaps.
+    D3D12BufferRegion(const D3D12BufferRegion&) = delete;
+    D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete;
+    D3D12BufferRegion(D3D12BufferRegion&&);
+    D3D12BufferRegion& operator=(D3D12BufferRegion&&);
+
+    ID3D12Resource* GetResourceInUavState() const;
+
+    // NOTE: may be any state that is valid as a copy source (COPY_SRC,
+    // GENERIC_READ, or COMMON).
+    ID3D12Resource* GetResourceInCopySrcState() const;
+
+    ID3D12Resource* GetResourceInCopyDstState() const;
+
+    uint64_t Offset() const;       // 0 when the region holds no resource
+    uint64_t SizeInBytes() const;  // 0 when the region holds no resource
+
+    DML_BUFFER_BINDING GetBufferBinding() const;  // empty binding when there is no UAV-state resource
+
+    explicit operator bool() const { return first_valid_resource_ != nullptr; }
+
+    // Creates a subregion at an offset from the start of this region. A size of
+    // 0 (the default) extends the subregion to the end of the current region.
+    inline D3D12BufferRegion Subregion(
+        uint64_t offset,
+        uint64_t size_in_bytes = 0) const
+    {
+        // start of subregion must be within current region
+        ORT_THROW_HR_IF(E_UNEXPECTED, offset >= size_in_bytes_);
+        size_in_bytes =
+            size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
+        // end of subregion must be within current region
+        ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes > size_in_bytes_ - offset);
+
+        return D3D12BufferRegion(
+            offset_ + offset,  // subregion offsets are relative to this region's start
+            size_in_bytes,
+            resource_uav_state_,
+            resource_copy_src_state_,
+            resource_copy_dst_state_);
+    }
+
+  private:
+    ID3D12Resource* resource_uav_state_ = nullptr;  // raw pointers: this class takes no reference on them
+    ID3D12Resource* resource_copy_src_state_ = nullptr;
+    ID3D12Resource* resource_copy_dst_state_ = nullptr;
+    uint64_t offset_ = 0;         // byte offset from the start of the buffer
+    uint64_t size_in_bytes_ = 0;  // byte length of the region
+
+    // Pointer to the first resource above that isn't null.
+    ID3D12Resource* first_valid_resource_ = nullptr;
+};
+
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index 59ceecdc884d2..bd6a5c6b7aa17 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -10,7 +10,7 @@ using namespace Dml;
 
 DmlCommandRecorder::DmlCommandRecorder(
     ID3D12Device* d3dDevice,
-    IDMLDevice* dmlDevice, 
+    IDMLDevice* dmlDevice,
     std::shared_ptr<CommandQueue> commandQueue)
     : m_queue(std::move(commandQueue)),
       m_d3dDevice(d3dDevice),
@@ -67,7 +67,7 @@ void DmlCommandRecorder::InitializeOperator(
             ORT_THROW_HR(E_OUTOFMEMORY);
         }
 
-        ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResource();
+        ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResourceInUavState();
         allocator->Free(tempResourceHandle);
 
         // Bind the temporary resource.
@@ -143,7 +143,7 @@ void DmlCommandRecorder::ExecuteOperator(
             ORT_THROW_HR(E_OUTOFMEMORY);
         }
 
-        ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResource();
+        ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResourceInUavState();
         allocator->Free(tempResourceHandle);
 
         // Bind the temporary resource.
@@ -183,7 +183,7 @@ void DmlCommandRecorder::CopyBufferRegion(
     m_currentCommandList->CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount);
     m_operationsRecordedInCurrentCommandList = true;
 }
-    
+
 void DmlCommandRecorder::FillBufferWithPattern(
     ID3D12Resource* dstBuffer,
     gsl::span<const std::byte> value /* Data type agnostic value, treated as raw bits */)
@@ -250,11 +250,11 @@ void DmlCommandRecorder::ExecuteCommandList(
     _Outptr_ ID3D12Fence** fence,
     _Out_ uint64_t* completionValue
     )
-{    
+{
     ORT_THROW_IF_FAILED(m_currentCommandList->Close());
 
     if (m_operationsRecordedInCurrentCommandList)
-    {            
+    {
         m_pendingCommandLists.push_back(m_currentCommandList.Get());
         m_pendingCommandListsCacheable.push_back(true);
     }
@@ -290,16 +290,16 @@ void DmlCommandRecorder::ExecuteCommandList(
 }
 
 ComPtr<ID3D12GraphicsCommandList> DmlCommandRecorder::GetCommandList()
-{ 
+{
     // Assume operations are added by the caller after this returns
-    m_operationsRecordedInCurrentCommandList = true; 
-    return m_currentCommandList; 
+    m_operationsRecordedInCurrentCommandList = true;
+    return m_currentCommandList;
 }
 
 void DmlCommandRecorder::ResourceBarrier(gsl::span<const D3D12_RESOURCE_BARRIER> barriers)
 {
     m_currentCommandList->ResourceBarrier(gsl::narrow_cast<uint32_t>(barriers.size()), barriers.data());
-    m_operationsRecordedInCurrentCommandList = true; 
+    m_operationsRecordedInCurrentCommandList = true;
 }
 
 void DmlCommandRecorder::AddUAVBarrier()
@@ -307,7 +307,7 @@ void DmlCommandRecorder::AddUAVBarrier()
     #pragma warning(suppress: 6387)
     auto barrier = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
     m_currentCommandList->ResourceBarrier(1, &barrier);
-    m_operationsRecordedInCurrentCommandList = true; 
+    m_operationsRecordedInCurrentCommandList = true;
 }
 
 void DmlCommandRecorder::Open()
@@ -323,7 +323,7 @@ void DmlCommandRecorder::Open()
             m_queue->GetType(),
             allocator,
             nullptr,
-            IID_GRAPHICS_PPV_ARGS(m_currentCommandList.ReleaseAndGetAddressOf())));   
+            IID_GRAPHICS_PPV_ARGS(m_currentCommandList.ReleaseAndGetAddressOf())));
     }
     else
     {
@@ -338,7 +338,7 @@ void DmlCommandRecorder::CloseAndExecute()
     ORT_THROW_IF_FAILED(m_currentCommandList->Close());
 
     if (m_operationsRecordedInCurrentCommandList)
-    {            
+    {
         m_pendingCommandLists.push_back(m_currentCommandList.Get());
         m_pendingCommandListsCacheable.push_back(true);
     }
@@ -386,4 +386,4 @@ void DmlCommandRecorder::SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap)
         ID3D12DescriptorHeap* descriptorHeaps[] = { descriptorHeap };
         m_currentCommandList->SetDescriptorHeaps(ARRAYSIZE(descriptorHeaps), descriptorHeaps);
     }
-}
\ No newline at end of file
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
index cae206b569170..e86ca4b52b4f2 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
@@ -9,7 +9,9 @@ namespace Dml
     {
     public:
         DmlCommittedResourceWrapper(ComPtr<ID3D12Resource>&& d3d12Resource) : m_d3d12Resource(std::move(d3d12Resource)) {}
-        ID3D12Resource* GetD3D12Resource() const final { return m_d3d12Resource.Get(); }
+        ID3D12Resource* GetResourceInUavState() const final { return m_d3d12Resource.Get(); }
+        ID3D12Resource* GetResourceInCopySrcState() const final { return nullptr; }
+        ID3D12Resource* GetResourceInCopyDstState() const final { return nullptr; }
 
     private:
         ComPtr<ID3D12Resource> m_d3d12Resource;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
new file mode 100644
index 0000000000000..f56312b8ea2cf
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
@@ -0,0 +1,317 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "precomp.h"
+#include "DmlHeapAllocator.h"
+#include "DmlTaggedPointer.h"
+#include "DmlBufferRegion.h"
+#include "DmlReservedResourceWrapper.h"
+
+namespace Dml
+{
+
+static bool GetTilingEnabled(ID3D12Device* device)
+{
+    // Reports whether the device exposes tiled resources at tier 1 or above.
+    D3D12_FEATURE_DATA_D3D12_OPTIONS feature_options = {};
+    const bool query_ok = SUCCEEDED(device->CheckFeatureSupport(
+        D3D12_FEATURE_D3D12_OPTIONS,
+        &feature_options,
+        sizeof(feature_options)));
+
+    // A failed capability query is treated the same as "tiling unsupported".
+    return query_ok &&
+           feature_options.TiledResourcesTier >= D3D12_TILED_RESOURCES_TIER_1;
+}
+
+static uint64_t GetMaxHeapSizeInTiles()
+{
+    return static_cast<uint64_t>(D3D12HeapAllocator::kDefaultMaxHeapSizeInTiles);  // always the class default
+}
+
+D3D12HeapAllocator::D3D12HeapAllocator(
+    ID3D12Device* device,  // creates the heaps and resources
+    ID3D12CommandQueue* queue,  // queue on which tile mappings are updated
+    const D3D12_HEAP_PROPERTIES& heap_props,
+    D3D12_HEAP_FLAGS heap_flags,
+    D3D12_RESOURCE_FLAGS resource_flags,
+    D3D12_RESOURCE_STATES initial_state)  // state of the first ("UAV") resource of each allocation
+    : device_(device),
+      queue_(queue),
+      heap_properties_(heap_props),
+      heap_flags_(heap_flags),
+      resource_flags_(resource_flags),
+      initial_state_(initial_state),
+      tiling_enabled_(GetTilingEnabled(device)),  // true when the device reports TiledResourcesTier >= TIER_1
+      max_heap_size_in_tiles_(GetMaxHeapSizeInTiles())  // currently always kDefaultMaxHeapSizeInTiles
+{
+}
+
+absl::optional<Allocation> D3D12HeapAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes)
+{
+    // Creates three reserved buffers (one per resource state), rounded up to whole
+    // tiles and backed by heaps of at most max_heap_size_in_tiles_ tiles each.
+    // Requires size_in_bytes > 0. Returns nullopt on E_OUTOFMEMORY.
+    Allocation allocation = {};
+    const uint64_t resource_size_in_tiles =
+        1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+    const uint64_t resource_size_in_bytes =
+        resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+    auto resource_desc =
+        CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_);
+
+    ID3D12Resource** resources[] = {
+        &allocation.resource_uav_state,
+        &allocation.resource_copy_src_state,
+        &allocation.resource_copy_dst_state};
+
+    D3D12_RESOURCE_STATES states[] = {
+        initial_state_,
+        D3D12_RESOURCE_STATE_COPY_SOURCE,
+        D3D12_RESOURCE_STATE_COPY_DEST};
+
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(resources); i++)
+    {
+        HRESULT create_resource_hr = device_->CreateReservedResource(
+            &resource_desc,
+            states[i],
+            nullptr,
+            IID_PPV_ARGS(resources[i]));
+
+        if (create_resource_hr == E_OUTOFMEMORY)
+        {
+            return absl::nullopt;
+        }
+        ORT_THROW_IF_FAILED(create_resource_hr);
+    }
+
+    // Reserve enough heaps to store all tiles in the resource.
+    const uint64_t heap_count =
+        1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_;
+    allocation.heaps.resize(heap_count);
+
+    // Create heaps and map each one onto all three reserved resources.
+    D3D12_TILED_RESOURCE_COORDINATE resource_region_start_coordinates = {};
+    uint64_t unmapped_resource_tiles = resource_size_in_tiles;
+    for (uint64_t i = 0; i < heap_count; i++)
+    {
+        // Create heap. The last heap of the allocation may have fewer tiles to
+        // avoid wasting space.
+        uint64_t heap_size_in_tiles = std::min<uint64_t>(
+            unmapped_resource_tiles,
+            max_heap_size_in_tiles_);
+        uint64_t heap_size_in_bytes =
+            heap_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+        auto heap_desc = CD3DX12_HEAP_DESC(
+            heap_size_in_bytes,
+            heap_properties_,
+            0,
+            heap_flags_);
+
+        HRESULT create_heap_hr =
+            device_->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i]));
+        if (create_heap_hr == E_OUTOFMEMORY)
+        {
+            return absl::nullopt;
+        }
+        ORT_THROW_IF_FAILED(create_heap_hr);
+
+        // Source region in the resource to map.
+        D3D12_TILE_REGION_SIZE resource_region_size = {};
+        resource_region_size.NumTiles = static_cast<uint32_t>(heap_size_in_tiles);
+
+        // Target range in the current heap to map.
+        const D3D12_TILE_RANGE_FLAGS tile_range_flags =
+            D3D12_TILE_RANGE_FLAG_NONE;
+        const uint32_t heap_range_start_offset = 0;
+        const uint32_t heap_range_tile_count = static_cast<uint32_t>(heap_size_in_tiles);
+
+        constexpr uint32_t numResourceRegions = 1;
+        constexpr uint32_t numHeapRanges = 1;
+
+        // This is a brand new allocation/resource, so the tile mappings are
+        // guaranteed to be set (on the GPU timeline) by the time any code can
+        // reference the returned resource. We only execute operations on a
+        // single hardware queue so there is no need to wait or signal.
+        //
+        // All resources have identical tile mappings. The repeated call to
+        // UpdateTileMappings on all resources instead of using CopyTileMappings
+        // is intentional: the latter API is not supported by all versions of
+        // PIX.
+        for (auto resource :
+             {allocation.resource_uav_state.Get(),
+              allocation.resource_copy_src_state.Get(),
+              allocation.resource_copy_dst_state.Get()})
+        {
+            queue_->UpdateTileMappings(
+                resource,
+                numResourceRegions,
+                &resource_region_start_coordinates,
+                &resource_region_size,
+                allocation.heaps[i].Get(),
+                numHeapRanges,
+                &tile_range_flags,
+                &heap_range_start_offset,
+                &heap_range_tile_count,
+                D3D12_TILE_MAPPING_FLAG_NONE);
+        }
+
+        resource_region_start_coordinates.X += static_cast<uint32_t>(heap_size_in_tiles);
+        unmapped_resource_tiles -= heap_size_in_tiles;
+    }
+
+    assert(unmapped_resource_tiles == 0);
+
+    return allocation;
+}
+
+absl::optional<Allocation> D3D12HeapAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes)
+{
+    // Creates one heap of exactly size_in_bytes and places three buffers over it
+    // (one per state). Returns nullopt on E_OUTOFMEMORY; other failures throw.
+    Allocation allocation = {};
+    allocation.heaps.resize(1);
+    D3D12_HEAP_DESC heap_desc =
+        CD3DX12_HEAP_DESC(size_in_bytes, heap_properties_, 0, heap_flags_);
+    HRESULT create_heap_hr = device_->CreateHeap(
+        &heap_desc,
+        IID_PPV_ARGS(&allocation.heaps.front()));
+    if (create_heap_hr == E_OUTOFMEMORY)
+    {
+        return absl::nullopt;
+    }
+    ORT_THROW_IF_FAILED(create_heap_hr);
+
+    // Create large placed resource that spans the heap.
+    D3D12_RESOURCE_DESC resource_desc =
+        CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_);
+
+    ID3D12Resource** resources[] = {
+        &allocation.resource_uav_state,
+        &allocation.resource_copy_src_state,
+        &allocation.resource_copy_dst_state};
+    D3D12_RESOURCE_STATES states[] = {
+        initial_state_,
+        D3D12_RESOURCE_STATE_COPY_SOURCE,
+        D3D12_RESOURCE_STATE_COPY_DEST};
+
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(resources); i++)
+    {
+        HRESULT create_resource_hr = device_->CreatePlacedResource(
+            allocation.heaps.front().Get(),
+            0,
+            &resource_desc,
+            states[i],
+            nullptr,
+            IID_PPV_ARGS(resources[i]));
+        if (create_resource_hr == E_OUTOFMEMORY)
+        {
+            return absl::nullopt;
+        }
+        ORT_THROW_IF_FAILED(create_resource_hr);
+    }
+
+    return allocation;
+}
+
+Microsoft::WRL::ComPtr<DmlResourceWrapper> D3D12HeapAllocator::Alloc(size_t size_in_bytes)
+{
+    if (size_in_bytes == 0)
+    {
+        return nullptr;
+    }
+
+    // The D3D12 device is thread-safe, so no lock is held while creating the
+    // allocation. Both helpers return nullopt only when D3D12 reported
+    // E_OUTOFMEMORY, so that is the error surfaced to the caller.
+    absl::optional<Allocation> allocation =
+        tiling_enabled_ ? TryCreateTiledAllocation(size_in_bytes)
+                        : TryCreateUntiledAllocation(size_in_bytes);
+    ORT_THROW_HR_IF(E_OUTOFMEMORY, !allocation);
+
+    auto reservedResourceWrapper = wil::MakeOrThrow<DmlReservedResourceWrapper>(std::move(*allocation));
+    Microsoft::WRL::ComPtr<DmlResourceWrapper> resourceWrapper;
+    ORT_THROW_IF_FAILED(reservedResourceWrapper.As(&resourceWrapper));
+    return resourceWrapper;
+}
+
+void D3D12HeapAllocator::Free(void* ptr, uint64_t size_in_bytes)
+{
+    // Only a pointer to the very start of a known allocation may be freed.
+    ORT_THROW_HR_IF(E_UNEXPECTED, ptr == nullptr);
+    TaggedPointer unpacked = TaggedPointer::Unpack(ptr);
+    ORT_THROW_HR_IF(E_UNEXPECTED, unpacked.offset != 0);
+
+    // Everything below touches shared allocator state, so take the lock.
+    std::unique_lock<std::mutex> guard(mutex_);
+
+    auto entry = allocations_by_id_.find(unpacked.allocation_id);
+    ORT_THROW_HR_IF(E_UNEXPECTED, entry == allocations_by_id_.end());
+
+    // Hand the ID back to the free pool so a later reservation can reuse it.
+    ReleaseAllocationID(unpacked.allocation_id);
+
+    // Dropping the map entry destroys the Allocation stored under that ID.
+    allocations_by_id_.erase(entry);
+}
+
+D3D12BufferRegion D3D12HeapAllocator::CreateBufferRegion(
+    const void* ptr,
+    uint64_t size_in_bytes)
+{
+    ORT_THROW_HR_IF(E_UNEXPECTED, ptr == nullptr);
+
+    // The tagged pointer carries both an allocation ID and a byte offset.
+    TaggedPointer unpacked = TaggedPointer::Unpack(ptr);
+
+    // The allocation table is shared state; hold the lock while reading it.
+    std::unique_lock<std::mutex> guard(mutex_);
+
+    auto entry = allocations_by_id_.find(unpacked.allocation_id);
+    ORT_THROW_HR_IF(E_UNEXPECTED, entry == allocations_by_id_.end());
+
+    // The region receives raw pointers to the allocation's three resources.
+    Allocation& owner = entry->second;
+    return D3D12BufferRegion(
+        unpacked.offset,
+        size_in_bytes,
+        owner.resource_uav_state.Get(),
+        owner.resource_copy_src_state.Get(),
+        owner.resource_copy_dst_state.Get());
+}
+
+absl::optional<uint32_t> D3D12HeapAllocator::TryReserveAllocationID()
+{
+    // Callers are expected to hold mutex_ for the duration of this call.
+    assert(!mutex_.try_lock());
+
+    // No recycled ID is available: mint a new one, unless the ID space that
+    // fits in a tagged pointer is already exhausted.
+    if (free_allocation_ids_.empty())
+    {
+        static constexpr uint32_t kMaxAllocationID =
+            (1 << TaggedPointer::kAllocationIDBits) - 1;
+        if (current_allocation_id_ == kMaxAllocationID)
+        {
+            return absl::nullopt;
+        }
+
+        return ++current_allocation_id_;
+    }
+
+    // Otherwise reuse the most recently released ID.
+    const uint32_t recycled_id = free_allocation_ids_.back();
+    free_allocation_ids_.pop_back();
+    return recycled_id;
+}
+
+void D3D12HeapAllocator::ReleaseAllocationID(uint32_t id)
+{
+    // The mutex must already be held. NOTE(review): try_lock on a std::mutex this thread owns is UB per the standard - confirm.
+    assert(!mutex_.try_lock());
+
+    // Add it to the pool of free IDs, from which TryReserveAllocationID hands IDs back out.
+    free_allocation_ids_.push_back(id);
+}
+
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
new file mode 100644
index 0000000000000..877e4b34be6ac
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
@@ -0,0 +1,134 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "absl/container/flat_hash_map.h"
+#include "DmlSubAllocator.h"
+#include "DmlBufferRegion.h"
+
+namespace Dml
+{
+
+struct Allocation
+{
+    Microsoft::WRL::ComPtr<ID3D12Heap> heap;  // NOTE(review): undocumented; looks redundant with `heaps` below - confirm which one is used
+
+    // Heaps backing the memory for the allocation. If tiling is supported
+    // an allocation may comprise multiple heaps. If tiling is not supported
+    // an allocation will only have a single heap.
+    std::vector<Microsoft::WRL::ComPtr<ID3D12Heap>> heaps;
+
+    // Resources created over this allocation's heaps. All three resources
+    // are identical aside from being fixed in a single resource state: UAV,
+    // COPY_SRC, and COPY_DST respectively. The purpose of duplicate
+    // resources is to enable overlapping resources in different states for
+    // copying data. Most callers will not (and should not) interact
+    // directly with these resources; all three are wrapped by the buffer
+    // regions returned from this allocator, and the appropriate resource
+    // will be used automatically when performing buffer copies.
+    Microsoft::WRL::ComPtr<ID3D12Resource> resource_uav_state;
+    Microsoft::WRL::ComPtr<ID3D12Resource> resource_copy_src_state;
+    Microsoft::WRL::ComPtr<ID3D12Resource> resource_copy_dst_state;
+};
+
+// An allocator that makes logically contiguous allocations backed by D3D heaps.
+//
+// Heaps must fit entirely in either local or non-local memory. Larger heaps
+// have a greater chance of getting demoted into non-local memory, which can be
+// disastrous for performance. This problem is compounded by the fact that heaps
+// may be demoted even if overall local memory usage is within the process'
+// budget. Heaps are not necessarily mappable to discontiguous regions of
+// physical memory, which means physical memory fragmentation *may* make it
+// extremely difficult to accommodate larger heaps.
+//
+// On D3D hardware that supports tiled resource tier 1+ this class implements
+// large allocations through tiling. Each allocation is backed by however many
+// small heaps are necessary to cover the requested allocation size. Buffer
+// regions retrieved through this allocator are reserved resources that span the
+// full collection of heaps assigned to an individual allocation. Tile mappings
+// are static.
+//
+// On hardware that doesn't support tiled resources each allocation is backed by
+// a single heap. Buffer regions retrieved through this allocator are placed
+// resources that span the full heap assigned to an individual allocation. In
+// this case it is better to make more but smaller allocations (resulting in
+// smaller heaps); this fallback path is only retained as a last resort for
+// older hardware.
+class D3D12HeapAllocator : public DmlSubAllocator
+{
+  public:
+    // Maximum size of a heap (in tiles) when allocations are tiled. Each tile
+    // is 64KB. A default size of 512 tiles (32MB) does a good job of handling
+    // local video memory fragmentation without requiring lots of heaps.
+    static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512;
+
+    // The largest single allocation supported by this allocator. We use 4GB
+    // minus a MB to avoid edge cases in hw/drivers that aren't expecting such
+    // large allocations.
+    static constexpr uint64_t kDefaultMaxAllocationSizeInBytes =
+        (1ull << 32) - (1ull << 20);
+
+    D3D12HeapAllocator(
+        ID3D12Device* device,
+        ID3D12CommandQueue* queue,
+        const D3D12_HEAP_PROPERTIES& heap_props,
+        D3D12_HEAP_FLAGS heap_flags,
+        D3D12_RESOURCE_FLAGS resource_flags,
+        D3D12_RESOURCE_STATES initial_state);
+
+    // Creates a reserved or placed resource buffer over the given memory range.
+    // The physical D3D12 resource may be larger than the requested size, so
+    // callers must ensure to use the offset/size returned in the
+    // D3D12BufferRegion else risk out of bounds access. Note that in practice
+    // the ID3D12Resource is cached, so this call typically has a lower cost
+    // than a call to ID3D12Device::CreatePlacedResource or
+    // CreateReservedResource.
+    D3D12BufferRegion CreateBufferRegion(
+        const void* ptr,
+        uint64_t size_in_bytes);
+
+    Microsoft::WRL::ComPtr<DmlResourceWrapper> Alloc(size_t size_in_bytes) final;
+    void Free(void* ptr, uint64_t size_in_bytes);
+    bool TilingEnabled() const { return tiling_enabled_; };
+
+  private:
+    std::mutex mutex_;
+
+    Microsoft::WRL::ComPtr<ID3D12Device> device_;
+    Microsoft::WRL::ComPtr<ID3D12CommandQueue> queue_;
+    const D3D12_HEAP_PROPERTIES heap_properties_;
+    const D3D12_HEAP_FLAGS heap_flags_;
+    const D3D12_RESOURCE_FLAGS resource_flags_;
+    const D3D12_RESOURCE_STATES initial_state_;
+    bool tiling_enabled_;
+    uint64_t max_heap_size_in_tiles_;
+
+    // The largest allocation ID we've returned so far (or 0 if we've never done
+    // so). Note that our allocation IDs start at 1 (not 0) to ensure that it
+    // isn't possible for a valid allocation to have a pointer value of
+    // 0x00000000.
+    uint32_t current_allocation_id_ = 0;
+
+    // A list of unused allocation IDs. This is for re-use of IDs once they get
+    // freed. We only bump current_allocation_id_ once there are no more free
+    // IDs.
+    std::vector<uint32_t> free_allocation_ids_;
+
+    absl::flat_hash_map<uint32_t, Allocation> allocations_by_id_;  // keyed by the allocation ID stored in a TaggedPointer
+
+    // Retrieves a free allocation ID, or nullopt if no more IDs are available.
+    absl::optional<uint32_t> TryReserveAllocationID();
+
+    // Releases an allocation ID back to the pool of IDs.
+    void ReleaseAllocationID(uint32_t id);
+
+  private:
+    absl::optional<Allocation> TryCreateTiledAllocation(uint64_t size_in_bytes);
+    absl::optional<Allocation> TryCreateUntiledAllocation(
+        uint64_t size_in_bytes);
+
+    friend class D3D12BufferRegion;
+};
+
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
new file mode 100644
index 0000000000000..9d52c4e8c0445
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
@@ -0,0 +1,27 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "DmlResourceWrapper.h"
+#include "DmlBufferRegion.h"
+#include "DmlHeapAllocator.h"
+
+namespace Dml
+{
+    // DmlResourceWrapper implementation backed by a heap Allocation. The wrapper owns the Allocation
+    // (moved in at construction), so its heaps and three fixed-state resources live as long as the wrapper.
+    class DmlReservedResourceWrapper : public Microsoft::WRL::RuntimeClass<Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, DmlResourceWrapper>
+    {
+    public:
+        // Takes ownership of `allocation`; the caller's object is left moved-from.
+        DmlReservedResourceWrapper(Allocation&& allocation) : m_allocation(std::move(allocation)) {}
+        // Each accessor returns a non-owning pointer to the allocation's resource for the named state.
+        ID3D12Resource* GetResourceInUavState() const final { return m_allocation.resource_uav_state.Get(); }
+        ID3D12Resource* GetResourceInCopySrcState() const final { return m_allocation.resource_copy_src_state.Get(); }
+        ID3D12Resource* GetResourceInCopyDstState() const final { return m_allocation.resource_copy_dst_state.Get(); }
+
+    private:
+        Allocation m_allocation;
+    };
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
index 876487242aa37..e600cee0589d0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
@@ -11,7 +11,9 @@ namespace Dml
     DmlResourceWrapper : public IUnknown
     {
     public:
-        virtual ID3D12Resource* GetD3D12Resource() const = 0;
+        virtual ID3D12Resource* GetResourceInUavState() const = 0;
+        virtual ID3D12Resource* GetResourceInCopySrcState() const = 0;
+        virtual ID3D12Resource* GetResourceInCopyDstState() const = 0;
         virtual ~DmlResourceWrapper(){}
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
new file mode 100644
index 0000000000000..ba3f4cb85697e
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "precomp.h"
+#include "DmlTaggedPointer.h"
+#include <cassert>
+
+namespace Dml
+{
+/*static*/ TaggedPointer TaggedPointer::Unpack(const void* ptr)
+{
+    // Split the raw pointer value: the ID sits above the low kOffsetBits offset bits.
+    constexpr uint64_t kIdMask = (1ull << kAllocationIDBits) - 1;
+    constexpr uint64_t kLowMask = (1ull << kOffsetBits) - 1;
+    const uint64_t raw = reinterpret_cast<uint64_t>(ptr);
+
+    const uint64_t id_bits = (raw >> kOffsetBits) & kIdMask;
+    const uint64_t offset_bits = raw & kLowMask;
+    TaggedPointer unpacked;
+    unpacked.allocation_id = id_bits;
+    unpacked.offset = offset_bits;
+    return unpacked;
+}
+
+/*static*/ void* TaggedPointer::Pack(uint32_t allocation_id, uint64_t offset)
+{
+    // Both fields must fit in their bit ranges; the ID is placed in the bits above the offset.
+    assert(allocation_id < (1ull << kAllocationIDBits));
+    assert(offset < (1ull << kOffsetBits));
+    return reinterpret_cast<void*>(
+        (static_cast<uint64_t>(allocation_id) << kOffsetBits) | offset);
+}
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
new file mode 100644
index 0000000000000..a161007a138ea
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <climits>
+#include <cstdint>
+
+namespace Dml
+{
+
+// D3D12HeapAllocator and D3D12DescriptorHeapAllocator encode the allocation ID
+// into the high bits of the pointers they return, while the low bits are used
+// as an offset into the allocation. Note that since the layout of bitfields is
+// implementation-defined, you can't just cast a void* into a TaggedPointer: it
+// must be done using masks and shifts.
+struct TaggedPointer
+{
+    static constexpr uint64_t kAllocationIDBits = 24;  // width of the ID field (high bits of the packed value)
+    static constexpr uint64_t kOffsetBits = 40;  // width of the offset field (low bits of the packed value)
+
+    uint64_t allocation_id : kAllocationIDBits;
+    uint64_t offset : kOffsetBits;
+
+    static void* Pack(uint32_t allocation_id, uint64_t offset);  // combines ID and offset into one pointer-sized value
+    static TaggedPointer Unpack(const void* ptr);  // inverse of Pack: recovers ID and offset with masks and shifts
+};
+
+static_assert(
+    sizeof(TaggedPointer) == sizeof(void*),
+    "DML requires a 64-bit architecture");
+static_assert(TaggedPointer::kAllocationIDBits + TaggedPointer::kOffsetBits == sizeof(void*) * CHAR_BIT,
+    "DML requires a 64-bit architecture");
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 3ae8e1483141c..6dc6f046727ab 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -17,8 +17,8 @@
 #include "core/graph/indexed_sub_graph.h"
 #include "core/framework/compute_capability.h"
 #include "core/framework/fallback_cpu_capability.h"
-#include "DmlCommittedResourceAllocator.h"
 #include "DmlCommittedResourceWrapper.h"
+#include "DmlHeapAllocator.h"
 
 #ifdef ERROR
 #undef ERROR
@@ -123,7 +123,7 @@ namespace Dml
 
         const auto* allocInfo = m_allocator->DecodeDataHandle(allocation.Get());
 
-        ComPtr<ID3D12Resource> resource = allocInfo->GetResource();
+        ComPtr<ID3D12Resource> resource = allocInfo->GetResourceInUavState();
         resource.CopyTo(d3dResource);
         *pooledResource = allocation.Detach();
         return S_OK;
@@ -136,7 +136,7 @@ namespace Dml
         ORT_TRY
         {
             const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(allocation);
-            return allocInfo->GetResource();
+            return allocInfo->GetResourceInUavState();
         }
         ORT_CATCH_GENERIC
         {
@@ -178,16 +178,20 @@ namespace Dml
 
         m_context = std::make_shared<ExecutionContext>(m_d3d12Device.Get(), m_dmlDevice.Get(), queue);
 
+        auto heapAllocator = std::make_unique<D3D12HeapAllocator>(
+            m_d3d12Device.Get(),
+            queue,
+            CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
+            D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS,
+            D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+
         // Create an allocator for D3D12 buffers used to hold tensor data. The returned buffers from the allocator
         // should be DEFAULT heap buffers which can be used as UAVs, and which start in UAV state.
         m_allocator = std::make_shared<BucketizedBufferAllocator>(
             m_d3d12Device.Get(),
             m_context,
-            CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
-            D3D12_HEAP_FLAG_NONE,
-            D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            std::make_unique<DmlCommittedResourceAllocator>(m_d3d12Device.Get()));
+            std::move(heapAllocator));
 
         m_context->SetAllocator(m_allocator);
 
@@ -338,7 +342,7 @@ namespace Dml
                 {
                     assert(tensor->IsDataInterface());
                     const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(tensor).GetDataInterface().Get());
-                    ID3D12Resource* resource = allocInfo->GetResource();
+                    ID3D12Resource* resource = allocInfo->GetResourceInUavState();
                     D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc();
                     bufferBindings.push_back({ resource, 0, resourceDesc.Width });
                     bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() });
@@ -429,12 +433,19 @@ namespace Dml
             //
             const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get());
 
-            ID3D12Resource* dstData = dstAllocInfo->GetResource();
+            ID3D12Resource* dstData = dstAllocInfo->GetResourceInCopyDstState() == nullptr
+                ? dstAllocInfo->GetResourceInUavState()
+                : dstAllocInfo->GetResourceInCopyDstState();
+
+            // When resources in dst state exist (e.g. reserved resources), we can avoid barriers. Otherwise,
+            // take the slower path of adding a barrier (e.g. committed resources).
+            const auto dstState = dstAllocInfo->GetResourceInCopyDstState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_DEST;
+
             const void* srcData = src->GetData();
 
             const uint64_t dstOffset = 0;
-            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; // GPU resources are always kept in UAV state
-
             m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(srcData, dataSizeInBytes));
         }
         else if (!src->IsCpuData() && dst->IsCpuData())
@@ -446,10 +457,17 @@ namespace Dml
             void* dstData = dst->GetData();
             const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get());
 
-            ID3D12Resource* srcData = srcAllocInfo->GetResource();
+            ID3D12Resource* srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr
+                ? srcAllocInfo->GetResourceInUavState()
+                : srcAllocInfo->GetResourceInCopySrcState();
+
+            // When resources in src state exist (e.g. reserved resources), we can avoid barriers. Otherwise,
+            // take the slower path of adding a barrier (e.g. committed resources).
+            const auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
 
             const uint64_t srcOffset = 0;
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; // GPU resources are always kept in UAV state
 
             // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
             m_readbackHeap->ReadbackFromGpu(AsByteSpan(dstData, dataSizeInBytes), srcData, srcOffset, srcState);
@@ -462,9 +480,25 @@ namespace Dml
             const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get());
             const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get());
 
-            ID3D12Resource* srcData = srcAllocInfo->GetResource();
-            ID3D12Resource* dstData = dstAllocInfo->GetResource();
-            m_context->CopyBufferRegion(dstData, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, srcData, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, dataSizeInBytes);
+            ID3D12Resource* srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr
+                ? srcAllocInfo->GetResourceInUavState()
+                : srcAllocInfo->GetResourceInCopySrcState();
+
+            ID3D12Resource* dstData = dstAllocInfo->GetResourceInCopyDstState() == nullptr
+                ? dstAllocInfo->GetResourceInUavState()
+                : dstAllocInfo->GetResourceInCopyDstState();
+
+            // When resources in src and dst state exist (e.g. reserved resources), we can avoid barriers. Otherwise,
+            // take the slower path of adding a barrier (e.g. committed resources).
+            const auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+
+            const auto dstState = dstAllocInfo->GetResourceInCopyDstState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_DEST;
+
+            m_context->CopyBufferRegion(dstData, 0, dstState, srcData, 0, srcState, dataSizeInBytes);
         }
         else
         {
@@ -488,7 +522,7 @@ namespace Dml
         if (mlTensor != nullptr)
         {
             const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(mlTensor.Get());
-            ID3D12Resource* dstData = dstAllocInfo->GetResource();
+            ID3D12Resource* dstData = dstAllocInfo->GetResourceInUavState();
             m_context->FillBufferWithPattern(dstData, rawValue);
         }
 
@@ -734,8 +768,16 @@ namespace Dml
     {
         // Source and destination for batched GPU -> CPU copies
         std::vector<ID3D12Resource*> srcDatas;
+        srcDatas.reserve(src_dst_pairs.size());
+
+        std::vector<D3D12_RESOURCE_STATES> srcStates;
+        srcStates.reserve(src_dst_pairs.size());
+
         std::vector<void*> dstDatas;
+        dstDatas.reserve(src_dst_pairs.size());
+
         std::vector<uint32_t> dataSizesInBytes;
+        dataSizesInBytes.reserve(src_dst_pairs.size());
 
         assert(!m_closed);
         auto provider = const_cast<ExecutionProviderImpl*>(this);
@@ -776,14 +818,22 @@ namespace Dml
             dstDatas.push_back(dstWrapper.GetData());
             const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(&srcWrapper).GetDataInterface().Get());
 
-            srcDatas.push_back(srcAllocInfo->GetResource());
+            auto srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr
+                ? srcAllocInfo->GetResourceInUavState()
+                : srcAllocInfo->GetResourceInCopySrcState();
+
+            auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+
+            srcDatas.push_back(srcData);
+            srcStates.push_back(srcState);
         }
 
         const uint64_t srcOffset = 0;
-        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; // GPU resources are always kept in UAV state
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcState);
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcStates);
 
         return onnxruntime::common::Status::OK();
     }
@@ -836,10 +886,10 @@ namespace Dml
         else
         {
 #ifdef _GAMING_XBOX
-            ComPtr<GraphicsUnknownWrapper> wrappedResource = Microsoft::WRL::Make<GraphicsUnknownWrapper>(m_allocator->DecodeDataHandle(data)->GetResource());
+            ComPtr<GraphicsUnknownWrapper> wrappedResource = Microsoft::WRL::Make<GraphicsUnknownWrapper>(m_allocator->DecodeDataHandle(data)->GetResourceInUavState());
             *abiData = wrappedResource.Detach();
 #else
-            ComPtr<ID3D12Resource> resource = m_allocator->DecodeDataHandle(data)->GetResource();
+            ComPtr<ID3D12Resource> resource = m_allocator->DecodeDataHandle(data)->GetResourceInUavState();
             *abiData = resource.Detach();
 #endif
         }
@@ -976,7 +1026,7 @@ namespace Dml
     ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr)
     {
         Dml::BucketizedBufferAllocator* pAllocationInfo = static_cast<Dml::BucketizedBufferAllocator*>(allocator);
-        return pAllocationInfo->DecodeDataHandle(ptr)->GetResource();
+        return pAllocationInfo->DecodeDataHandle(ptr)->GetResourceInUavState();
     }
 
     void FlushContext(onnxruntime::IExecutionProvider* provider)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
index 31aacc3787818..590dffef488e4 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
@@ -48,7 +48,7 @@ namespace Dml
         return newCapacity;
     }
 
-    void ReadbackHeap::EnsureReadbackHeap(size_t size) 
+    void ReadbackHeap::EnsureReadbackHeap(size_t size)
     {
         if (!m_readbackHeap)
         {
@@ -76,7 +76,7 @@ namespace Dml
         D3D12_RESOURCE_STATES srcState)
     {
         assert(!dst.empty());
-        
+
         EnsureReadbackHeap(dst.size());
 
         // Copy from the source resource into the readback heap
@@ -100,12 +100,12 @@ namespace Dml
         memcpy(dst.data(), readbackHeapData, dst.size());
         m_readbackHeap->Unmap(0, nullptr);
     }
-    
+
     void ReadbackHeap::ReadbackFromGpu(
         gsl::span<void*> dst,
         gsl::span<const uint32_t > dstSizes,
         gsl::span<ID3D12Resource*> src,
-        D3D12_RESOURCE_STATES srcState)
+        gsl::span<const D3D12_RESOURCE_STATES> srcStates)
     {
         assert(dst.size() == src.size());
         assert(dstSizes.size() == src.size());
@@ -133,7 +133,7 @@ namespace Dml
                 D3D12_RESOURCE_STATE_COPY_DEST,
                 src[i],
                 0,
-                srcState,
+                srcStates[i],
                 dstSizes[i]);
 
             offset += dstSizes[i];
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
index c596d982b7931..9727dc6ac8752 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
@@ -21,13 +21,13 @@ namespace Dml
             ID3D12Resource* src,
             uint64_t srcOffset,
             D3D12_RESOURCE_STATES srcState);
-        
+
         // Overload supporting batching
         void ReadbackFromGpu(
             gsl::span<void*> dst,
             gsl::span<const uint32_t > dstSizes,
             gsl::span<ID3D12Resource*> src,
-            D3D12_RESOURCE_STATES srcState);
+            gsl::span<const D3D12_RESOURCE_STATES> srcStates);
 
     private:
         void EnsureReadbackHeap(size_t size);

From 707c1c92f8db0257d501ab378ed71d14edf6dd00 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 17 Jan 2023 22:25:34 -0800
Subject: [PATCH 02/76] WIP

---
 .../dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 08bffae6e8a5b..10874b0611f7f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -125,7 +125,6 @@ namespace Dml
             resourceId = ++m_currentResourceId;
         }
 
-        assert(resourceWrapper->GetResourceInUavState()->GetDesc().Width == bucketSize);
         assert(resourceWrapper != nullptr);
 
         ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(

From 0619fa37e9d15b13d0a53341a41faca0b8de09ee Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 17 Jan 2023 23:50:39 -0800
Subject: [PATCH 03/76] WIP

---
 .../DmlExecutionProvider/src/BucketizedBufferAllocator.cpp  | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 10874b0611f7f..c1fa576c48574 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -167,7 +167,11 @@ namespace Dml
         gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize());
         if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResourceInUavState()->GetDesc().Width)
         {
-            assert(gsl::narrow_cast<gsl::index>(m_pool.size()) > bucketIndex);
+            if (gsl::narrow_cast<gsl::index>(m_pool.size()) <= bucketIndex)
+            {
+                // Ensure there are sufficient buckets
+                m_pool.resize(bucketIndex + 1);
+            }
 
             // Return the resource to the bucket
             Bucket* bucket = &m_pool[bucketIndex];

From 6b62b7228197c4d0a89315e6f118723ee455b733 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 18 Jan 2023 22:07:32 -0800
Subject: [PATCH 04/76] WIP

---
 .../src/BucketizedBufferAllocator.cpp         | 41 +++++--------------
 .../src/DmlCommandRecorder.cpp                | 15 +++++--
 .../src/DmlCommittedResourceAllocator.cpp     | 28 -------------
 .../src/DmlCommittedResourceAllocator.h       | 21 ----------
 .../src/DmlHeapAllocator.cpp                  | 10 +++++
 .../src/DmlHeapAllocator.h                    |  1 +
 .../src/DmlSubAllocator.h                     |  1 +
 .../src/ExecutionContext.cpp                  | 32 +++++++--------
 .../cppwinrt/scenariotestscppwinrt.cpp        | 12 ++++--
 9 files changed, 58 insertions(+), 103 deletions(-)
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index c1fa576c48574..417d2639dad31 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -84,16 +84,18 @@ namespace Dml
 
         ComPtr<DmlResourceWrapper> resourceWrapper;
         uint64_t resourceId = 0;
-        uint64_t bucketSize = 0;
+
+        // Find the bucket for this allocation size
+        gsl::index bucketIndex = GetBucketIndexFromSize(size);
+
+        // Some sub allocators have their own rounding mechanisms or alignment requirements of resources
+        uint64_t bucketSize = m_subAllocator->ComputeRequiredSize(GetBucketSizeFromIndex(bucketIndex));
 
         // Use a pooled resource if the size (post rounding, if requested) matches a bucket size
-        if (m_defaultRoundingMode == AllocatorRoundingMode::Enabled || size == GetBucketSizeFromIndex(GetBucketIndexFromSize(size)))
+        if (m_defaultRoundingMode == AllocatorRoundingMode::Enabled || size == bucketSize)
         {
             Bucket* bucket = nullptr;
 
-            // Find the bucket for this allocation size
-            gsl::index bucketIndex = GetBucketIndexFromSize(size);
-
             if (gsl::narrow_cast<gsl::index>(m_pool.size()) <= bucketIndex)
             {
                 // Ensure there are sufficient buckets
@@ -101,7 +103,6 @@ namespace Dml
             }
 
             bucket = &m_pool[bucketIndex];
-            bucketSize = GetBucketSizeFromIndex(bucketIndex);
 
             if (bucket->resources.empty())
             {
@@ -120,12 +121,13 @@ namespace Dml
         else
         {
             // The allocation will not be pooled.  Construct a new one
-            bucketSize = (size + 3) & ~3;
+            bucketSize = m_subAllocator->ComputeRequiredSize(size);
             resourceWrapper = m_subAllocator->Alloc(bucketSize);
             resourceId = ++m_currentResourceId;
         }
 
         assert(resourceWrapper != nullptr);
+        assert(resourceWrapper->GetResourceInUavState()->GetDesc().Width == bucketSize);
 
         ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
             this,
@@ -183,31 +185,10 @@ namespace Dml
         {
             // Free the underlying allocation once queued work has completed.
 #ifdef _GAMING_XBOX
-            m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInUavState()).Get());
-
-            if (allocInfo->GetResourceInCopySrcState() != nullptr)
-            {
-                m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInCopySrcState()).Get());
-            }
-
-            if (allocInfo->GetResourceInCopyDstState() != nullptr)
-            {
-                m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResourceInCopyDstState()).Get());
-            }
+            m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->DetachResourceWrapper().Get()).Get());
 #else
-            m_context->QueueReference(allocInfo->GetResourceInUavState());
-
-            if (allocInfo->GetResourceInCopySrcState() != nullptr)
-            {
-                m_context->QueueReference(allocInfo->GetResourceInCopySrcState());
-            }
-
-            if (allocInfo->GetResourceInCopyDstState() != nullptr)
-            {
-                m_context->QueueReference(allocInfo->GetResourceInCopyDstState());
-            }
+            m_context->QueueReference(allocInfo->DetachResourceWrapper().Get());
 #endif
-            allocInfo->DetachResourceWrapper();
         }
 
     #if _DEBUG
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index bd6a5c6b7aa17..d16c0201743db 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -101,8 +101,10 @@ void DmlCommandRecorder::InitializeOperator(
     if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) ||
         (temporaryResourceSize > 0))
     {
-        auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
-        m_currentCommandList->ResourceBarrier(1, &uav);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
     }
 }
 
@@ -168,8 +170,13 @@ void DmlCommandRecorder::ExecuteOperator(
     // Barrier all outputs.
     #pragma warning(push)
     #pragma warning(disable: 6387)
-    auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
-    m_currentCommandList->ResourceBarrier(1, &uav);
+
+    // Barrier all outputs.
+    D3D12_RESOURCE_BARRIER barriers[] = {
+        CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+        CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+    m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
+
     #pragma warning(pop)
 }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp
deleted file mode 100644
index d9bfdc3473ca7..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "precomp.h"
-#include "DmlCommittedResourceAllocator.h"
-#include "DmlResourceWrapper.h"
-#include "DmlCommittedResourceWrapper.h"
-
-namespace Dml
-{
-    ComPtr<DmlResourceWrapper> DmlCommittedResourceAllocator::Alloc(size_t size)
-    {
-        ComPtr<ID3D12Resource> resource;
-        auto buffer = CD3DX12_RESOURCE_DESC::Buffer(size, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
-        ORT_THROW_IF_FAILED(m_device->CreateCommittedResource(
-            &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
-            D3D12_HEAP_FLAG_NONE,
-            &buffer,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            nullptr,
-            IID_GRAPHICS_PPV_ARGS(resource.GetAddressOf())
-        ));
-
-        ComPtr<DmlResourceWrapper> resourceWrapper;
-        wil::MakeOrThrow<DmlCommittedResourceWrapper>(std::move(resource)).As(&resourceWrapper);
-        return resourceWrapper;
-    }
-}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h
deleted file mode 100644
index 7ad48be32a6c9..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include "DmlSubAllocator.h"
-
-namespace Dml
-{
-    struct DmlResourceWrapper;
-
-    class DmlCommittedResourceAllocator : public DmlSubAllocator
-    {
-    public:
-        DmlCommittedResourceAllocator(ID3D12Device* device) : m_device(device) {}
-        Microsoft::WRL::ComPtr<DmlResourceWrapper> Alloc(size_t size) final;
-
-    private:
-        ID3D12Device* m_device = nullptr;
-    };
-}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
index f56312b8ea2cf..2ba44de85b2a8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
@@ -214,6 +214,16 @@ absl::optional<Allocation> D3D12HeapAllocator::TryCreateUntiledAllocation(uint64
     return allocation;
 }
 
+uint64_t D3D12HeapAllocator::ComputeRequiredSize(size_t size)
+{
+    const uint64_t resource_size_in_tiles =
+        1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+    const uint64_t resource_size_in_bytes =
+        resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+
+    return resource_size_in_bytes;
+}
+
 Microsoft::WRL::ComPtr<DmlResourceWrapper> D3D12HeapAllocator::Alloc(size_t size_in_bytes)
 {
     if (size_in_bytes == 0)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
index 877e4b34be6ac..b15eeff3575fe 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
@@ -89,6 +89,7 @@ class D3D12HeapAllocator : public DmlSubAllocator
         uint64_t size_in_bytes);
 
     Microsoft::WRL::ComPtr<DmlResourceWrapper> Alloc(size_t size_in_bytes) final;
+    uint64_t ComputeRequiredSize(size_t size) final;
     void Free(void* ptr, uint64_t size_in_bytes);
     bool TilingEnabled() const { return tiling_enabled_; };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h
index cfdaf17710001..033fb15388066 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h
@@ -11,6 +11,7 @@ namespace Dml
     {
     public:
         virtual Microsoft::WRL::ComPtr<DmlResourceWrapper> Alloc(size_t size) = 0;
+        virtual uint64_t ComputeRequiredSize(size_t size) = 0;
         virtual ~DmlSubAllocator(){}
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index a894d0660d6ff..1d41d26cf0062 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -15,7 +15,7 @@ namespace Dml
         : m_queue(std::make_shared<CommandQueue>(queue))
         , m_dmlRecorder(d3d12Device, dmlDevice, m_queue)
     {
-        ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf())));        
+        ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf())));
     }
 
     void ExecutionContext::SetAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator)
@@ -55,15 +55,15 @@ namespace Dml
         m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount);
 
         // Reset barrier state
-        if (!barriers.empty())
+        for (auto& barrier : barriers)
         {
-            for (auto& barrier : barriers)
-            {
-                std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter);
-            }
-
-            m_dmlRecorder.ResourceBarrier(barriers);
+            std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter);
         }
+
+        // Since this copy may write to GPU memory, we also need to perform an
+        // aliasing barrier
+        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
+        m_dmlRecorder.ResourceBarrier(barriers);
     }
 
     void ExecutionContext::FillBufferWithPattern(
@@ -78,14 +78,14 @@ namespace Dml
         ID3D12GraphicsCommandList* commandList,
         _Outptr_ ID3D12Fence** fence,
         _Out_ uint64_t* completionValue
-        ) 
+        )
     {
         assert(!m_closed);
 
         SetCommandRecorder(&m_dmlRecorder);
         m_dmlRecorder.ExecuteCommandList(commandList, fence, completionValue);
     }
-       
+
     void ExecutionContext::InitializeOperator(
         IDMLCompiledOperator* op,
         const DML_BINDING_DESC& persistentResourceBinding,
@@ -110,7 +110,7 @@ namespace Dml
     }
 
     void ExecutionContext::AddUAVBarrier()
-    {        
+    {
         assert(!m_closed);
         SetCommandRecorder(&m_dmlRecorder);
 
@@ -173,9 +173,9 @@ namespace Dml
         m_currentRecorder = nullptr;
         SetCommandRecorder(&m_dmlRecorder);
     }
-    
-    void ExecutionContext::QueueReference(IUnknown* object) 
-    {              
+
+    void ExecutionContext::QueueReference(IUnknown* object)
+    {
         assert(!m_closed);
         // If something has been recorded into a command list but not submitted yet, it means that the *next* fence
         // value is the one to signal completion.
@@ -186,14 +186,14 @@ namespace Dml
     void ExecutionContext::Close()
     {
         assert(!m_closed);
-        
+
         // Discard unflushed work and clear queued references.  This prevents the circular reference:
         // Kernel --> ProviderImpl -->  Context --> QueuedRefs --> Kernel
         m_queue->Close();
         m_currentRecorder = nullptr;
         m_closed = true;
     }
-    
+
     GpuEvent ExecutionContext::GetCurrentCompletionEvent()
     {
         assert(!m_closed);
diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
index 5d3561076c6aa..cb195acd33090 100644
--- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
+++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
@@ -1114,7 +1114,11 @@ static void MsftQuantizedModels() {
   // load a model
   std::wstring filePath = FileHelpers::GetModulePath() + L"coreml_Resnet50_ImageNet-dq.onnx";
   LearningModel model = LearningModel::LoadFromFilePath(filePath);
-  LearningModelSession session(model, LearningModelDevice(LearningModelDeviceKind::DirectX));
+
+  auto device = LearningModelDevice(LearningModelDeviceKind::DirectX);
+  device.as<IMetacommandsController>()->SetMetacommandsEnabled(false);
+
+  LearningModelSession session(model, device);
   // create a binding set
   LearningModelBinding binding(session);
   // bind the input and the output buffers by name
@@ -1525,7 +1529,7 @@ static void BindMultipleCPUBuffersAsInputs(LearningModelDeviceKind kind) {
   buffers.Append(wss::Buffer::CreateCopyFromMemoryBuffer(red));
   buffers.Append(wss::Buffer::CreateCopyFromMemoryBuffer(green));
   buffers.Append(wss::Buffer::CreateCopyFromMemoryBuffer(blue));
-  
+
   // Bind input
   binding.Bind(model.InputFeatures().First().Current().Name(), buffers);
 
@@ -1627,7 +1631,7 @@ static void BindMultipleCPUBuffersAsOutputs(LearningModelDeviceKind kind) {
   red_buffer.try_as<::Windows::Storage::Streams::IBufferByteAccess>()->Buffer(reinterpret_cast<byte**>(&red_bytes));
   green_buffer.try_as<::Windows::Storage::Streams::IBufferByteAccess>()->Buffer(reinterpret_cast<byte**>(&green_bytes));
   blue_buffer.try_as<::Windows::Storage::Streams::IBufferByteAccess>()->Buffer(reinterpret_cast<byte**>(&blue_bytes));
-  
+
   // Verify the output by comparing with the benchmark image
   SoftwareBitmap benchmark_bitmap = FileHelpers::GetSoftwareBitmapFromFile(bmImagePath);
   benchmark_bitmap = SoftwareBitmap::Convert(benchmark_bitmap, BitmapPixelFormat::Bgra8);
@@ -1638,7 +1642,7 @@ static void BindMultipleCPUBuffersAsOutputs(LearningModelDeviceKind kind) {
   wf::IMemoryBufferReference benchmark_reference = benchmark_bitmap_buffer.CreateReference();
   auto benchmark_byte_access = benchmark_reference.as<::Windows::Foundation::IMemoryBufferByteAccess>();
   benchmark_byte_access->GetBuffer(&benchmark_data, &benchmark_size);
-  
+
   // hard code, might need to be modified later.
   const float cMaxErrorRate = 0.06f;
   byte epsilon = 20;

From 3f2910b6eb7abbe21d0beb21182dbfc84938ec34 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 19 Jan 2023 11:47:55 -0800
Subject: [PATCH 05/76] WIP

---
 .../Api.Image/VideoFrameToTensorConverter.cpp    | 16 +++++-----------
 .../scenario/cppwinrt/scenariotestscppwinrt.cpp  |  5 +----
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
index c4490db394792..1215548d212c5 100644
--- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
+++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
@@ -328,11 +328,6 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor(
 
   // Validate Tensor Resource
   {
-    D3D12_HEAP_PROPERTIES outputHeapProperties;
-    D3D12_HEAP_FLAGS outputHeapFlags;
-
-    WINML_THROW_IF_FAILED(pOutputResource->GetHeapProperties(&outputHeapProperties, &outputHeapFlags));
-
     UINT64 ullNumElementsTensor = 1;
     for (UINT uiIdx = 0; uiIdx < kImageTensorDimensionCountMax; uiIdx++) {
       WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, tensorDesc.sizes[uiIdx], &ullNumElementsTensor));
@@ -347,8 +342,7 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor(
     if (outputDesc.Width < ullTensorSize ||
         outputDesc.Height != 1 ||
         outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER ||
-        !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS) ||
-        outputHeapProperties.Type != D3D12_HEAP_TYPE_DEFAULT) {
+        !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)) {
       WINML_THROW_IF_FAILED(E_INVALIDARG);
     }
   }
@@ -533,7 +527,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
   command_list_->ResourceBarrier(1, &barrier);
 
   command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx, upload_heap_.Get(), 0, bufferSize);
-  
+
   WINML_THROW_IF_FAILED(command_list_->Close());
   ID3D12CommandList* ppCommandLists[] = {command_list_.Get()};
   device_cache.GetCommandQueue()->ExecuteCommandLists(_countof(ppCommandLists), ppCommandLists);
@@ -570,9 +564,9 @@ void VideoFrameToTensorConverter::ConvertBuffersToBatchedGPUTensor(
       gpu_buffer_span);
 
   upload_heap_->Unmap(0, &CD3DX12_RANGE(0, buffer_size_in_bytes));
-  
+
   ResetCommandList(device_cache);
-  
+
   auto barrier1 = CD3DX12_RESOURCE_BARRIER::Transition(output_resource, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_DEST);
   command_list_->ResourceBarrier(1, &barrier1);
   command_list_->CopyBufferRegion(output_resource, 0, upload_heap_.Get(), 0, buffer_size_in_bytes);
@@ -692,4 +686,4 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor(
         inputBounds,
         reinterpret_cast<DirectX::PackedVector::HALF*>(pCPUTensor)));
   }
-}
\ No newline at end of file
+}
diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
index cb195acd33090..18e0c28ef4765 100644
--- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
+++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
@@ -1115,10 +1115,7 @@ static void MsftQuantizedModels() {
   std::wstring filePath = FileHelpers::GetModulePath() + L"coreml_Resnet50_ImageNet-dq.onnx";
   LearningModel model = LearningModel::LoadFromFilePath(filePath);
 
-  auto device = LearningModelDevice(LearningModelDeviceKind::DirectX);
-  device.as<IMetacommandsController>()->SetMetacommandsEnabled(false);
-
-  LearningModelSession session(model, device);
+  LearningModelSession session(model, LearningModelDevice(LearningModelDeviceKind::DirectX));
   // create a binding set
   LearningModelBinding binding(session);
   // bind the input and the output buffers by name

From 25bb52d7df70300c6c33d1bd735859ace3f944a9 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 19 Jan 2023 18:24:19 -0800
Subject: [PATCH 06/76] WIP

---
 .../src/BucketizedBufferAllocator.cpp         |   4 +-
 .../src/BucketizedBufferAllocator.h           |  27 +++-
 .../src/DmlBufferRegion.cc                    | 120 ------------------
 .../src/DmlBufferRegion.h                     |  82 ------------
 .../src/DmlCommandRecorder.cpp                |   7 +-
 .../src/DmlCommittedResourceWrapper.h         |  12 +-
 .../src/DmlHeapAllocator.cpp                  |  81 ------------
 .../src/DmlHeapAllocator.h                    |  39 ------
 .../src/DmlReservedResourceWrapper.h          |  11 +-
 .../src/DmlResourceWrapper.h                  |   9 +-
 .../src/DmlTaggedPointer.cpp                  |  33 -----
 .../src/DmlTaggedPointer.h                    |  34 -----
 .../src/ExecutionProvider.cpp                 |  65 +++-------
 13 files changed, 67 insertions(+), 457 deletions(-)
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 417d2639dad31..18c747079f183 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -127,7 +127,7 @@ namespace Dml
         }
 
         assert(resourceWrapper != nullptr);
-        assert(resourceWrapper->GetResourceInUavState()->GetDesc().Width == bucketSize);
+        assert(resourceWrapper->GetUavResource()->GetDesc().Width == bucketSize);
 
         ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
             this,
@@ -167,7 +167,7 @@ namespace Dml
 
         // Free the resource to the pool if its size matches a bucket size
         gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize());
-        if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResourceInUavState()->GetDesc().Width)
+        if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetUavResource()->GetDesc().Width)
         {
             if (gsl::narrow_cast<gsl::index>(m_pool.size()) <= bucketIndex)
             {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index 3d95bd029aad8..75025a4af0f8b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -46,19 +46,34 @@ namespace Dml
             return m_owner;
         }
 
-        ID3D12Resource* GetResourceInUavState() const
+        ID3D12Resource* GetUavResource() const
         {
-            return m_resourceWrapper->GetResourceInUavState();
+            return m_resourceWrapper->GetUavResource();
         }
 
-        ID3D12Resource* GetResourceInCopySrcState() const
+        ID3D12Resource* GetCopySrcResource() const
         {
-            return m_resourceWrapper->GetResourceInCopySrcState();
+            return m_resourceWrapper->GetCopySrcResource();
         }
 
-        ID3D12Resource* GetResourceInCopyDstState() const
+        ID3D12Resource* GetCopyDstResource() const
         {
-            return m_resourceWrapper->GetResourceInCopyDstState();
+            return m_resourceWrapper->GetCopyDstResource();
+        }
+
+        D3D12_RESOURCE_STATES GetDefaultUavState() const
+        {
+            return m_resourceWrapper->GetDefaultUavState();
+        }
+
+        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const
+        {
+            return m_resourceWrapper->GetDefaultCopySrcState();
+        }
+
+        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const
+        {
+            return m_resourceWrapper->GetDefaultCopyDstState();
         }
 
         ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc
deleted file mode 100644
index 8d6fbd0551083..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "precomp.h"
-#include "DmlBufferRegion.h"
-
-namespace Dml
-{
-
-D3D12BufferRegion::D3D12BufferRegion(
-    uint64_t offset,
-    uint64_t size_in_bytes,
-    ID3D12Resource* resource_uav_state,
-    ID3D12Resource* resource_copy_src_state,
-    ID3D12Resource* resource_copy_dst_state)
-    : resource_uav_state_(resource_uav_state),
-      resource_copy_src_state_(resource_copy_src_state),
-      resource_copy_dst_state_(resource_copy_dst_state),
-      offset_(offset),
-      size_in_bytes_(size_in_bytes)
-{
-    // Get a raw pointer to the first non-null resource passed in. At least one
-    // resource must be provided.
-    first_valid_resource_ = resource_uav_state_;
-    if (!first_valid_resource_)
-    {
-        first_valid_resource_ = resource_copy_src_state_;
-    }
-    if (!first_valid_resource_)
-    {
-        first_valid_resource_ = resource_copy_dst_state_;
-    }
-    ORT_THROW_HR_IF(E_UNEXPECTED, first_valid_resource_ == nullptr);
-
-    // Regions cannot be empty.
-    ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes_ == 0);
-
-    // Regions cannot extend beyond the size of the resource.
-    uint64_t buffer_size = first_valid_resource_->GetDesc().Width;
-    ORT_THROW_HR_IF(E_UNEXPECTED, offset_ >= buffer_size);
-    ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes_ > buffer_size - offset);
-
-    // All three resources, if provided, must be identical aside from state.
-    assert(
-        first_valid_resource_->GetDesc().Dimension ==
-        D3D12_RESOURCE_DIMENSION_BUFFER);
-    assert(
-        !resource_uav_state ||
-        (resource_uav_state->GetDesc().Dimension ==
-             D3D12_RESOURCE_DIMENSION_BUFFER &&
-         resource_uav_state->GetDesc().Width == buffer_size));
-    assert(
-        !resource_copy_src_state_ ||
-        (resource_copy_src_state_->GetDesc().Dimension ==
-             D3D12_RESOURCE_DIMENSION_BUFFER &&
-         resource_copy_src_state_->GetDesc().Width == buffer_size));
-    assert(
-        !resource_copy_dst_state_ ||
-        (resource_copy_dst_state_->GetDesc().Dimension ==
-             D3D12_RESOURCE_DIMENSION_BUFFER &&
-         resource_copy_dst_state_->GetDesc().Width == buffer_size));
-}
-
-D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that)
-{
-    std::swap(this->resource_uav_state_, that.resource_uav_state_);
-    std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
-    std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
-    std::swap(this->offset_, that.offset_);
-    std::swap(this->size_in_bytes_, that.size_in_bytes_);
-    std::swap(this->first_valid_resource_, that.first_valid_resource_);
-}
-
-D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that)
-{
-    std::swap(this->resource_uav_state_, that.resource_uav_state_);
-    std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
-    std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
-    std::swap(this->offset_, that.offset_);
-    std::swap(this->size_in_bytes_, that.size_in_bytes_);
-    std::swap(this->first_valid_resource_, that.first_valid_resource_);
-    return *this;
-}
-
-ID3D12Resource* D3D12BufferRegion::GetResourceInUavState() const
-{
-    return resource_uav_state_;
-}
-
-ID3D12Resource* D3D12BufferRegion::GetResourceInCopySrcState() const
-{
-    return resource_copy_src_state_;
-}
-
-ID3D12Resource* D3D12BufferRegion::GetResourceInCopyDstState() const
-{
-    return resource_copy_dst_state_;
-}
-
-uint64_t D3D12BufferRegion::Offset() const
-{
-    return first_valid_resource_ ? offset_ : 0;
-}
-
-uint64_t D3D12BufferRegion::SizeInBytes() const
-{
-    return first_valid_resource_ ? size_in_bytes_ : 0;
-}
-
-DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const
-{
-    if (!resource_uav_state_)
-    {
-        return DML_BUFFER_BINDING{};
-    }
-
-    return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_};
-}
-
-}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
deleted file mode 100644
index f8c1033261c56..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-namespace Dml
-{
-
-class D3D12HeapAllocator;
-
-// Represents a region of a D3D12 buffer resource. A buffer region has an
-// underlying ID3D12Resource* (of D3D12_RESOURCE_DIMENSION_BUFFER), an offset in
-// bytes from the beginning of that buffer, and a size in bytes of the region.
-class D3D12BufferRegion
-{
-  public:
-    D3D12BufferRegion() = default;
-
-    // References a region of a buffer. The respective ID3D12Resource objects
-    // must be in the appropriate states. Each resource is optional, but if more
-    // than one are provided they must map to the same region of memory.
-    D3D12BufferRegion(
-        uint64_t offset,
-        uint64_t size_in_bytes,
-        ID3D12Resource* resource_uav_state,
-        ID3D12Resource* resource_copy_src_state,
-        ID3D12Resource* resource_copy_dst_state);
-
-    // Move-only
-    D3D12BufferRegion(const D3D12BufferRegion&) = delete;
-    D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete;
-    D3D12BufferRegion(D3D12BufferRegion&&);
-    D3D12BufferRegion& operator=(D3D12BufferRegion&&);
-
-    ID3D12Resource* GetResourceInUavState() const;
-
-    // NOTE: may be any state that is valid as a copy source (COPY_SRC,
-    // GENERIC_READ, or COMMON).
-    ID3D12Resource* GetResourceInCopySrcState() const;
-
-    ID3D12Resource* GetResourceInCopyDstState() const;
-
-    uint64_t Offset() const;
-    uint64_t SizeInBytes() const;
-
-    DML_BUFFER_BINDING GetBufferBinding() const;
-
-    explicit operator bool() const { return first_valid_resource_ != nullptr; }
-
-    // Creates a subregion at an offset from the start of this region. If no
-    // size is provided the region runs to the end of the current region.
-    inline D3D12BufferRegion Subregion(
-        uint64_t offset,
-        uint64_t size_in_bytes = 0) const
-    {
-        // start of subregion must be within current region
-        ORT_THROW_HR_IF(E_UNEXPECTED, offset >= size_in_bytes_);
-        size_in_bytes =
-            size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
-        // end of subregion must be within current region
-        ORT_THROW_HR_IF(E_UNEXPECTED, size_in_bytes > size_in_bytes_ - offset);
-
-        return D3D12BufferRegion(
-            offset_ + offset,
-            size_in_bytes,
-            resource_uav_state_,
-            resource_copy_src_state_,
-            resource_copy_dst_state_);
-    }
-
-  private:
-    ID3D12Resource* resource_uav_state_ = nullptr;
-    ID3D12Resource* resource_copy_src_state_ = nullptr;
-    ID3D12Resource* resource_copy_dst_state_ = nullptr;
-    uint64_t offset_ = 0;
-    uint64_t size_in_bytes_ = 0;
-
-    // Pointer to the first resource above that isn't null.
-    ID3D12Resource* first_valid_resource_ = nullptr;
-};
-
-}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index d16c0201743db..7f2fdafbbeb60 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -62,12 +62,15 @@ void DmlCommandRecorder::InitializeOperator(
         // Allocate and immediately free a temporary buffer. The buffer resource will still be
         // alive (managed by the pool); freeing allows the resource to be shared with other operators.
         void* tempResourceHandle = allocator->Alloc(static_cast<size_t>(temporaryResourceSize), AllocatorRoundingMode::Enabled);
+
+
+
         if (!tempResourceHandle)
         {
             ORT_THROW_HR(E_OUTOFMEMORY);
         }
 
-        ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResourceInUavState();
+        ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetUavResource();
         allocator->Free(tempResourceHandle);
 
         // Bind the temporary resource.
@@ -145,7 +148,7 @@ void DmlCommandRecorder::ExecuteOperator(
             ORT_THROW_HR(E_OUTOFMEMORY);
         }
 
-        ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetResourceInUavState();
+        ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetUavResource();
         allocator->Free(tempResourceHandle);
 
         // Bind the temporary resource.
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
index e86ca4b52b4f2..f786cca837f06 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
@@ -9,9 +9,15 @@ namespace Dml
     {
     public:
         DmlCommittedResourceWrapper(ComPtr<ID3D12Resource>&& d3d12Resource) : m_d3d12Resource(std::move(d3d12Resource)) {}
-        ID3D12Resource* GetResourceInUavState() const final { return m_d3d12Resource.Get(); }
-        ID3D12Resource* GetResourceInCopySrcState() const final { return nullptr; }
-        ID3D12Resource* GetResourceInCopyDstState() const final { return nullptr; }
+
+        // Committed resources use the same resource for all states and use barriers to transition between states
+        ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); }
+        ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); }
+        ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); }
+
+        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
+        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
+        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
 
     private:
         ComPtr<ID3D12Resource> m_d3d12Resource;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
index 2ba44de85b2a8..bdda99ae6f91a 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
@@ -3,8 +3,6 @@
 
 #include "precomp.h"
 #include "DmlHeapAllocator.h"
-#include "DmlTaggedPointer.h"
-#include "DmlBufferRegion.h"
 #include "DmlReservedResourceWrapper.h"
 
 namespace Dml
@@ -245,83 +243,4 @@ Microsoft::WRL::ComPtr<DmlResourceWrapper> D3D12HeapAllocator::Alloc(size_t size
     return resourceWrapper;
 }
 
-void D3D12HeapAllocator::Free(void* ptr, uint64_t size_in_bytes)
-{
-    ORT_THROW_HR_IF(E_UNEXPECTED, ptr == nullptr);
-
-    TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr);
-    ORT_THROW_HR_IF(E_UNEXPECTED, tagged_ptr.offset != 0);
-
-    // We need to access (mutable) state after this point, so we need to lock
-    std::unique_lock<std::mutex> lock(mutex_);
-
-    auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
-
-    ORT_THROW_HR_IF(E_UNEXPECTED, it == allocations_by_id_.end());
-
-    ReleaseAllocationID(tagged_ptr.allocation_id);
-
-    // Frees the ID3D12Heap
-    allocations_by_id_.erase(it);
-}
-
-D3D12BufferRegion D3D12HeapAllocator::CreateBufferRegion(
-    const void* ptr,
-    uint64_t size_in_bytes)
-{
-    ORT_THROW_HR_IF(E_UNEXPECTED, ptr == nullptr);
-
-    TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr);
-
-    // We need to access (mutable) state after this point, so we need to lock
-    std::unique_lock<std::mutex> lock(mutex_);
-
-    // Find the allocation corresponding to this pointer
-    auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
-    ORT_THROW_HR_IF(E_UNEXPECTED, it == allocations_by_id_.end());
-
-    Allocation* allocation = &it->second;
-
-    return D3D12BufferRegion(
-        tagged_ptr.offset,
-        size_in_bytes,
-        allocation->resource_uav_state.Get(),
-        allocation->resource_copy_src_state.Get(),
-        allocation->resource_copy_dst_state.Get());
-}
-
-absl::optional<uint32_t> D3D12HeapAllocator::TryReserveAllocationID()
-{
-    // The mutex must already be held
-    assert(!mutex_.try_lock());
-
-    if (!free_allocation_ids_.empty())
-    {
-        // Return a free ID from the pool
-        uint32_t id = free_allocation_ids_.back();
-        free_allocation_ids_.pop_back();
-        return id;
-    }
-
-    static constexpr uint32_t kMaxAllocationID =
-        (1 << TaggedPointer::kAllocationIDBits) - 1;
-    if (current_allocation_id_ == kMaxAllocationID)
-    {
-        // We've reached the maximum number of allocations!
-        return absl::nullopt;
-    }
-
-    ++current_allocation_id_;
-    return current_allocation_id_;
-}
-
-void D3D12HeapAllocator::ReleaseAllocationID(uint32_t id)
-{
-    // The mutex must already be held
-    assert(!mutex_.try_lock());
-
-    // Add it to the pool of free IDs
-    free_allocation_ids_.push_back(id);
-}
-
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
index b15eeff3575fe..ad86107d6b05c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
@@ -3,9 +3,7 @@
 
 #pragma once
 
-#include "absl/container/flat_hash_map.h"
 #include "DmlSubAllocator.h"
-#include "DmlBufferRegion.h"
 
 namespace Dml
 {
@@ -63,12 +61,6 @@ class D3D12HeapAllocator : public DmlSubAllocator
     // local video memory fragmentation without requiring lots of heaps.
     static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512;
 
-    // The largest single allocation supported by this allocator. We use 4GB
-    // minus a MB to avoid edge cases in hw/drivers that aren't expecting such
-    // large allocations.
-    static constexpr uint64_t kDefaultMaxAllocationSizeInBytes =
-        (1ull << 32) - (1ull << 20);
-
     D3D12HeapAllocator(
         ID3D12Device* device,
         ID3D12CommandQueue* queue,
@@ -77,20 +69,8 @@ class D3D12HeapAllocator : public DmlSubAllocator
         D3D12_RESOURCE_FLAGS resource_flags,
         D3D12_RESOURCE_STATES initial_state);
 
-    // Creates a reserved or placed resource buffer over the given memory range.
-    // The physical D3D12 resource may be larger than the requested size, so
-    // callers must ensure to use the offset/size returned in the
-    // D3D12BufferRegion else risk out of bounds access. Note that in practice
-    // the ID3D12Resource is cached, so this call typically has a lower cost
-    // than a call to ID3D12Device::CreatePlacedResource or
-    // CreateReservedResource.
-    D3D12BufferRegion CreateBufferRegion(
-        const void* ptr,
-        uint64_t size_in_bytes);
-
     Microsoft::WRL::ComPtr<DmlResourceWrapper> Alloc(size_t size_in_bytes) final;
     uint64_t ComputeRequiredSize(size_t size) final;
-    void Free(void* ptr, uint64_t size_in_bytes);
     bool TilingEnabled() const { return tiling_enabled_; };
 
   private:
@@ -105,25 +85,6 @@ class D3D12HeapAllocator : public DmlSubAllocator
     bool tiling_enabled_;
     uint64_t max_heap_size_in_tiles_;
 
-    // The largest allocation ID we've returned so far (or 0 if we've never done
-    // so). Note that our allocation IDs start at 1 (not 0) to ensure that it
-    // isn't possible for a valid allocation to have a pointer value of
-    // 0x00000000.
-    uint32_t current_allocation_id_ = 0;
-
-    // A list of unused allocation IDs. This is for re-use of IDs once they get
-    // freed. We only bump the max_allocation_id_ once there are no more free
-    // IDs.
-    std::vector<uint32_t> free_allocation_ids_;
-
-    absl::flat_hash_map<uint32_t, Allocation> allocations_by_id_;
-
-    // Retrieves a free allocation ID, or nullopt if no more IDs are available.
-    absl::optional<uint32_t> TryReserveAllocationID();
-
-    // Releases an allocation ID back to the pool of IDs.
-    void ReleaseAllocationID(uint32_t id);
-
   private:
     absl::optional<Allocation> TryCreateTiledAllocation(uint64_t size_in_bytes);
     absl::optional<Allocation> TryCreateUntiledAllocation(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
index 9d52c4e8c0445..413ade92daf51 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
@@ -2,7 +2,6 @@
 // Licensed under the MIT License.
 
 #include "DmlResourceWrapper.h"
-#include "DmlBufferRegion.h"
 #include "DmlHeapAllocator.h"
 
 namespace Dml
@@ -11,9 +10,13 @@ namespace Dml
     {
     public:
         DmlReservedResourceWrapper(Allocation&& allocation) : m_allocation(std::move(allocation)) {}
-        ID3D12Resource* GetResourceInUavState() const final { return m_allocation.resource_uav_state.Get(); }
-        ID3D12Resource* GetResourceInCopySrcState() const final { return m_allocation.resource_copy_src_state.Get(); }
-        ID3D12Resource* GetResourceInCopyDstState() const final { return m_allocation.resource_copy_dst_state.Get(); }
+        ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); }
+        ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); }
+        ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); }
+
+        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
+        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; }
+        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; }
 
     private:
         Allocation m_allocation;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
index e600cee0589d0..03e9f762b7eb4 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
@@ -11,9 +11,12 @@ namespace Dml
     DmlResourceWrapper : public IUnknown
     {
     public:
-        virtual ID3D12Resource* GetResourceInUavState() const = 0;
-        virtual ID3D12Resource* GetResourceInCopySrcState() const = 0;
-        virtual ID3D12Resource* GetResourceInCopyDstState() const = 0;
+        virtual ID3D12Resource* GetUavResource() const = 0;
+        virtual ID3D12Resource* GetCopySrcResource() const = 0;
+        virtual ID3D12Resource* GetCopyDstResource() const = 0;
+        virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0;
+        virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0;
+        virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0;
         virtual ~DmlResourceWrapper(){}
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
deleted file mode 100644
index ba3f4cb85697e..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "precomp.h"
-#include "DmlTaggedPointer.h"
-#include <cassert>
-
-namespace Dml
-{
-/*static*/ TaggedPointer TaggedPointer::Unpack(const void* ptr)
-{
-    uint64_t ptr_val = reinterpret_cast<uint64_t>(ptr);
-
-    static constexpr uint64_t kAllocationIDMask =
-        (1ull << kAllocationIDBits) - 1;
-    static constexpr uint64_t kOffsetMask = (1ull << kOffsetBits) - 1;
-
-    TaggedPointer tagged_ptr;
-    tagged_ptr.allocation_id = (ptr_val >> kOffsetBits) & kAllocationIDMask;
-    tagged_ptr.offset = (ptr_val & kOffsetMask);
-
-    return tagged_ptr;
-}
-
-/*static*/ void* TaggedPointer::Pack(uint32_t allocation_id, uint64_t offset)
-{
-    assert(allocation_id < (1ull << kAllocationIDBits));
-    assert(offset < (1ull << kOffsetBits));
-    uint64_t ptr = ((uint64_t)allocation_id << kOffsetBits) | offset;
-
-    return reinterpret_cast<void*>(ptr);
-}
-}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
deleted file mode 100644
index a161007a138ea..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include <climits>
-#include <cstdint>
-
-namespace Dml
-{
-
-// D3D12HeapAllocator and D3D12DescriptorHeapAllocator encode the allocation ID
-// into the high bits of the pointers it returns, while the low bits are used as
-// an offset into the allocation. Note that since the layout of bitfields is
-// implementation-defined, you can't just cast a void* into a TaggedPointer: it
-// must be done using masks and shifts.
-struct TaggedPointer
-{
-    static constexpr uint64_t kAllocationIDBits = 24;
-    static constexpr uint64_t kOffsetBits = 40;
-
-    uint64_t allocation_id : kAllocationIDBits;
-    uint64_t offset : kOffsetBits;
-
-    static void* Pack(uint32_t allocation_id, uint64_t offset);
-    static TaggedPointer Unpack(const void* ptr);
-};
-
-static_assert(
-    sizeof(TaggedPointer) == sizeof(void*),
-    "DML requires a 64-bit architecture");
-static_assert(TaggedPointer::kAllocationIDBits + TaggedPointer::kOffsetBits == sizeof(void*) * CHAR_BIT,
-    "DML requires a 64-bit architecture");
-}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 6dc6f046727ab..fddd3267d9770 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -123,7 +123,7 @@ namespace Dml
 
         const auto* allocInfo = m_allocator->DecodeDataHandle(allocation.Get());
 
-        ComPtr<ID3D12Resource> resource = allocInfo->GetResourceInUavState();
+        ComPtr<ID3D12Resource> resource = allocInfo->GetUavResource();
         resource.CopyTo(d3dResource);
         *pooledResource = allocation.Detach();
         return S_OK;
@@ -136,7 +136,7 @@ namespace Dml
         ORT_TRY
         {
             const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(allocation);
-            return allocInfo->GetResourceInUavState();
+            return allocInfo->GetUavResource();
         }
         ORT_CATCH_GENERIC
         {
@@ -342,7 +342,7 @@ namespace Dml
                 {
                     assert(tensor->IsDataInterface());
                     const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(tensor).GetDataInterface().Get());
-                    ID3D12Resource* resource = allocInfo->GetResourceInUavState();
+                    ID3D12Resource* resource = allocInfo->GetUavResource();
                     D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc();
                     bufferBindings.push_back({ resource, 0, resourceDesc.Width });
                     bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() });
@@ -433,15 +433,8 @@ namespace Dml
             //
             const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get());
 
-            ID3D12Resource* dstData = dstAllocInfo->GetResourceInCopyDstState() == nullptr
-                ? dstAllocInfo->GetResourceInUavState()
-                : dstAllocInfo->GetResourceInCopyDstState();
-
-            // When resources in dst state exist (e.g. reserved resources), we can avoid barriers. Otherwise,
-            // take the slower path of adding a barrier (e.g. committed resources).
-            const auto dstState = dstAllocInfo->GetResourceInCopyDstState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_DEST;
+            ID3D12Resource* dstData = dstAllocInfo->GetCopyDstResource();
+            const auto dstState = dstAllocInfo->GetDefaultCopyDstState();
 
             const void* srcData = src->GetData();
 
@@ -457,15 +450,8 @@ namespace Dml
             void* dstData = dst->GetData();
             const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get());
 
-            ID3D12Resource* srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr
-                ? srcAllocInfo->GetResourceInUavState()
-                : srcAllocInfo->GetResourceInCopySrcState();
-
-            // When resources in src state exist (e.g. reserved resources), we can avoid barriers. Otherwise,
-            // take the slower path of adding a barrier (e.g. committed resources).
-            const auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            ID3D12Resource* srcData = srcAllocInfo->GetCopySrcResource();
+            const auto srcState = srcAllocInfo->GetDefaultCopySrcState();
 
             const uint64_t srcOffset = 0;
 
@@ -480,23 +466,11 @@ namespace Dml
             const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get());
             const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get());
 
-            ID3D12Resource* srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr
-                ? srcAllocInfo->GetResourceInUavState()
-                : srcAllocInfo->GetResourceInCopySrcState();
-
-            ID3D12Resource* dstData = dstAllocInfo->GetResourceInCopyDstState() == nullptr
-                ? dstAllocInfo->GetResourceInUavState()
-                : dstAllocInfo->GetResourceInCopyDstState();
+            ID3D12Resource* srcData = srcAllocInfo->GetCopySrcResource();
+            const auto srcState = srcAllocInfo->GetDefaultCopySrcState();
 
-            // When resources in src and dst state exist (e.g. reserved resources), we can avoid barriers. Otherwise,
-            // take the slower path of adding a barrier (e.g. committed resources).
-            const auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
-
-            const auto dstState = dstAllocInfo->GetResourceInCopyDstState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_DEST;
+            ID3D12Resource* dstData = dstAllocInfo->GetCopyDstResource();
+            const auto dstState = dstAllocInfo->GetDefaultCopyDstState();
 
             m_context->CopyBufferRegion(dstData, 0, dstState, srcData, 0, srcState, dataSizeInBytes);
         }
@@ -522,7 +496,7 @@ namespace Dml
         if (mlTensor != nullptr)
         {
             const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(mlTensor.Get());
-            ID3D12Resource* dstData = dstAllocInfo->GetResourceInUavState();
+            ID3D12Resource* dstData = dstAllocInfo->GetUavResource();
             m_context->FillBufferWithPattern(dstData, rawValue);
         }
 
@@ -818,13 +792,8 @@ namespace Dml
             dstDatas.push_back(dstWrapper.GetData());
             const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(&srcWrapper).GetDataInterface().Get());
 
-            auto srcData = srcAllocInfo->GetResourceInCopySrcState() == nullptr
-                ? srcAllocInfo->GetResourceInUavState()
-                : srcAllocInfo->GetResourceInCopySrcState();
-
-            auto srcState = srcAllocInfo->GetResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            auto srcData = srcAllocInfo->GetCopySrcResource();
+            auto srcState = srcAllocInfo->GetDefaultCopySrcState();
 
             srcDatas.push_back(srcData);
             srcStates.push_back(srcState);
@@ -886,10 +855,10 @@ namespace Dml
         else
         {
 #ifdef _GAMING_XBOX
-            ComPtr<GraphicsUnknownWrapper> wrappedResource = Microsoft::WRL::Make<GraphicsUnknownWrapper>(m_allocator->DecodeDataHandle(data)->GetResourceInUavState());
+            ComPtr<GraphicsUnknownWrapper> wrappedResource = Microsoft::WRL::Make<GraphicsUnknownWrapper>(m_allocator->DecodeDataHandle(data)->GetUavResource());
             *abiData = wrappedResource.Detach();
 #else
-            ComPtr<ID3D12Resource> resource = m_allocator->DecodeDataHandle(data)->GetResourceInUavState();
+            ComPtr<ID3D12Resource> resource = m_allocator->DecodeDataHandle(data)->GetUavResource();
             *abiData = resource.Detach();
 #endif
         }
@@ -1026,7 +995,7 @@ namespace Dml
     ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr)
     {
         Dml::BucketizedBufferAllocator* pAllocationInfo = static_cast<Dml::BucketizedBufferAllocator*>(allocator);
-        return pAllocationInfo->DecodeDataHandle(ptr)->GetResourceInUavState();
+        return pAllocationInfo->DecodeDataHandle(ptr)->GetUavResource();
     }
 
     void FlushContext(onnxruntime::IExecutionProvider* provider)

From 92f51a33835b1f2654f2f8294b9979ef2a39c28c Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Mon, 23 Jan 2023 10:32:14 -0800
Subject: [PATCH 07/76] Remove sub allocator

---
 .../src/BucketizedBufferAllocator.cpp           |  4 ++--
 .../src/BucketizedBufferAllocator.h             |  6 +++---
 .../DmlExecutionProvider/src/DmlHeapAllocator.h |  8 ++++----
 .../DmlExecutionProvider/src/DmlSubAllocator.h  | 17 -----------------
 4 files changed, 9 insertions(+), 26 deletions(-)
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 18c747079f183..79a195529679d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -6,7 +6,7 @@
 #include "core/session/onnxruntime_c_api.h"
 
 #include "BucketizedBufferAllocator.h"
-#include "DmlSubAllocator.h"
+#include "DmlHeapAllocator.h"
 // #define PRINT_OUTSTANDING_ALLOCATIONS
 
 namespace Dml
@@ -37,7 +37,7 @@ namespace Dml
     BucketizedBufferAllocator::BucketizedBufferAllocator(
         ID3D12Device* device,
         std::shared_ptr<ExecutionContext> context,
-        std::unique_ptr<DmlSubAllocator>&& subAllocator
+        std::unique_ptr<D3D12HeapAllocator>&& subAllocator
         )
         : onnxruntime::IAllocator(
             OrtMemoryInfo(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index 75025a4af0f8b..254631652cc47 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -9,7 +9,7 @@
 
 namespace Dml
 {
-    class DmlSubAllocator;
+    class D3D12HeapAllocator;
 
     class CPUAllocator : public onnxruntime::IAllocator
     {
@@ -120,7 +120,7 @@ namespace Dml
         BucketizedBufferAllocator(
             ID3D12Device* device,
             std::shared_ptr<ExecutionContext> context,
-            std::unique_ptr<DmlSubAllocator>&& subAllocator);
+            std::unique_ptr<D3D12HeapAllocator>&& subAllocator);
 
         // Returns the information associated with an opaque allocation handle returned by IAllocator::Alloc.
         const AllocationInfo* DecodeDataHandle(const void* opaqueHandle);
@@ -168,7 +168,7 @@ namespace Dml
         uint64_t m_currentResourceId = 0;
         AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled;
         std::shared_ptr<ExecutionContext> m_context;
-        std::unique_ptr<DmlSubAllocator> m_subAllocator;
+        std::unique_ptr<D3D12HeapAllocator> m_subAllocator;
 
     #if _DEBUG
         // Useful for debugging; keeps track of all allocations that haven't been freed yet
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
index ad86107d6b05c..6e13ad71f5877 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "DmlSubAllocator.h"
+#include "DmlResourceWrapper.h"
 
 namespace Dml
 {
@@ -53,7 +53,7 @@ struct Allocation
 // this case it is better make more but smaller allocations (resulting in
 // smaller heaps); this fallback path is only retained as a last resort for
 // older hardware.
-class D3D12HeapAllocator : public DmlSubAllocator
+class D3D12HeapAllocator
 {
   public:
     // Maximum size of a heap (in tiles) when allocations are tiled. Each tile
@@ -69,8 +69,8 @@ class D3D12HeapAllocator : public DmlSubAllocator
         D3D12_RESOURCE_FLAGS resource_flags,
         D3D12_RESOURCE_STATES initial_state);
 
-    Microsoft::WRL::ComPtr<DmlResourceWrapper> Alloc(size_t size_in_bytes) final;
-    uint64_t ComputeRequiredSize(size_t size) final;
+    Microsoft::WRL::ComPtr<DmlResourceWrapper> Alloc(size_t size_in_bytes);
+    uint64_t ComputeRequiredSize(size_t size);
     bool TilingEnabled() const { return tiling_enabled_; };
 
   private:
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h
deleted file mode 100644
index 033fb15388066..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h
+++ /dev/null
@@ -1,17 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-namespace Dml
-{
-    struct DmlResourceWrapper;
-
-    class DmlSubAllocator
-    {
-    public:
-        virtual Microsoft::WRL::ComPtr<DmlResourceWrapper> Alloc(size_t size) = 0;
-        virtual uint64_t ComputeRequiredSize(size_t size) = 0;
-        virtual ~DmlSubAllocator(){}
-    };
-}

From c0cbcaeb687100a2e5008233ea45416a0e8ae358 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 24 Jan 2023 00:00:36 -0800
Subject: [PATCH 08/76] WIP

---
 .../inc/DmlExecutionProvider.h                |   6 +-
 .../inc/IWinmlExecutionProvider.h             |  25 +-
 .../src/BucketizedBufferAllocator.cpp         | 452 +++++++++++++-----
 .../src/BucketizedBufferAllocator.h           | 213 ++++-----
 .../src/DmlAllocationInfo.cpp                 |  19 +
 .../src/DmlAllocationInfo.h                   |  80 ++++
 .../src/DmlBfcAllocator.h                     |  29 ++
 .../src/DmlBufferRegion.cc                    | 120 +++++
 .../src/DmlBufferRegion.h                     |  79 +++
 .../src/DmlCommandRecorder.cpp                |  38 +-
 .../src/DmlCommandRecorder.h                  |  16 +-
 .../src/DmlCpuAllocator.cpp                   |  38 ++
 .../src/DmlCpuAllocator.h                     |  20 +
 .../src/DmlGpuAllocator.h                     |  39 ++
 .../src/DmlGraphFusionHelper.cpp              |   6 +-
 .../src/DmlHeapAllocation.h                   |  29 ++
 .../src/DmlHeapAllocator.cpp                  | 246 ----------
 .../src/DmlHeapAllocator.h                    |  96 ----
 .../src/DmlManagedBufferRegion.h              |  26 +
 .../src/DmlReservedResourceWrapper.h          |  13 +-
 .../src/DmlTaggedPointer.cpp                  |  41 ++
 .../src/DmlTaggedPointer.h                    |  43 ++
 .../src/ExecutionContext.cpp                  |   5 +-
 .../src/ExecutionContext.h                    |  13 +-
 .../src/ExecutionProvider.cpp                 | 184 +++----
 .../src/ExecutionProvider.h                   |  33 +-
 .../src/FusedGraphKernel.cpp                  |  22 +-
 .../src/IExecutionProvider.h                  |   8 +-
 .../src/MLOperatorAuthorImpl.cpp              |  22 +-
 .../src/Operators/DmlOperator.cpp             |  31 +-
 .../DmlExecutionProvider/src/ReadbackHeap.cpp |   3 +-
 .../DmlExecutionProvider/src/ReadbackHeap.h   |   1 +
 .../MLOperatorAuthorHelper.h                  |   4 -
 33 files changed, 1222 insertions(+), 778 deletions(-)
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.cpp
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
index 9dfbd0e7ea0e0..fe07ccf08899e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
@@ -31,12 +31,12 @@ namespace Dml
         bool enableMetacommands = true);
 
     ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr);
-    void FlushContext(onnxruntime::IExecutionProvider* provider);    
+    void FlushContext(onnxruntime::IExecutionProvider* provider);
     void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode);
     void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);
-    
+
     onnxruntime::common::Status CopyTensor(
-        onnxruntime::IExecutionProvider* provider, 
+        onnxruntime::IExecutionProvider* provider,
         const onnxruntime::Tensor& src, onnxruntime::Tensor& dst
     );
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
index 501a66bdfa711..52f5a104b0379 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
@@ -22,6 +22,11 @@ namespace onnxruntime
     class Node;
 }
 
+namespace Dml
+{
+    class DmlManagedBufferRegion;
+}
+
 namespace Windows::AI::MachineLearning::Adapter
 {
     interface __declspec(uuid("5b19a18a-5ed5-4df2-a363-21b89380a698"))
@@ -29,7 +34,7 @@ namespace Windows::AI::MachineLearning::Adapter
     {
     public:
         // Hold a reference to an object until preceding work in the queue is complete.  This
-        // only needs to be handled by providers which hide the asynchronous nature of 
+        // only needs to be handled by providers which hide the asynchronous nature of
         // computation, and involve resoures which cannot be automatically by work in the
         // the provider's underlying queues.
         virtual void QueueReference(IUnknown *object) = 0;
@@ -40,12 +45,16 @@ namespace Windows::AI::MachineLearning::Adapter
             IUnknown** dataCopy) const = 0;
 
         virtual void GetABIDataInterface(
-            bool isInternalOperator,
-            IUnknown* data,
+            void* data,
             IUnknown** abiData) const = 0;
-        
+
+        virtual void GetManagedBufferRegion(
+            void* data,
+            uint64_t size,
+            Dml::DmlManagedBufferRegion** abiData) const = 0;
+
         virtual uint64_t TryGetPooledAllocationId(
-            IUnknown* data,
+            void* data,
             bool isInternalOperator) = 0;
 
         virtual void GetABIExecutionInterfaceAndInvalidateState(
@@ -63,7 +72,7 @@ namespace Windows::AI::MachineLearning::Adapter
             uint32_t resourceCount,
             IUnknown** resources) = 0;
 
-        // Waits for flushed work, discards unflushed work, and discards associated references to 
+        // Waits for flushed work, discards unflushed work, and discards associated references to
         // prevent circular references.  Must be the last call on the object before destruction.
         virtual void Close() = 0;
     };
@@ -89,7 +98,7 @@ namespace Windows::AI::MachineLearning::Adapter
     };
 
     using GraphNodeFactory = std::function<void(
-        const onnxruntime::Node& node, 
+        const onnxruntime::Node& node,
         MLOperatorTensorGetter& constantInputGetter,
         const void* executionHandle,
         /*out*/ DmlGraphNodeCreateInfo* graphNodeCreateInfo
@@ -111,4 +120,4 @@ namespace Windows::AI::MachineLearning::Adapter
     };
 
     using InternalRegistrationInfoMap = std::unordered_map<onnxruntime::KernelDef*, std::shared_ptr<InternalRegistrationInfo>>;
-}
\ No newline at end of file
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 79a195529679d..df12c1567d5be 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -4,21 +4,13 @@
 #include "precomp.h"
 
 #include "core/session/onnxruntime_c_api.h"
-
 #include "BucketizedBufferAllocator.h"
-#include "DmlHeapAllocator.h"
-// #define PRINT_OUTSTANDING_ALLOCATIONS
+#include "DmlReservedResourceWrapper.h"
+#include "DmlBufferRegion.h"
+#include "DmlManagedBufferRegion.h"
 
 namespace Dml
 {
-    AllocationInfo::~AllocationInfo()
-    {
-        if (m_owner)
-        {
-            m_owner->FreeResource(this, m_pooledResourceId);
-        }
-    }
-
     BucketizedBufferAllocator::~BucketizedBufferAllocator()
     {
 #ifdef PRINT_OUTSTANDING_ALLOCATIONS
@@ -34,24 +26,6 @@ namespace Dml
 #endif
     }
 
-    BucketizedBufferAllocator::BucketizedBufferAllocator(
-        ID3D12Device* device,
-        std::shared_ptr<ExecutionContext> context,
-        std::unique_ptr<D3D12HeapAllocator>&& subAllocator
-        )
-        : onnxruntime::IAllocator(
-            OrtMemoryInfo(
-                "DML",
-                OrtAllocatorType::OrtDeviceAllocator,
-                OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)
-            )
-        ),
-        m_device(device),
-        m_context(context),
-        m_subAllocator(std::move(subAllocator))
-    {
-    }
-
     /*static*/ gsl::index BucketizedBufferAllocator::GetBucketIndexFromSize(uint64_t size)
     {
         assert(size != 0);
@@ -72,88 +46,287 @@ namespace Dml
         return (1ull << (index + c_minResourceSizeExponent));
     }
 
-    void* BucketizedBufferAllocator::Alloc(size_t size)
+    void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
     {
-        return Alloc(size, m_defaultRoundingMode);
+        m_defaultRoundingMode = roundingMode;
     }
 
-    void* BucketizedBufferAllocator::Alloc(size_t size, AllocatorRoundingMode roundingMode)
+    static bool GetTilingEnabled(ID3D12Device* device)
     {
-        // For some reason lotus likes requesting 0 bytes of memory
-        size = std::max<size_t>(1, size);
+        D3D12_FEATURE_DATA_D3D12_OPTIONS options = {};
+        if (SUCCEEDED(device->CheckFeatureSupport(
+                D3D12_FEATURE_D3D12_OPTIONS,
+                &options,
+                sizeof(options))))
+        {
+            return options.TiledResourcesTier >= D3D12_TILED_RESOURCES_TIER_1;
+        }
 
-        ComPtr<DmlResourceWrapper> resourceWrapper;
-        uint64_t resourceId = 0;
+        return false;
+    }
 
-        // Find the bucket for this allocation size
-        gsl::index bucketIndex = GetBucketIndexFromSize(size);
+    static uint64_t GetMaxHeapSizeInTiles()
+    {
+        return BucketizedBufferAllocator::kDefaultMaxHeapSizeInTiles;
+    }
 
-        // Some sub allocators have their own rounding mechanisms or alignment requirements of resources
-        uint64_t bucketSize = m_subAllocator->ComputeRequiredSize(GetBucketSizeFromIndex(bucketIndex));
+    BucketizedBufferAllocator::BucketizedBufferAllocator(
+        ID3D12Device* device,
+        ID3D12CommandQueue* queue,
+        const D3D12_HEAP_PROPERTIES& heap_props,
+        D3D12_HEAP_FLAGS heap_flags,
+        D3D12_RESOURCE_FLAGS resource_flags,
+        D3D12_RESOURCE_STATES initial_state)
+        : device_(device),
+        queue_(queue),
+        heap_properties_(heap_props),
+        heap_flags_(heap_flags),
+        resource_flags_(resource_flags),
+        initial_state_(initial_state),
+        tiling_enabled_(GetTilingEnabled(device)),
+        max_heap_size_in_tiles_(GetMaxHeapSizeInTiles())
+    {
+    }
 
-        // Use a pooled resource if the size (post rounding, if requested) matches a bucket size
-        if (m_defaultRoundingMode == AllocatorRoundingMode::Enabled || size == bucketSize)
+    absl::optional<DmlHeapAllocation> BucketizedBufferAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes)
+    {
+        DmlHeapAllocation allocation = {};
+
+        // The allocation may be larger than the requested size to ensure a whole
+        // number of tiles.
+        const uint64_t resource_size_in_tiles =
+            1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+        const uint64_t resource_size_in_bytes =
+            resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+        auto resource_desc =
+            CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_);
+
+        ID3D12Resource** resources[] = {
+            &allocation.resource_uav_state,
+            &allocation.resource_copy_src_state,
+            &allocation.resource_copy_dst_state};
+
+        D3D12_RESOURCE_STATES states[] = {
+            initial_state_,
+            D3D12_RESOURCE_STATE_COPY_SOURCE,
+            D3D12_RESOURCE_STATE_COPY_DEST};
+
+        for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++)
         {
-            Bucket* bucket = nullptr;
+            HRESULT create_resource_hr = device_->CreateReservedResource(
+                &resource_desc,
+                states[i],
+                nullptr,
+                IID_PPV_ARGS(resources[i]));
 
-            if (gsl::narrow_cast<gsl::index>(m_pool.size()) <= bucketIndex)
+            if (create_resource_hr == E_OUTOFMEMORY)
             {
-                // Ensure there are sufficient buckets
-                m_pool.resize(bucketIndex + 1);
+                return absl::nullopt;
             }
+            ORT_THROW_IF_FAILED(create_resource_hr);
+        }
+
+        // Reserve enough heaps to store all tiles in the resource.
+        const uint64_t heap_count =
+            1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_;
+        allocation.heaps.resize(heap_count);
 
-            bucket = &m_pool[bucketIndex];
+        // Create heaps and map them to the primary reserved resource.
+        D3D12_TILED_RESOURCE_COORDINATE resource_region_start_coordinates = {};
+        uint64_t unmapped_resource_tiles = resource_size_in_tiles;
+        for (uint64_t i = 0; i < heap_count; i++)
+        {
+            // Create heap. The last heap of the allocation may have fewer tiles to
+            // avoid wasting space.
+            uint64_t heap_size_in_tiles = std::min<uint64_t>(
+                unmapped_resource_tiles,
+                max_heap_size_in_tiles_);
+            uint64_t heap_size_in_bytes =
+                heap_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+            auto heap_desc = CD3DX12_HEAP_DESC(
+                heap_size_in_bytes,
+                heap_properties_,
+                0,
+                heap_flags_);
 
-            if (bucket->resources.empty())
+            HRESULT create_heap_hr =
+                device_->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i]));
+            if (create_heap_hr == E_OUTOFMEMORY)
             {
-                // No more resources in this bucket - allocate a new one
-                resourceWrapper = m_subAllocator->Alloc(bucketSize);
-                resourceId = ++m_currentResourceId;
+                return absl::nullopt;
             }
-            else
+            ORT_THROW_IF_FAILED(create_heap_hr);
+
+            // Source region in the resource to map.
+            D3D12_TILE_REGION_SIZE resource_region_size = {};
+            resource_region_size.NumTiles = static_cast<uint32_t>(heap_size_in_tiles);
+
+            // Target range in the current heap to map.
+            const D3D12_TILE_RANGE_FLAGS tile_range_flags =
+                D3D12_TILE_RANGE_FLAG_NONE;
+            const uint32_t heap_range_start_offset = 0;
+            const uint32_t heap_range_tile_count = static_cast<uint32_t>(heap_size_in_tiles);
+
+            constexpr uint32_t numResourceRegions = 1;
+            constexpr uint32_t numHeapRanges = 1;
+
+            // This is a brand new allocation/resource, so the tile mappings are
+            // guaranteed to be set (on the GPU timeline) by the time any code can
+            // reference the returned resource. We only execute operations on a
+            // single hardware queue so there is no need to wait or signal.
+            //
+            // All resources have identical tile mappings. The repeated call to
+            // UpdateTileMappings on all resources instead of using CopyTileMappings
+            // is intentional: the latter API is not supported by all versions of
+            // PIX.
+            for (auto resource :
+                {allocation.resource_uav_state.Get(),
+                allocation.resource_copy_src_state.Get(),
+                allocation.resource_copy_dst_state.Get()})
             {
-                // Retrieve a resource from the bucket
-                resourceWrapper = std::move(bucket->resources.back().resource);
-                resourceId = bucket->resources.back().resourceId;
-                bucket->resources.pop_back();
+                queue_->UpdateTileMappings(
+                    resource,
+                    numResourceRegions,
+                    &resource_region_start_coordinates,
+                    &resource_region_size,
+                    allocation.heaps[i].Get(),
+                    numHeapRanges,
+                    &tile_range_flags,
+                    &heap_range_start_offset,
+                    &heap_range_tile_count,
+                    D3D12_TILE_MAPPING_FLAG_NONE);
             }
+
+            resource_region_start_coordinates.X += static_cast<uint32_t>(heap_size_in_tiles);
+            unmapped_resource_tiles -= heap_size_in_tiles;
         }
-        else
+
+        assert(unmapped_resource_tiles == 0);
+
+        return allocation;
+    }
+
+    absl::optional<DmlHeapAllocation> BucketizedBufferAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes)
+    {
+        DmlHeapAllocation allocation = {};
+
+        // Create the allocation's sole heap, sized to exactly size_in_bytes (no tile rounding here).
+        allocation.heaps.resize(1);
+        D3D12_HEAP_DESC heap_desc =
+            CD3DX12_HEAP_DESC(size_in_bytes, heap_properties_, 0, heap_flags_);
+        HRESULT create_heap_hr = device_->CreateHeap(
+            &heap_desc,
+            IID_PPV_ARGS(&allocation.heaps.front()));
+        if (create_heap_hr == E_OUTOFMEMORY)
         {
-            // The allocation will not be pooled.  Construct a new one
-            bucketSize = m_subAllocator->ComputeRequiredSize(size);
-            resourceWrapper = m_subAllocator->Alloc(bucketSize);
-            resourceId = ++m_currentResourceId;
+            return absl::nullopt;
         }
+        ORT_THROW_IF_FAILED(create_heap_hr);
 
-        assert(resourceWrapper != nullptr);
-        assert(resourceWrapper->GetUavResource()->GetDesc().Width == bucketSize);
+        // Create large placed resource that spans the heap.
+        D3D12_RESOURCE_DESC resource_desc =
+            CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_);
+
+        ID3D12Resource** resources[] = {
+            &allocation.resource_uav_state,
+            &allocation.resource_copy_src_state,
+            &allocation.resource_copy_dst_state};
+        D3D12_RESOURCE_STATES states[] = {
+            initial_state_,
+            D3D12_RESOURCE_STATE_COPY_SOURCE,
+            D3D12_RESOURCE_STATE_COPY_DEST};
+
+        for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++)
+        {
+            HRESULT create_resource_hr = device_->CreatePlacedResource(
+                allocation.heaps.front().Get(),
+                0,
+                &resource_desc,
+                states[i],
+                nullptr,
+                IID_PPV_ARGS(resources[i]));
+            if (create_resource_hr == E_OUTOFMEMORY)
+            {
+                return absl::nullopt;
+            }
+            ORT_THROW_IF_FAILED(create_resource_hr);
+        }
+
+        return allocation;
+    }
+
+    uint64_t BucketizedBufferAllocator::ComputeRequiredSize(size_t size)
+    {
+        const uint64_t resource_size_in_tiles =
+            1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+        const uint64_t resource_size_in_bytes =
+            resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+
+        return resource_size_in_bytes;
+    }
+
+    void* BucketizedBufferAllocator::Alloc(size_t size_in_bytes)
+    {
+        // For some reason lotus likes requesting 0 bytes of memory
+        size_in_bytes = std::max<size_t>(1, size_in_bytes);
+
+        // The D3D12 device is thread-safe so the lock isn't needed while creating the
+        // allocation. The helpers return nullopt only when D3D12 reports E_OUTOFMEMORY.
+        absl::optional<DmlHeapAllocation> allocation =
+            tiling_enabled_ ? TryCreateTiledAllocation(size_in_bytes)
+                            : TryCreateUntiledAllocation(size_in_bytes);
+
+        ORT_THROW_HR_IF(E_OUTOFMEMORY, !allocation);
 
+        // We need to access (mutable) state after this point, so we need to lock
+        std::unique_lock<std::mutex> lock(mutex_);
+
+        absl::optional<uint32_t> allocationId = TryReserveAllocationID();
+        ORT_THROW_HR_IF(E_INVALIDARG, !allocationId);
+
+        auto resourceWrapper = wil::MakeOrThrow<DmlReservedResourceWrapper>(std::move(*allocation));
         ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
             this,
             ++m_currentAllocationId,
-            resourceId,
+            ++m_currentResourceId,
             resourceWrapper.Get(),
-            size
+            size_in_bytes
         );
 
+        allocations_by_id_.emplace(*allocationId, allocInfo);
+        // The debug bookkeeping below is shared state too, so keep holding the lock
     #if _DEBUG
         m_outstandingAllocationsById[allocInfo->GetId()] = allocInfo.Get();
     #endif
+
+        lock.unlock();
 
-        return allocInfo.Detach();
+        // DML only has a single device in ORT at the moment
+        const uint64_t device_id = 0;
+        const uint64_t offset = 0;
+        return TaggedPointer::Pack(device_id, *allocationId, offset);
     }
 
-    void BucketizedBufferAllocator::Free(void* p)
+    void BucketizedBufferAllocator::Free(void* ptr)
     {
-        // Release Lotus's reference on the allocation.  The allocation
-        // also inherits IUnknown, and once its final reference reaches zero
-        // it will call FreeResource
-        ComPtr<AllocationInfo> allocInfo;
-        allocInfo.Attach(static_cast<AllocationInfo*>(p));
+        ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr);
+
+        TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr);
+        ORT_THROW_HR_IF(E_INVALIDARG, tagged_ptr.offset != 0);
+
+        // We need to access (mutable) state after this point, so we need to lock
+        std::unique_lock<std::mutex> lock(mutex_);
+
+        auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
+        ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end());
+
+        ReleaseAllocationID(tagged_ptr.allocation_id);
+
+        // Frees the ID3D12Heap
+        allocations_by_id_.erase(it);
     }
 
-    void BucketizedBufferAllocator::FreeResource(void* p, uint64_t pooledResourceId)
+   void BucketizedBufferAllocator::FreeResource(void* p, uint64_t pooledResourceId)
     {
         AllocationInfo *allocInfo = static_cast<AllocationInfo*>(p);
 
@@ -165,31 +338,12 @@ namespace Dml
             ORT_THROW_HR(E_INVALIDARG);
         }
 
-        // Free the resource to the pool if its size matches a bucket size
-        gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize());
-        if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetUavResource()->GetDesc().Width)
-        {
-            if (gsl::narrow_cast<gsl::index>(m_pool.size()) <= bucketIndex)
-            {
-                // Ensure there are sufficient buckets
-                m_pool.resize(bucketIndex + 1);
-            }
-
-            // Return the resource to the bucket
-            Bucket* bucket = &m_pool[bucketIndex];
-
-            Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId};
-            bucket->resources.push_back(resource);
-        }
-        else
-        {
-            // Free the underlying allocation once queued work has completed.
+        // Free the underlying allocation once queued work has completed.
 #ifdef _GAMING_XBOX
-            m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->DetachResourceWrapper().Get()).Get());
+        m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->DetachResourceWrapper().Get()).Get());
 #else
-            m_context->QueueReference(allocInfo->DetachResourceWrapper().Get());
+        m_context->QueueReference(allocInfo->DetachResourceWrapper().Get());
 #endif
-        }
 
     #if _DEBUG
         assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo);
@@ -199,58 +353,100 @@ namespace Dml
         // The allocation info is already destructing at this point
     }
 
-
-    const AllocationInfo* BucketizedBufferAllocator::DecodeDataHandle(const void* opaqueHandle)
+    absl::optional<uint32_t> BucketizedBufferAllocator::TryReserveAllocationID()
     {
-        if (opaqueHandle == nullptr)
+        // The mutex must already be held
+        assert(!mutex_.try_lock());
+
+        if (!free_allocation_ids_.empty())
         {
-            // There is no memory allocated which needs to be decoded.
-            ORT_THROW_HR(E_INVALIDARG);
+            // Return a free ID from the pool
+            uint32_t id = free_allocation_ids_.back();
+            free_allocation_ids_.pop_back();
+            return id;
         }
-        const auto* allocInfo = static_cast<const AllocationInfo*>(opaqueHandle);
 
-        auto owner = allocInfo->GetOwner();
-        //The owner can be null if the resource was wrapped via CreateGPUAllocationFromD3DResource
-        if (owner != nullptr && owner != this)
+        static constexpr uint32_t kMaxAllocationID =
+            (1 << TaggedPointer::kAllocationIDBits) - 1;
+        if (current_allocation_id_ == kMaxAllocationID)
         {
-            // This allocation doesn't belong to this allocator!
-            ORT_THROW_HR(E_INVALIDARG);
+            // We've reached the maximum number of allocations!
+            return absl::nullopt;
         }
 
-        return allocInfo;
+        ++current_allocation_id_;
+        return current_allocation_id_;
     }
 
-    void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
+    void BucketizedBufferAllocator::ReleaseAllocationID(uint32_t id)
     {
-        m_defaultRoundingMode = roundingMode;
+        // The mutex must already be held
+        assert(!mutex_.try_lock());
+
+        // Add it to the pool of free IDs
+        free_allocation_ids_.push_back(id);
     }
 
-    CPUAllocator::CPUAllocator(OrtMemType memType)
-        : onnxruntime::IAllocator(
-            OrtMemoryInfo(
-                "DML CPU",
-                OrtAllocatorType::OrtDeviceAllocator,
-                OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0),
-                0,
-                memType
-            )
-        )
+    D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(
+        const void* ptr,
+        uint64_t size_in_bytes)
     {
+        ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr);
+
+        TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr);
+
+        // We need to access (mutable) state after this point, so we need to lock
+        std::unique_lock<std::mutex> lock(mutex_);
+
+        // Find the allocation corresponding to this pointer
+        auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
+        ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end());
+
+        return D3D12BufferRegion(
+            tagged_ptr.offset,
+            size_in_bytes,
+            it->second->GetUavResource(),
+            it->second->GetCopySrcResource(),
+            it->second->GetCopyDstResource());
     }
 
-    void* CPUAllocator::Alloc(size_t size)
+    ComPtr<DmlManagedBufferRegion> BucketizedBufferAllocator::CreateManagedBufferRegion(
+        const void* ptr,
+        uint64_t size_in_bytes)
     {
-        if (size <= 0)
-        {
-            return nullptr;
-        }
-        void* p = malloc(size);
-        return p;
+        ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr);
+
+        TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr);
+
+        // We need to access (mutable) state after this point, so we need to lock
+        std::unique_lock<std::mutex> lock(mutex_);
+
+        // Find the allocation corresponding to this pointer
+        auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
+        ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end());
+
+        D3D12BufferRegion bufferRegion(
+            tagged_ptr.offset,
+            size_in_bytes,
+            it->second->GetUavResource(),
+            it->second->GetCopySrcResource(),
+            it->second->GetCopyDstResource());
+
+        return wil::MakeOrThrow<DmlManagedBufferRegion>(it->second, std::move(bufferRegion));
     }
 
-    void CPUAllocator::Free(void* p)
+    AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(const void* ptr)
     {
-        free(p);
+        ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr);
+        TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr);
+
+        // We need to access (mutable) state after this point, so we need to lock
+        std::unique_lock<std::mutex> lock(mutex_);
+
+        // Reject unknown or stale allocation IDs instead of dereferencing end()
+        auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
+        ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end());
+        return it->second.Get();
     }
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index 254631652cc47..f21d174500fcb 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -3,116 +3,76 @@
 
 #pragma once
 
-#include "core/framework/allocator.h"
 #include "ExecutionContext.h"
-#include "DmlResourceWrapper.h"
+#include "DmlAllocationInfo.h"
+#include "DmlBufferRegion.h"
 
 namespace Dml
 {
-    class D3D12HeapAllocator;
-
-    class CPUAllocator : public onnxruntime::IAllocator
-    {
-    public:
-        explicit CPUAllocator(OrtMemType memType);
-
-        void* Alloc(size_t size) override;
-        void Free(void* p) override;
-    };
-
+    class BucketizedBufferAllocator;
     class BucketizedBufferAllocator;
 
-    class AllocationInfo : public Microsoft::WRL::RuntimeClass<
-        Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, IUnknown>
+    // An allocator that makes logically contiguous allocations backed by D3D heaps.
+    //
+    // Heaps must fit entirely in either local or non-local memory. Larger heaps
+    // have a greater chance of getting demoted into non-local memory, which can be
+    // disastrous for performance. This problem is compounded by the fact that heaps
+    // may be demoted even if overall local memory usage is within the process'
+    // budget. Heaps are not necessarily mappable to discontiguous regions of
+    // physical memory, which means physical memory fragmentation *may* make it
+    // extremely difficult to accommodate larger heaps.
+    //
+    // On D3D hardware that supports tiled resource tier 1+ this class implements
+    // large allocations through tiling. Each allocation is backed by however many
+    // small heaps are necessary to cover the requested allocation size. Buffer
+    // regions retrieved through this allocator are reserved resources that span the
+    // full collection of heaps assigned to an individual allocation. Tile mappings
+    // are static.
+    //
+    // On hardware that doesn't support tiled resources each allocation is backed by
+    // a single heap. Buffer regions retrieved through this allocator are placed
+    // resources that span the full heap assigned to an individual allocation. In
+    // this case it is better to make more but smaller allocations (resulting in
+    // smaller heaps); this fallback path is only retained as a last resort for
+    // older hardware.
+    class BucketizedBufferAllocator
     {
     public:
-        AllocationInfo(
-            BucketizedBufferAllocator* owner,
-            size_t id,
-            uint64_t pooledResourceId,
-            DmlResourceWrapper* resourceWrapper,
-            size_t requestedSize)
-            : m_owner(owner)
-            , m_allocationId(id)
-            , m_pooledResourceId(pooledResourceId)
-            , m_resourceWrapper(resourceWrapper)
-            , m_requestedSize(requestedSize)
-        {}
-
-        ~AllocationInfo();
-
-        BucketizedBufferAllocator* GetOwner() const
-        {
-            return m_owner;
-        }
-
-        ID3D12Resource* GetUavResource() const
-        {
-            return m_resourceWrapper->GetUavResource();
-        }
-
-        ID3D12Resource* GetCopySrcResource() const
-        {
-            return m_resourceWrapper->GetCopySrcResource();
-        }
-
-        ID3D12Resource* GetCopyDstResource() const
-        {
-            return m_resourceWrapper->GetCopyDstResource();
-        }
-
-        D3D12_RESOURCE_STATES GetDefaultUavState() const
-        {
-            return m_resourceWrapper->GetDefaultUavState();
-        }
-
-        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const
-        {
-            return m_resourceWrapper->GetDefaultCopySrcState();
-        }
-
-        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const
-        {
-            return m_resourceWrapper->GetDefaultCopyDstState();
-        }
-
-        ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
-        {
-            return std::move(m_resourceWrapper);
-        }
-
-        size_t GetRequestedSize() const
-        {
-            return m_requestedSize;
-        }
+        // Maximum size of a heap (in tiles) when allocations are tiled. Each tile
+        // is 64KB. A default size of 512 tiles (32MB) does a good job of handling
+        // local video memory fragmentation without requiring lots of heaps.
+        static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512;
 
-        size_t GetId() const
-        {
-            return m_allocationId;
-        }
-
-        uint64_t GetPooledResourceId() const
-        {
-            return m_pooledResourceId;
-        }
-
-    private:
-        BucketizedBufferAllocator* m_owner;
-        size_t m_allocationId; // For debugging purposes
-        uint64_t m_pooledResourceId = 0;
-        ComPtr<DmlResourceWrapper> m_resourceWrapper;
-
-        // The size requested during Alloc(), which may be smaller than the physical resource size
-        size_t m_requestedSize;
-    };
+        BucketizedBufferAllocator(
+            ID3D12Device* device,
+            ID3D12CommandQueue* queue,
+            const D3D12_HEAP_PROPERTIES& heap_props,
+            D3D12_HEAP_FLAGS heap_flags,
+            D3D12_RESOURCE_FLAGS resource_flags,
+            D3D12_RESOURCE_STATES initial_state);
+
+        // Creates a reserved or placed resource buffer over the given memory range.
+        // The physical D3D12 resource may be larger than the requested size, so
+        // callers must ensure to use the offset/size returned in the
+        // D3D12BufferRegion else risk out of bounds access. Note that in practice
+        // the ID3D12Resource is cached, so this call typically has a lower cost
+        // than a call to ID3D12Device::CreatePlacedResource or
+        // CreateReservedResource.
+        D3D12BufferRegion CreateBufferRegion(
+            const void* ptr,
+            uint64_t size_in_bytes);
+
+        ComPtr<DmlManagedBufferRegion> CreateManagedBufferRegion(
+            const void* ptr,
+            uint64_t size_in_bytes);
+
+        AllocationInfo* GetAllocationInfo(const void* ptr);
+
+        void* Alloc(size_t size_in_bytes);
+        void Free(void* ptr);
+        uint64_t ComputeRequiredSize(size_t size);
+        bool TilingEnabled() const { return tiling_enabled_; };
 
-    // Implements a Lotus allocator for D3D12 heap buffers, using a bucket allocation strategy. The allocator
-    // maintains a set of fixed-size buckets, with each bucket containing one or more D3D12 buffers of that fixed size.
-    // All requested allocation sizes are rounded up to the nearest bucket size, which ensures minimal fragmentation
-    // while providing an upper bound on the amount of memory "wasted" with each allocation.
-    class BucketizedBufferAllocator : public onnxruntime::IAllocator
-    {
-    public:
         ~BucketizedBufferAllocator();
 
         // Constructs a BucketizedBufferAllocator which allocates D3D12 committed resources with the specified heap properties,
@@ -120,18 +80,10 @@ namespace Dml
         BucketizedBufferAllocator(
             ID3D12Device* device,
             std::shared_ptr<ExecutionContext> context,
-            std::unique_ptr<D3D12HeapAllocator>&& subAllocator);
-
-        // Returns the information associated with an opaque allocation handle returned by IAllocator::Alloc.
-        const AllocationInfo* DecodeDataHandle(const void* opaqueHandle);
+            std::unique_ptr<BucketizedBufferAllocator>&& subAllocator);
 
         void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);
 
-    public: // onnxruntime::IAllocator
-        void* Alloc(size_t size, AllocatorRoundingMode roundingMode);
-        void* Alloc(size_t size) final;
-        void Free(void* p) final;
-
     private:
         static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB
 
@@ -152,12 +104,6 @@ namespace Dml
         static gsl::index GetBucketIndexFromSize(uint64_t size);
         static uint64_t GetBucketSizeFromIndex(gsl::index index);
 
-        AllocationInfo* DecodeDataHandleInternal(void* opaqueHandle)
-        {
-            // Implement in terms of const version
-            return const_cast<AllocationInfo*>(DecodeDataHandle(static_cast<const void*>(opaqueHandle)));
-        }
-
         friend class AllocationInfo;
         void FreeResource(void* p, uint64_t resourceId);
 
@@ -168,12 +114,47 @@ namespace Dml
         uint64_t m_currentResourceId = 0;
         AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled;
         std::shared_ptr<ExecutionContext> m_context;
-        std::unique_ptr<D3D12HeapAllocator> m_subAllocator;
+        std::unique_ptr<BucketizedBufferAllocator> m_subAllocator;
 
     #if _DEBUG
         // Useful for debugging; keeps track of all allocations that haven't been freed yet
         std::map<size_t, AllocationInfo*> m_outstandingAllocationsById;
     #endif
+
+        std::mutex mutex_;
+
+        Microsoft::WRL::ComPtr<ID3D12Device> device_;
+        Microsoft::WRL::ComPtr<ID3D12CommandQueue> queue_;
+        const D3D12_HEAP_PROPERTIES heap_properties_;
+        const D3D12_HEAP_FLAGS heap_flags_;
+        const D3D12_RESOURCE_FLAGS resource_flags_;
+        const D3D12_RESOURCE_STATES initial_state_;
+        bool tiling_enabled_;
+        uint64_t max_heap_size_in_tiles_;
+
+        // The largest allocation ID we've returned so far (or 0 if we've never done
+        // so). Note that our allocation IDs start at 1 (not 0) to ensure that it
+        // isn't possible for a valid allocation to have a pointer value of
+        // 0x00000000.
+        uint32_t current_allocation_id_ = 0;
+
+        // A list of unused allocation IDs. This is for re-use of IDs once they get
+        // freed. We only bump current_allocation_id_ once there are no more free
+        // IDs.
+        std::vector<uint32_t> free_allocation_ids_;
+
+        absl::optional<DmlHeapAllocation> TryCreateTiledAllocation(uint64_t size_in_bytes);
+        absl::optional<DmlHeapAllocation> TryCreateUntiledAllocation(uint64_t size_in_bytes);
+
+        friend class D3D12BufferRegion;
+
+        absl::flat_hash_map<uint32_t, Microsoft::WRL::ComPtr<AllocationInfo>> allocations_by_id_;
+
+        // Retrieves a free allocation ID, or nullopt if no more IDs are available.
+        absl::optional<uint32_t> TryReserveAllocationID();
+
+        // Releases an allocation ID back to the pool of IDs.
+        void ReleaseAllocationID(uint32_t id);
     };
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
new file mode 100644
index 0000000000000..044e9e854d700
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
@@ -0,0 +1,19 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "precomp.h"
+#include "DmlAllocationInfo.h"
+#include "BucketizedBufferAllocator.h"
+
+namespace Dml
+{
+
+    AllocationInfo::~AllocationInfo()
+    {
+        if (m_owner)
+        {
+            m_owner->FreeResource(this, m_pooledResourceId);
+        }
+    }
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
new file mode 100644
index 0000000000000..977de7c4887e2
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
@@ -0,0 +1,80 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "DmlReservedResourceWrapper.h"
+
+namespace Dml
+{
+    class BucketizedBufferAllocator;
+
+    class AllocationInfo : public Microsoft::WRL::RuntimeClass<
+        Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, IUnknown>
+    {
+    public:
+        AllocationInfo(
+            BucketizedBufferAllocator* owner,
+            size_t id,
+            uint64_t pooledResourceId,
+            DmlResourceWrapper* resourceWrapper,
+            size_t requestedSize)
+            : m_owner(owner)
+            , m_allocationId(id)
+            , m_pooledResourceId(pooledResourceId)
+            , m_resourceWrapper(resourceWrapper)
+            , m_requestedSize(requestedSize)
+        {}
+
+        ~AllocationInfo();
+
+        BucketizedBufferAllocator* GetOwner() const
+        {
+            return m_owner;
+        }
+
+        ID3D12Resource* GetUavResource() const
+        {
+            return m_resourceWrapper->GetUavResource();
+        }
+
+        ID3D12Resource* GetCopySrcResource() const
+        {
+            return m_resourceWrapper->GetCopySrcResource();
+        }
+
+        ID3D12Resource* GetCopyDstResource() const
+        {
+            return m_resourceWrapper->GetCopyDstResource();
+        }
+
+        ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
+        {
+            return m_resourceWrapper; // const method: hands out a copy (extra reference); the member stays set
+        }
+
+        size_t GetRequestedSize() const
+        {
+            return m_requestedSize;
+        }
+
+        size_t GetId() const
+        {
+            return m_allocationId;
+        }
+
+        uint64_t GetPooledResourceId() const
+        {
+            return m_pooledResourceId;
+        }
+
+    private:
+        BucketizedBufferAllocator* m_owner;
+        size_t m_allocationId; // For debugging purposes
+        uint64_t m_pooledResourceId = 0;
+        Microsoft::WRL::ComPtr<DmlResourceWrapper> m_resourceWrapper;
+
+        // The size requested during Alloc(), which may be smaller than the physical resource size
+        size_t m_requestedSize;
+    };
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
new file mode 100644
index 0000000000000..458a65e63c0c4
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/allocator.h"
+#include "BucketizedBufferAllocator.h"
+
+namespace Dml
+{
+    // IAllocator adapter that forwards every Alloc/Free request to a BucketizedBufferAllocator.
+    class DmlBfcAllocator : public onnxruntime::IAllocator
+    {
+    public:
+        // subAllocator is kept as a raw pointer; this class never deletes it, and dereferences it on every call.
+        DmlBfcAllocator(BucketizedBufferAllocator* subAllocator)
+        : onnxruntime::IAllocator(
+            OrtMemoryInfo(
+                "DML",
+                OrtAllocatorType::OrtDeviceAllocator,
+                OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0))),
+        m_subAllocator(subAllocator) {}
+
+        void* Alloc(size_t size_in_bytes) override { return m_subAllocator->Alloc(size_in_bytes); }
+        void Free(void* ptr) override { m_subAllocator->Free(ptr); }
+    private:
+        BucketizedBufferAllocator* m_subAllocator;
+    };
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc
new file mode 100644
index 0000000000000..3240042b5b6a6
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc
@@ -0,0 +1,120 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "precomp.h"
+#include "DmlBufferRegion.h"
+
+namespace Dml
+{
+
+    D3D12BufferRegion::D3D12BufferRegion(
+        uint64_t offset,
+        uint64_t size_in_bytes,
+        ID3D12Resource* resource_uav_state,
+        ID3D12Resource* resource_copy_src_state,
+        ID3D12Resource* resource_copy_dst_state)
+        : resource_uav_state_(resource_uav_state),
+        resource_copy_src_state_(resource_copy_src_state),
+        resource_copy_dst_state_(resource_copy_dst_state),
+        offset_(offset),
+        size_in_bytes_(size_in_bytes)
+    {
+        // Get a raw pointer to the first non-null resource passed in. At least one
+        // resource must be provided.
+        first_valid_resource_ = resource_uav_state_;
+        if (!first_valid_resource_)
+        {
+            first_valid_resource_ = resource_copy_src_state_;
+        }
+        if (!first_valid_resource_)
+        {
+            first_valid_resource_ = resource_copy_dst_state_;
+        }
+        ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr);
+
+        // Regions cannot be empty.
+        ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0);
+
+        // Regions cannot extend beyond the size of the resource.
+        uint64_t buffer_size = first_valid_resource_->GetDesc().Width;
+        ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size);
+        ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset);
+
+        // All three resources, if provided, must be identical aside from state.
+        assert(
+            first_valid_resource_->GetDesc().Dimension ==
+            D3D12_RESOURCE_DIMENSION_BUFFER);
+        assert(
+            !resource_uav_state ||
+            (resource_uav_state->GetDesc().Dimension ==
+                D3D12_RESOURCE_DIMENSION_BUFFER &&
+            resource_uav_state->GetDesc().Width == buffer_size));
+        assert(
+            !resource_copy_src_state_ ||
+            (resource_copy_src_state_->GetDesc().Dimension ==
+                D3D12_RESOURCE_DIMENSION_BUFFER &&
+            resource_copy_src_state_->GetDesc().Width == buffer_size));
+        assert(
+            !resource_copy_dst_state_ ||
+            (resource_copy_dst_state_->GetDesc().Dimension ==
+                D3D12_RESOURCE_DIMENSION_BUFFER &&
+            resource_copy_dst_state_->GetDesc().Width == buffer_size));
+    }
+
+    D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that)
+    {
+        std::swap(this->resource_uav_state_, that.resource_uav_state_);
+        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
+        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
+        std::swap(this->offset_, that.offset_);
+        std::swap(this->size_in_bytes_, that.size_in_bytes_);
+        std::swap(this->first_valid_resource_, that.first_valid_resource_);
+    }
+
+    D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that)
+    {
+        std::swap(this->resource_uav_state_, that.resource_uav_state_);
+        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
+        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
+        std::swap(this->offset_, that.offset_);
+        std::swap(this->size_in_bytes_, that.size_in_bytes_);
+        std::swap(this->first_valid_resource_, that.first_valid_resource_);
+        return *this;
+    }
+
+    ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const
+    {
+        return resource_uav_state_;
+    }
+
+    ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const
+    {
+        return resource_copy_src_state_;
+    }
+
+    ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const
+    {
+        return resource_copy_dst_state_;
+    }
+
+    uint64_t D3D12BufferRegion::Offset() const
+    {
+        return first_valid_resource_ ? offset_ : 0;
+    }
+
+    uint64_t D3D12BufferRegion::SizeInBytes() const
+    {
+        return first_valid_resource_ ? size_in_bytes_ : 0;
+    }
+
+    DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const
+    {
+        if (!resource_uav_state_)
+        {
+            return DML_BUFFER_BINDING{};
+        }
+
+        return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_};
+    }
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
new file mode 100644
index 0000000000000..29a6bf6f7c775
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
@@ -0,0 +1,79 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace Dml
+{
+    // Represents a region of a D3D12 buffer resource. A buffer region has an
+    // underlying ID3D12Resource* (of D3D12_RESOURCE_DIMENSION_BUFFER), an offset in
+    // bytes from the beginning of that buffer, and a size in bytes of the region.
+    class D3D12BufferRegion
+    {
+    public:
+        D3D12BufferRegion() = default;
+
+        // References a region of a buffer. The respective ID3D12Resource objects
+        // must be in the appropriate states. Each resource is optional, but if more
+        // than one are provided they must map to the same region of memory.
+        D3D12BufferRegion(
+            uint64_t offset,
+            uint64_t size_in_bytes,
+            ID3D12Resource* resource_uav_state,
+            ID3D12Resource* resource_copy_src_state,
+            ID3D12Resource* resource_copy_dst_state);
+
+        // Move-only
+        D3D12BufferRegion(const D3D12BufferRegion&) = delete;
+        D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete;
+        D3D12BufferRegion(D3D12BufferRegion&&);
+        D3D12BufferRegion& operator=(D3D12BufferRegion&&);
+
+        ID3D12Resource* ResourceInUavState() const;
+
+        // NOTE: may be any state that is valid as a copy source (COPY_SRC,
+        // GENERIC_READ, or COMMON).
+        ID3D12Resource* ResourceInCopySrcState() const;
+
+        ID3D12Resource* ResourceInCopyDstState() const;
+
+        uint64_t Offset() const;
+        uint64_t SizeInBytes() const;
+
+        DML_BUFFER_BINDING GetBufferBinding() const;
+
+        explicit operator bool() const { return first_valid_resource_ != nullptr; }
+
+        // Creates a subregion at an offset from the start of this region. If no
+        // size is provided the region runs to the end of the current region.
+        inline D3D12BufferRegion Subregion(
+            uint64_t offset,
+            uint64_t size_in_bytes = 0) const
+        {
+            // start of subregion must be within current region
+            ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_);
+            size_in_bytes =
+                size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
+            // end of subregion must be within current region
+            ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset);
+
+            return D3D12BufferRegion(
+                offset_ + offset,
+                size_in_bytes,
+                resource_uav_state_,
+                resource_copy_src_state_,
+                resource_copy_dst_state_);
+        }
+
+    private:
+        ID3D12Resource* resource_uav_state_ = nullptr;
+        ID3D12Resource* resource_copy_src_state_ = nullptr;
+        ID3D12Resource* resource_copy_dst_state_ = nullptr;
+        uint64_t offset_ = 0;
+        uint64_t size_in_bytes_ = 0;
+
+        // Pointer to the first resource above that isn't null.
+        ID3D12Resource* first_valid_resource_ = nullptr;
+    };
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index 7f2fdafbbeb60..22161a6a58cbf 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -5,6 +5,7 @@
 #include "DmlCommandRecorder.h"
 #include "CommandQueue.h"
 #include "BucketizedBufferAllocator.h"
+#include "absl/cleanup/cleanup.h"
 
 using namespace Dml;
 
@@ -22,9 +23,14 @@ DmlCommandRecorder::DmlCommandRecorder(
     ORT_THROW_IF_FAILED(dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_recorder)));
 }
 
-void DmlCommandRecorder::SetAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator)
+void DmlCommandRecorder::SetAllocator(std::weak_ptr<onnxruntime::IAllocator> allocator)
 {
-    m_bufferAllocator = allocator;
+    m_allocator = allocator;
+}
+
+void DmlCommandRecorder::SetSubAllocator(std::weak_ptr<BucketizedBufferAllocator> subAllocator)
+{
+    m_subAllocator = subAllocator;
 }
 
 void DmlCommandRecorder::InitializeOperator(
@@ -57,26 +63,25 @@ void DmlCommandRecorder::InitializeOperator(
     UINT64 temporaryResourceSize = initBindingProps.TemporaryResourceSize;
     if (temporaryResourceSize > 0)
     {
-        auto allocator = m_bufferAllocator.lock();
+        auto allocator = m_allocator.lock();
 
         // Allocate and immediately free a temporary buffer. The buffer resource will still be
         // alive (managed by the pool); freeing allows the resource to be shared with other operators.
-        void* tempResourceHandle = allocator->Alloc(static_cast<size_t>(temporaryResourceSize), AllocatorRoundingMode::Enabled);
-
-
-
+        void* tempResourceHandle = allocator->Alloc(static_cast<size_t>(temporaryResourceSize));
         if (!tempResourceHandle)
         {
             ORT_THROW_HR(E_OUTOFMEMORY);
         }
+        // The guard must be named: an unnamed absl::Cleanup temporary would run (and free the handle) immediately.
+        absl::Cleanup tempResourceCleanup = [allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); };
 
-        ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetUavResource();
-        allocator->Free(tempResourceHandle);
+        auto subAllocator = m_subAllocator.lock();
+        auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize);
 
         // Bind the temporary resource.
-        DML_BUFFER_BINDING bufferBinding = { buffer, 0, temporaryResourceSize };
+        DML_BUFFER_BINDING bufferBinding = bufferRegion.GetBufferBinding();
         DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding };
         bindingTable->BindTemporaryResource(&bindingDesc);
     }
 
     // Bind inputs, if provided.
@@ -138,21 +143,22 @@ void DmlCommandRecorder::ExecuteOperator(
     UINT64 temporaryResourceSize = execBindingProps.TemporaryResourceSize;
     if (temporaryResourceSize > 0)
     {
-        auto allocator = m_bufferAllocator.lock();
+        auto allocator = m_allocator.lock();
 
         // Allocate and immediately free a temporary buffer. The buffer resource will still be
         // alive (managed by the pool); freeing allows the resource to be shared with other operators.
-        void* tempResourceHandle = allocator->Alloc(static_cast<size_t>(temporaryResourceSize), AllocatorRoundingMode::Enabled);
+        void* tempResourceHandle = allocator->Alloc(static_cast<size_t>(temporaryResourceSize));
         if (!tempResourceHandle)
         {
             ORT_THROW_HR(E_OUTOFMEMORY);
         }
+        absl::Cleanup tempResourceCleanup = [allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); }; // named so it runs at scope exit
 
-        ID3D12Resource* buffer = allocator->DecodeDataHandle(tempResourceHandle)->GetUavResource();
-        allocator->Free(tempResourceHandle);
+        auto subAllocator = m_subAllocator.lock();
+        auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize);
 
         // Bind the temporary resource.
-        DML_BUFFER_BINDING bufferBinding = { buffer, 0, temporaryResourceSize };
+        DML_BUFFER_BINDING bufferBinding = bufferRegion.GetBufferBinding();
         DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding };
         bindingTable->BindTemporaryResource(&bindingDesc);
     }
@@ -196,6 +202,7 @@ void DmlCommandRecorder::CopyBufferRegion(
 
 void DmlCommandRecorder::FillBufferWithPattern(
     ID3D12Resource* dstBuffer,
+    uint64_t offset,
     gsl::span<const std::byte> value /* Data type agnostic value, treated as raw bits */)
 {
     // The fill pattern for ClearUnorderedAccessViewUint is 16 bytes.
@@ -226,6 +233,7 @@ void DmlCommandRecorder::FillBufferWithPattern(
     D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
     uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
     uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
+    uavDesc.Buffer.FirstElement = gsl::narrow<uint32_t>(offset / sizeof(uint32_t));
     uavDesc.Buffer.NumElements = gsl::narrow<uint32_t>(dstBuffer->GetDesc().Width / sizeof(uint32_t));
     uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
index 7ad7032317d77..2bf23062a49f7 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
@@ -5,6 +5,7 @@
 
 #include "ICommandRecorder.h"
 #include "CommandAllocatorRing.h"
+#include "core/framework/allocator.h"
 
 namespace Dml
 {
@@ -16,7 +17,7 @@ namespace Dml
     public:
         DmlCommandRecorder(
             ID3D12Device* d3dDevice,
-            IDMLDevice* device, 
+            IDMLDevice* device,
             std::shared_ptr<CommandQueue> commandQueue);
 
         void InitializeOperator(
@@ -39,6 +40,7 @@ namespace Dml
 
         void FillBufferWithPattern(
             ID3D12Resource* dstBuffer,
+            uint64_t offset,
             gsl::span<const std::byte> value /* Data type agnostic value, treated as raw bits */);
 
         void ExecuteCommandList(
@@ -47,14 +49,15 @@ namespace Dml
             _Out_ uint64_t* completionValue);
 
         ComPtr<ID3D12GraphicsCommandList> GetCommandList();
-        
+
         void ResourceBarrier(gsl::span<const D3D12_RESOURCE_BARRIER> barriers);
         void AddUAVBarrier();
 
         void Open() final;
         void CloseAndExecute() final;
-        
-        void SetAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator);
+
+        void SetAllocator(std::weak_ptr<onnxruntime::IAllocator> allocator);
+        void SetSubAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator);
 
         bool HasUnsubmittedWork() override
         {
@@ -81,7 +84,8 @@ namespace Dml
         ID3D12DescriptorHeap* m_currentDescriptorHeap = nullptr;
 
         // The weak pointer avoids a circular reference from context->recorder->allocator->context
-        std::weak_ptr<BucketizedBufferAllocator> m_bufferAllocator;
+        std::weak_ptr<onnxruntime::IAllocator> m_allocator;
+        std::weak_ptr<BucketizedBufferAllocator> m_subAllocator;
 
         CommandAllocatorRing<2> m_commandAllocatorRing;
 
@@ -89,7 +93,7 @@ namespace Dml
         ComPtr<ID3D12GraphicsCommandList> m_currentCommandList;
         bool m_operationsRecordedInCurrentCommandList = false;
 
-        // Command lists which have been batched up for execution.  The values in 
+        // Command lists which have been batched up for execution.  The values in
         // m_pendingCommandListsCacheable indicate whether they can be moved into this
         // class's cache after execution, versus if they belong to the caller and were
         // passed to ExecuteCommandList.
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.cpp
new file mode 100644
index 0000000000000..a9ba854a45747
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.cpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "precomp.h"
+#include "DmlCpuAllocator.h"
+
+namespace Dml
+{
+
+DmlCpuAllocator::DmlCpuAllocator(OrtMemType memType)
+    : onnxruntime::IAllocator(
+        OrtMemoryInfo(
+            "DML CPU",
+            OrtAllocatorType::OrtDeviceAllocator,
+            OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0),
+            0,
+            memType
+        )
+    )
+{
+}
+
+void* DmlCpuAllocator::Alloc(size_t size)
+{
+    // size_t is unsigned, so a zero-byte request is the only case rejected here.
+    if (size == 0)
+    {
+        return nullptr;
+    }
+    return malloc(size);
+}
+
+void DmlCpuAllocator::Free(void* p)
+{
+    free(p);
+}
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.h
new file mode 100644
index 0000000000000..2f81975d2c4cd
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCpuAllocator.h
@@ -0,0 +1,20 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/allocator.h"
+
+namespace Dml
+{
+
+class DmlCpuAllocator : public onnxruntime::IAllocator
+{
+public:
+    explicit DmlCpuAllocator(OrtMemType memType);
+
+    void* Alloc(size_t size) override;
+    void Free(void* p) override;
+};
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
new file mode 100644
index 0000000000000..554a4dca8e550
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/allocator.h"
+#include "BucketizedBufferAllocator.h"
+
+namespace Dml
+{
+    // IAllocator that sends Alloc/Free to a BFC allocator and also exposes the DML sub-allocator to callers.
+    class DmlGpuAllocator : public onnxruntime::IAllocator
+    {
+    public:
+        // Both arguments are kept as raw pointers; this class never deletes them.
+        DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, BucketizedBufferAllocator* subAllocator)
+        : onnxruntime::IAllocator(
+            OrtMemoryInfo(
+                "DML",
+                OrtAllocatorType::OrtDeviceAllocator,
+                OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0))),
+        m_bfcAllocator(bfcAllocator),
+        m_subAllocator(subAllocator) {}
+
+        void* Alloc(size_t size_in_bytes) override { return m_bfcAllocator->Alloc(size_in_bytes); }
+        void Free(void* ptr) override { m_bfcAllocator->Free(ptr); }
+
+        BucketizedBufferAllocator* GetSubAllocator() const { return m_subAllocator; }
+
+    private:
+        // This allocator is managed by ORT and should be used to allocate/free memory in order
+        // to utilize the BFC capabilities
+        onnxruntime::IAllocator* m_bfcAllocator;
+
+        // This allocator is specific to DML and is used to decode the opaque data returned by the BFC
+        // allocator into objects that DML understands
+        BucketizedBufferAllocator* m_subAllocator;
+    };
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index 9d0ba9dc7ea51..890c5aa1ae384 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -94,11 +94,11 @@ namespace DmlGraphFusionHelper
         ID3D12Resource** resource,
         uint64_t* allocId)
     {
-        IUnknown* allocationUnk = static_cast<IUnknown*>(const_cast<void*>(tensor->DataRaw()));
+        void* opaqueData = const_cast<void*>(tensor->DataRaw());
         Microsoft::WRL::ComPtr<IUnknown> resourceUnk;
-        winmlProvider->GetABIDataInterface(false, allocationUnk, &resourceUnk);
+        winmlProvider->GetABIDataInterface(opaqueData, &resourceUnk);
 
-        *allocId = winmlProvider->TryGetPooledAllocationId(allocationUnk, 0);
+        *allocId = winmlProvider->TryGetPooledAllocationId(opaqueData, 0);
 
         ORT_THROW_IF_FAILED(resourceUnk->QueryInterface(resource));
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h
new file mode 100644
index 0000000000000..6de78a47b6d8b
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace Dml
+{
+    struct DmlHeapAllocation
+    {
+        Microsoft::WRL::ComPtr<ID3D12Heap> heap;
+
+        // Heaps backing the memory for the allocation. If tiling is supported
+        // an allocation may comprise multiple heaps. If tiling is not supported
+        // an allocation will only have a single heap.
+        std::vector<Microsoft::WRL::ComPtr<ID3D12Heap>> heaps;
+
+        // Resources created over this allocation's heaps. All three resources
+        // are identical aside from being fixed in a single resource state: UAV,
+        // COPY_SRC, and COPY_DST respectively. The purpose of duplicate
+        // resources is to enable overlapping resources in different states for
+        // copying data. Most callers will not (and should not) interact
+        // directly with these resources; all three are wrapped by the buffer
+        // regions returned from this allocator, and the appropriate resource
+        // will be used automatically when performing buffer copies.
+        Microsoft::WRL::ComPtr<ID3D12Resource> resource_uav_state;
+        Microsoft::WRL::ComPtr<ID3D12Resource> resource_copy_src_state;
+        Microsoft::WRL::ComPtr<ID3D12Resource> resource_copy_dst_state;
+    };
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
deleted file mode 100644
index bdda99ae6f91a..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "precomp.h"
-#include "DmlHeapAllocator.h"
-#include "DmlReservedResourceWrapper.h"
-
-namespace Dml
-{
-
-static bool GetTilingEnabled(ID3D12Device* device)
-{
-    D3D12_FEATURE_DATA_D3D12_OPTIONS options = {};
-    if (SUCCEEDED(device->CheckFeatureSupport(
-            D3D12_FEATURE_D3D12_OPTIONS,
-            &options,
-            sizeof(options))))
-    {
-        return options.TiledResourcesTier >= D3D12_TILED_RESOURCES_TIER_1;
-    }
-
-    return false;
-}
-
-static uint64_t GetMaxHeapSizeInTiles()
-{
-    return D3D12HeapAllocator::kDefaultMaxHeapSizeInTiles;
-}
-
-D3D12HeapAllocator::D3D12HeapAllocator(
-    ID3D12Device* device,
-    ID3D12CommandQueue* queue,
-    const D3D12_HEAP_PROPERTIES& heap_props,
-    D3D12_HEAP_FLAGS heap_flags,
-    D3D12_RESOURCE_FLAGS resource_flags,
-    D3D12_RESOURCE_STATES initial_state)
-    : device_(device),
-      queue_(queue),
-      heap_properties_(heap_props),
-      heap_flags_(heap_flags),
-      resource_flags_(resource_flags),
-      initial_state_(initial_state),
-      tiling_enabled_(GetTilingEnabled(device)),
-      max_heap_size_in_tiles_(GetMaxHeapSizeInTiles())
-{
-}
-
-absl::optional<Allocation> D3D12HeapAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes)
-{
-    Allocation allocation = {};
-
-    // The allocation may be larger than the requested size to ensure a whole
-    // number of tiles.
-    const uint64_t resource_size_in_tiles =
-        1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-    const uint64_t resource_size_in_bytes =
-        resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-    auto resource_desc =
-        CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_);
-
-    ID3D12Resource** resources[] = {
-        &allocation.resource_uav_state,
-        &allocation.resource_copy_src_state,
-        &allocation.resource_copy_dst_state};
-
-    D3D12_RESOURCE_STATES states[] = {
-        initial_state_,
-        D3D12_RESOURCE_STATE_COPY_SOURCE,
-        D3D12_RESOURCE_STATE_COPY_DEST};
-
-    for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++)
-    {
-        HRESULT create_resource_hr = device_->CreateReservedResource(
-            &resource_desc,
-            states[i],
-            nullptr,
-            IID_PPV_ARGS(resources[i]));
-
-        if (create_resource_hr == E_OUTOFMEMORY)
-        {
-            return absl::nullopt;
-        }
-        ORT_THROW_IF_FAILED(create_resource_hr);
-    }
-
-    // Reserve enough heaps to store all tiles in the resource.
-    const uint64_t heap_count =
-        1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_;
-    allocation.heaps.resize(heap_count);
-
-    // Create heaps and map them to the primary reserved resource.
-    D3D12_TILED_RESOURCE_COORDINATE resource_region_start_coordinates = {};
-    uint64_t unmapped_resource_tiles = resource_size_in_tiles;
-    for (uint64_t i = 0; i < heap_count; i++)
-    {
-        // Create heap. The last heap of the allocation may have fewer tiles to
-        // avoid wasting space.
-        uint64_t heap_size_in_tiles = std::min<uint64_t>(
-            unmapped_resource_tiles,
-            max_heap_size_in_tiles_);
-        uint64_t heap_size_in_bytes =
-            heap_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-        auto heap_desc = CD3DX12_HEAP_DESC(
-            heap_size_in_bytes,
-            heap_properties_,
-            0,
-            heap_flags_);
-
-        HRESULT create_heap_hr =
-            device_->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i]));
-        if (create_heap_hr == E_OUTOFMEMORY)
-        {
-            return absl::nullopt;
-        }
-        ORT_THROW_IF_FAILED(create_heap_hr);
-
-        // Source region in the resource to map.
-        D3D12_TILE_REGION_SIZE resource_region_size = {};
-        resource_region_size.NumTiles = static_cast<uint32_t>(heap_size_in_tiles);
-
-        // Target range in the current heap to map.
-        const D3D12_TILE_RANGE_FLAGS tile_range_flags =
-            D3D12_TILE_RANGE_FLAG_NONE;
-        const uint32_t heap_range_start_offset = 0;
-        const uint32_t heap_range_tile_count = static_cast<uint32_t>(heap_size_in_tiles);
-
-        constexpr uint32_t numResourceRegions = 1;
-        constexpr uint32_t numHeapRanges = 1;
-
-        // This is a brand new allocation/resource, so the tile mappings are
-        // guaranteed to be set (on the GPU timeline) by the time any code can
-        // reference the returned resource. We only execute operations on a
-        // single hardware queue so there is no need to wait or signal.
-        //
-        // All resources have identical tile mappings. The repeated call to
-        // UpdateTileMappings on all resources instead of using CopyTileMappings
-        // is intentional: the latter API is not supported by all versions of
-        // PIX.
-        for (auto resource :
-             {allocation.resource_uav_state.Get(),
-              allocation.resource_copy_src_state.Get(),
-              allocation.resource_copy_dst_state.Get()})
-        {
-            queue_->UpdateTileMappings(
-                resource,
-                numResourceRegions,
-                &resource_region_start_coordinates,
-                &resource_region_size,
-                allocation.heaps[i].Get(),
-                numHeapRanges,
-                &tile_range_flags,
-                &heap_range_start_offset,
-                &heap_range_tile_count,
-                D3D12_TILE_MAPPING_FLAG_NONE);
-        }
-
-        resource_region_start_coordinates.X += static_cast<uint32_t>(heap_size_in_tiles);
-        unmapped_resource_tiles -= heap_size_in_tiles;
-    }
-
-    assert(unmapped_resource_tiles == 0);
-
-    return allocation;
-}
-
-absl::optional<Allocation> D3D12HeapAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes)
-{
-    Allocation allocation = {};
-
-    // Create the allocation's sole heap. The allocation may be larger than the
-    // requested size to ensure a whole number of tiles.
-    allocation.heaps.resize(1);
-    D3D12_HEAP_DESC heap_desc =
-        CD3DX12_HEAP_DESC(size_in_bytes, heap_properties_, 0, heap_flags_);
-    HRESULT create_heap_hr = device_->CreateHeap(
-        &heap_desc,
-        IID_PPV_ARGS(&allocation.heaps.front()));
-    if (create_heap_hr == E_OUTOFMEMORY)
-    {
-        return absl::nullopt;
-    }
-
-    // Create large placed resource that spans the heap.
-    D3D12_RESOURCE_DESC resource_desc =
-        CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_);
-
-    ID3D12Resource** resources[] = {
-        &allocation.resource_uav_state,
-        &allocation.resource_copy_src_state,
-        &allocation.resource_copy_dst_state};
-    D3D12_RESOURCE_STATES states[] = {
-        initial_state_,
-        D3D12_RESOURCE_STATE_COPY_SOURCE,
-        D3D12_RESOURCE_STATE_COPY_DEST};
-
-    for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++)
-    {
-        HRESULT create_resource_hr = device_->CreatePlacedResource(
-            allocation.heaps.front().Get(),
-            0,
-            &resource_desc,
-            states[i],
-            nullptr,
-            IID_PPV_ARGS(resources[i]));
-        if (create_resource_hr == E_OUTOFMEMORY)
-        {
-            return absl::nullopt;
-        }
-        ORT_THROW_IF_FAILED(create_resource_hr);
-    }
-
-    return allocation;
-}
-
-uint64_t D3D12HeapAllocator::ComputeRequiredSize(size_t size)
-{
-    const uint64_t resource_size_in_tiles =
-        1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-    const uint64_t resource_size_in_bytes =
-        resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-
-    return resource_size_in_bytes;
-}
-
-Microsoft::WRL::ComPtr<DmlResourceWrapper> D3D12HeapAllocator::Alloc(size_t size_in_bytes)
-{
-    if (size_in_bytes == 0)
-    {
-        return nullptr;
-    }
-
-    // The D3D12 device is thread-safe so we don't need to hold the lock while
-    // creating an allocation.
-    absl::optional<Allocation> allocation =
-        tiling_enabled_ ? TryCreateTiledAllocation(size_in_bytes)
-                        : TryCreateUntiledAllocation(size_in_bytes);
-
-    ORT_THROW_HR_IF(E_UNEXPECTED, !allocation);
-
-    auto reservedResourceWrapper = wil::MakeOrThrow<DmlReservedResourceWrapper>(std::move(*allocation));
-    Microsoft::WRL::ComPtr<DmlResourceWrapper> resourceWrapper;
-    reservedResourceWrapper.As(&resourceWrapper);
-    return resourceWrapper;
-}
-
-}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
deleted file mode 100644
index 6e13ad71f5877..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocator.h
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include "DmlResourceWrapper.h"
-
-namespace Dml
-{
-
-struct Allocation
-{
-    Microsoft::WRL::ComPtr<ID3D12Heap> heap;
-
-    // Heaps backing the memory for the allocation. If tiling is supported
-    // an allocation may comprise multiple heaps. If tiling is not supported
-    // an allocation will only have a single heap.
-    std::vector<Microsoft::WRL::ComPtr<ID3D12Heap>> heaps;
-
-    // Resources created over this allocation's heaps. All three resources
-    // are identical aside from being fixed in a single resource state: UAV,
-    // COPY_SRC, and COPY_DST respectively. The purpose of duplicate
-    // resources is to enable overlapping resources in different states for
-    // copying data. Most callers will not (and should not) interact
-    // directly with these resources; all three are wrapped by the buffer
-    // regions returned from this allocator, and the appropriate resource
-    // will be used automatically when performing buffer copies.
-    Microsoft::WRL::ComPtr<ID3D12Resource> resource_uav_state;
-    Microsoft::WRL::ComPtr<ID3D12Resource> resource_copy_src_state;
-    Microsoft::WRL::ComPtr<ID3D12Resource> resource_copy_dst_state;
-};
-
-// An allocator that makes logically contiguous allocations backed by D3D heaps.
-//
-// Heaps must fit entirely in either local or non-local memory. Larger heaps
-// have a greater chance of getting demoted into non-local memory, which can be
-// disastrous for performance. This problem is compounded by the fact that heaps
-// may be demoted even if overall local memory usage is within the process'
-// budget. Heaps are not necessarily mappable to discontiguous regions of
-// physical memory, which means physical memory fragmentation *may* make it
-// extremely difficult to accommodate larger heaps.
-//
-// On D3D hardware that supports tiled resource tier 1+ this class implements
-// large allocations through tiling. Each allocation is backed by however many
-// small heaps are necessary to cover the requested allocation size. Buffer
-// regions retrieved through this allocator are reserved resources that span the
-// full collection of heaps assigned to an individual allocation. Tile mappings
-// are static.
-//
-// On hardware that doesn't support tiled resources each allocation is backed by
-// a single heap. Buffer regions retrieved through this allocator are placed
-// resources that span the full heap assigned to an individual allocation. In
-// this case it is better make more but smaller allocations (resulting in
-// smaller heaps); this fallback path is only retained as a last resort for
-// older hardware.
-class D3D12HeapAllocator
-{
-  public:
-    // Maximum size of a heap (in tiles) when allocations are tiled. Each tile
-    // is 64KB. A default size of 512 tiles (32MB) does a good job of handling
-    // local video memory fragmentation without requiring lots of heaps.
-    static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512;
-
-    D3D12HeapAllocator(
-        ID3D12Device* device,
-        ID3D12CommandQueue* queue,
-        const D3D12_HEAP_PROPERTIES& heap_props,
-        D3D12_HEAP_FLAGS heap_flags,
-        D3D12_RESOURCE_FLAGS resource_flags,
-        D3D12_RESOURCE_STATES initial_state);
-
-    Microsoft::WRL::ComPtr<DmlResourceWrapper> Alloc(size_t size_in_bytes);
-    uint64_t ComputeRequiredSize(size_t size);
-    bool TilingEnabled() const { return tiling_enabled_; };
-
-  private:
-    std::mutex mutex_;
-
-    Microsoft::WRL::ComPtr<ID3D12Device> device_;
-    Microsoft::WRL::ComPtr<ID3D12CommandQueue> queue_;
-    const D3D12_HEAP_PROPERTIES heap_properties_;
-    const D3D12_HEAP_FLAGS heap_flags_;
-    const D3D12_RESOURCE_FLAGS resource_flags_;
-    const D3D12_RESOURCE_STATES initial_state_;
-    bool tiling_enabled_;
-    uint64_t max_heap_size_in_tiles_;
-
-  private:
-    absl::optional<Allocation> TryCreateTiledAllocation(uint64_t size_in_bytes);
-    absl::optional<Allocation> TryCreateUntiledAllocation(
-        uint64_t size_in_bytes);
-
-    friend class D3D12BufferRegion;
-};
-
-}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h
new file mode 100644
index 0000000000000..de39f0890f998
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h
@@ -0,0 +1,26 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "DmlBufferRegion.h"
+#include "DmlAllocationInfo.h"
+
+namespace Dml
+{
+    class DmlManagedBufferRegion : public Microsoft::WRL::RuntimeClass<Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, IUnknown>
+    {  // Classic-COM object that bundles an AllocationInfo reference with a D3D12BufferRegion.
+    public:
+        DmlManagedBufferRegion(Microsoft::WRL::ComPtr<AllocationInfo> allocation, D3D12BufferRegion&& bufferRegion)
+            : m_allocation(std::move(allocation)),
+              m_bufferRegion(std::move(bufferRegion))
+        {
+        }
+
+        const D3D12BufferRegion& GetBufferRegion() const { return m_bufferRegion; }  // read-only view of the wrapped region
+
+    private:
+        Microsoft::WRL::ComPtr<AllocationInfo> m_allocation;  // reference held for this object's lifetime; presumably keeps the backing allocation alive - TODO confirm
+        D3D12BufferRegion m_bufferRegion;  // region handed out by GetBufferRegion()
+    };
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
index 413ade92daf51..68feab568ca45 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
@@ -1,15 +1,22 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#pragma once
+
 #include "DmlResourceWrapper.h"
-#include "DmlHeapAllocator.h"
+#include "DmlHeapAllocation.h"
+#include "DmlTaggedPointer.h"
 
 namespace Dml
 {
     class DmlReservedResourceWrapper : public Microsoft::WRL::RuntimeClass<Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, DmlResourceWrapper>
     {
     public:
-        DmlReservedResourceWrapper(Allocation&& allocation) : m_allocation(std::move(allocation)) {}
+        DmlReservedResourceWrapper(DmlHeapAllocation&& allocation)
+            : m_allocation(std::move(allocation))
+        {
+        }
+
         ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); }
         ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); }
         ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); }
@@ -19,6 +26,6 @@ namespace Dml
         D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; }
 
     private:
-        Allocation m_allocation;
+        DmlHeapAllocation m_allocation;
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
new file mode 100644
index 0000000000000..da5ed6df2ff4c
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
@@ -0,0 +1,41 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "DmlTaggedPointer.h"
+#include <cassert>
+
+namespace Dml
+{
+/*static*/ TaggedPointer TaggedPointer::Unpack(const void* ptr)
+{
+    // Split the 64-bit value into its fields: device ID in the topmost bits,
+    // then the allocation ID, then the offset in the lowest bits.
+    constexpr uint64_t offsetMask = (1ull << kOffsetBits) - 1;
+    constexpr uint64_t allocationIdMask = (1ull << kAllocationIDBits) - 1;
+    constexpr uint64_t deviceIdShift = kAllocationIDBits + kOffsetBits;
+    const uint64_t bits = reinterpret_cast<uint64_t>(ptr);
+
+    TaggedPointer unpacked;
+    unpacked.offset = bits & offsetMask;
+    unpacked.allocation_id = (bits >> kOffsetBits) & allocationIdMask;
+    unpacked.device_id = bits >> deviceIdShift;
+    return unpacked;
+}
+
+/*static*/ void* TaggedPointer::Pack(
+    uint32_t device_id,
+    uint32_t allocation_id,
+    uint64_t offset)
+{
+    // Every field has to fit in the number of bits reserved for it.
+    assert((device_id >> kDeviceIDBits) == 0);
+    assert((allocation_id >> kAllocationIDBits) == 0);
+    assert((offset >> kOffsetBits) == 0);
+
+    // Layout, most to least significant: device ID, allocation ID, offset.
+    const uint64_t deviceBits = static_cast<uint64_t>(device_id) << (kAllocationIDBits + kOffsetBits);
+    const uint64_t allocationBits = static_cast<uint64_t>(allocation_id) << kOffsetBits;
+
+    return reinterpret_cast<void*>(deviceBits | allocationBits | offset);
+}
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
new file mode 100644
index 0000000000000..96b0eb318ad48
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
@@ -0,0 +1,43 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <climits>
+#include <cstdint>
+
+namespace Dml
+{
+
+// D3D12HeapAllocator and D3D12DescriptorHeapAllocator encode the device ID and
+// allocation ID into the high bits of the pointers they return, while the low
+// bits are used as an offset into the allocation. Since the layout of bitfields
+// is implementation-defined, you can't just cast a void* into a TaggedPointer:
+// the conversion must go through Pack/Unpack, which use masks and shifts.
+struct TaggedPointer
+{
+    static constexpr uint64_t kDeviceIDBits = 4;       // topmost bits of the packed value
+    static constexpr uint64_t kAllocationIDBits = 20;  // bits between the device ID and the offset
+    static constexpr uint64_t kOffsetBits = 40;        // lowest bits of the packed value
+
+    uint64_t device_id : kDeviceIDBits;
+    uint64_t allocation_id : kAllocationIDBits;
+    uint64_t offset : kOffsetBits;
+
+    static void* Pack(
+        uint32_t device_id,
+        uint32_t allocation_id,
+        uint64_t offset);
+    static TaggedPointer Unpack(const void* ptr);
+};
+
+static_assert(
+    sizeof(TaggedPointer) == sizeof(void*),
+    "DML requires a 64-bit architecture");
+static_assert(
+    TaggedPointer::kDeviceIDBits + TaggedPointer::kAllocationIDBits +
+            TaggedPointer::kOffsetBits ==
+        sizeof(void*) * CHAR_BIT,
+    "DML requires a 64-bit architecture");
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index 1d41d26cf0062..c3415c4b9ea49 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -18,7 +18,7 @@ namespace Dml
         ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf())));
     }
 
-    void ExecutionContext::SetAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator)
+    void ExecutionContext::SetAllocator(std::weak_ptr<onnxruntime::IAllocator> allocator)
     {
         m_dmlRecorder.SetAllocator(allocator);
     }
@@ -68,10 +68,11 @@ namespace Dml
 
     void ExecutionContext::FillBufferWithPattern(
         ID3D12Resource* dstBuffer,
+        uint64_t offset,
         gsl::span<const std::byte> value /* Data type agnostic value, treated as raw bits */)
     {
         SetCommandRecorder(&m_dmlRecorder);
-        m_dmlRecorder.FillBufferWithPattern(dstBuffer, value);
+        m_dmlRecorder.FillBufferWithPattern(dstBuffer, offset, value);
     }
 
     void ExecutionContext::ExecuteCommandList(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
index b06f11a5efd0a..6625ae83ffd1e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
@@ -20,13 +20,13 @@ namespace Dml
     public:
         // Constructs an ExecutionContext that executes on the supplied queue.
         ExecutionContext(
-            ID3D12Device* d3d12Device, 
-            IDMLDevice* dmlDevice, 
+            ID3D12Device* d3d12Device,
+            IDMLDevice* dmlDevice,
             ID3D12CommandQueue* queue);
 
-        void SetAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator);
+        void SetAllocator(std::weak_ptr<onnxruntime::IAllocator> allocator);
 
-        // Waits for flushed work, discards unflushed work, and discards associated references to 
+        // Waits for flushed work, discards unflushed work, and discards associated references to
         // prevent circular references.  Must be the last call on the object before destruction.
         void Close();
 
@@ -44,6 +44,7 @@ namespace Dml
 
         void FillBufferWithPattern(
             ID3D12Resource* dstBuffer,
+            uint64_t offset,
             gsl::span<const std::byte> value /* Data type agnostic value, treated as raw bits */);
 
         void InitializeOperator(
@@ -75,12 +76,12 @@ namespace Dml
         // Returns an event which will become signaled when everything submitted to the execution context thus far has
         // completed execution on the GPU, including work that has yet to be flushed to the queue.
         GpuEvent GetCurrentCompletionEvent();
-        
+
         // Adds a reference which will be released when queued GPU work is completed
         void QueueReference(IUnknown* object);
 
         // Release any accumulated references who corresponding GPU fence values have
-        // been reached.  
+        // been reached.
         void ReleaseCompletedReferences();
 
         D3D12_COMMAND_LIST_TYPE GetCommandListTypeForQueue() const;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index fddd3267d9770..ca9080e4fe665 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -9,6 +9,7 @@
 #include "ReadbackHeap.h"
 #include "ExecutionContext.h"
 #include "BucketizedBufferAllocator.h"
+#include "DmlCpuAllocator.h"
 #include "MLOperatorAuthorImpl.h"
 #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h"
 #include "core/providers/dml/OperatorAuthorHelper/OperatorHelper.h"
@@ -18,7 +19,10 @@
 #include "core/framework/compute_capability.h"
 #include "core/framework/fallback_cpu_capability.h"
 #include "DmlCommittedResourceWrapper.h"
-#include "DmlHeapAllocator.h"
+#include "DmlBufferRegion.h"
+#include "DmlManagedBufferRegion.h"
+#include "DmlBfcAllocator.h"
+#include "DmlGpuAllocator.h"
 
 #ifdef ERROR
 #undef ERROR
@@ -111,32 +115,32 @@ namespace Dml
 
     HRESULT __stdcall ExecutionProviderImpl::AllocatePooledResource(
         size_t size,
-        AllocatorRoundingMode roundingMode,
-        ID3D12Resource **d3dResource,
-        IUnknown** pooledResource
+        DmlManagedBufferRegion** managedBufferRegion  // out: receives the newly created managed region via CopyTo
     ) const noexcept
     {
         ORT_TRY
         {
-        ComPtr<IUnknown> allocation;
-        allocation.Attach(static_cast<IUnknown* >(m_allocator->Alloc(size, roundingMode)));
-
-        const auto* allocInfo = m_allocator->DecodeDataHandle(allocation.Get());
-
-        ComPtr<ID3D12Resource> resource = allocInfo->GetUavResource();
-        resource.CopyTo(d3dResource);
-        *pooledResource = allocation.Detach();
+        void* opaqueData = m_bfcAllocator->Alloc(size);  // NOTE(review): not released in this function if the next call throws - confirm ownership
+        auto bufferRegion = m_subAllocator->CreateManagedBufferRegion(opaqueData, size);
+        bufferRegion.CopyTo(managedBufferRegion);
         return S_OK;
         }
         ORT_CATCH_RETURN
     }
 
-    ID3D12Resource* __stdcall ExecutionProviderImpl::DecodeResource(void* allocation) const noexcept
+    D3D12BufferRegion ExecutionProviderImpl::GetBufferForTensor(IMLOperatorTensor* tensor) const
+    {
+        // Hand the tensor's data pointer and unaligned byte size to the sub-allocator to build the region.
+        MLOperatorTensor wrappedTensor(tensor);
+        void* const tensorData = wrappedTensor.GetByteData();
+        return m_subAllocator->CreateBufferRegion(tensorData, wrappedTensor.GetUnalignedTensorByteSize());
+    }
+
+    ID3D12Resource* __stdcall ExecutionProviderImpl::DecodeResource(IMLOperatorTensor* tensor) const noexcept
     {
         ORT_TRY
         {
-            const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(allocation);
-            return allocInfo->GetUavResource();
+            return GetBufferForTensor(tensor).ResourceInUavState();
         }
         ORT_CATCH_GENERIC
         {
@@ -178,7 +182,7 @@ namespace Dml
 
         m_context = std::make_shared<ExecutionContext>(m_d3d12Device.Get(), m_dmlDevice.Get(), queue);
 
-        auto heapAllocator = std::make_unique<D3D12HeapAllocator>(
+        m_subAllocator = std::make_shared<BucketizedBufferAllocator>(
             m_d3d12Device.Get(),
             queue,
             CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
@@ -186,21 +190,25 @@ namespace Dml
             D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
-        // Create an allocator for D3D12 buffers used to hold tensor data. The returned buffers from the allocator
-        // should be DEFAULT heap buffers which can be used as UAVs, and which start in UAV state.
-        m_allocator = std::make_shared<BucketizedBufferAllocator>(
-            m_d3d12Device.Get(),
-            m_context,
-            std::move(heapAllocator));
+        // Create a BFC allocator that encapsulates our allocator
+        onnxruntime::AllocatorCreationInfo memoryInfo(
+            [this](OrtDevice::DeviceId id) {
+                return std::make_unique<DmlBfcAllocator>(m_subAllocator.get());
+            });
+
+        m_bfcAllocator = onnxruntime::CreateAllocator(memoryInfo);
+
+        // Wrap the BFC allocator into our own allocator
+        m_gpuAllocator = std::make_shared<DmlGpuAllocator>(m_bfcAllocator.get(), m_subAllocator.get());
 
-        m_context->SetAllocator(m_allocator);
+        m_context->SetAllocator(m_bfcAllocator);
 
         m_uploadHeap = std::make_unique<PooledUploadHeap>(m_d3d12Device.Get(), m_context);
         m_readbackHeap = std::make_unique<ReadbackHeap>(m_d3d12Device.Get(), m_context);
 
         // CPU Allocator used to create buffers for the MemcpyFromHost, Shape and Size operators.
-        m_cpuInputAllocator = std::make_shared<CPUAllocator>(OrtMemType::OrtMemTypeCPUInput);
-        m_cpuOutputAllocator = std::make_shared<CPUAllocator>(OrtMemType::OrtMemTypeCPUOutput);
+        m_cpuInputAllocator = std::make_shared<DmlCpuAllocator>(OrtMemType::OrtMemTypeCPUInput);
+        m_cpuOutputAllocator = std::make_shared<DmlCpuAllocator>(OrtMemType::OrtMemTypeCPUOutput);
 
         CreateDmlKernelRegistry(&m_kernelRegistry, &m_internalRegInfoMap);
     }
@@ -341,10 +349,8 @@ namespace Dml
                 if (tensor)
                 {
                     assert(tensor->IsDataInterface());
-                    const AllocationInfo* allocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(tensor).GetDataInterface().Get());
-                    ID3D12Resource* resource = allocInfo->GetUavResource();
-                    D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc();
-                    bufferBindings.push_back({ resource, 0, resourceDesc.Width });
+                    auto bufferRegion = GetBufferForTensor(tensor);
+                    bufferBindings.push_back(bufferRegion.GetBufferBinding());
                     bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() });
                 }
                 else
@@ -431,46 +437,61 @@ namespace Dml
             //
             // CPU -> GPU copy (upload)
             //
-            const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get());
+            auto dstBufferRegion = GetBufferForTensor(dst);
 
-            ID3D12Resource* dstData = dstAllocInfo->GetCopyDstResource();
-            const auto dstState = dstAllocInfo->GetDefaultCopyDstState();
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? dstBufferRegion.ResourceInUavState()
+                : dstBufferRegion.ResourceInCopyDstState();
 
-            const void* srcData = src->GetData();
+            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_DEST;
 
-            const uint64_t dstOffset = 0;
-            m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(srcData, dataSizeInBytes));
+            const uint64_t dstOffset = dstBufferRegion.Offset();
+            m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes));
         }
         else if (!src->IsCpuData() && dst->IsCpuData())
         {
             //
             // GPU -> CPU copy (readback)
             //
+            auto srcBufferRegion = GetBufferForTensor(src);
 
-            void* dstData = dst->GetData();
-            const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get());
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
 
-            ID3D12Resource* srcData = srcAllocInfo->GetCopySrcResource();
-            const auto srcState = srcAllocInfo->GetDefaultCopySrcState();
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
 
-            const uint64_t srcOffset = 0;
-
-            // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-            m_readbackHeap->ReadbackFromGpu(AsByteSpan(dstData, dataSizeInBytes), srcData, srcOffset, srcState);
+            const uint64_t srcOffset = srcBufferRegion.Offset();
+            m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState);
         }
         else if (!src->IsCpuData() && !dst->IsCpuData())
         {
             //
             // GPU -> GPU copy
             //
-            const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(src).GetDataInterface().Get());
-            const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(dst).GetDataInterface().Get());
+            auto srcBufferRegion = GetBufferForTensor(src);
+
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
+
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+
+            auto dstBufferRegion = GetBufferForTensor(dst);
 
-            ID3D12Resource* srcData = srcAllocInfo->GetCopySrcResource();
-            const auto srcState = srcAllocInfo->GetDefaultCopySrcState();
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? dstBufferRegion.ResourceInUavState()
+                : dstBufferRegion.ResourceInCopyDstState();
 
-            ID3D12Resource* dstData = dstAllocInfo->GetCopyDstResource();
-            const auto dstState = dstAllocInfo->GetDefaultCopyDstState();
+            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_DEST;
-
-            m_context->CopyBufferRegion(dstData, 0, dstState, srcData, 0, srcState, dataSizeInBytes);
+            // Copy at each region's own offset, as the upload and readback paths do, instead of offset 0 of the resource.
+            m_context->CopyBufferRegion(dstData, dstBufferRegion.Offset(), dstState, srcData, srcBufferRegion.Offset(), srcState, dataSizeInBytes);
         }
@@ -495,9 +516,8 @@ namespace Dml
         auto mlTensor = MLOperatorTensor(dst).GetDataInterface();
         if (mlTensor != nullptr)
         {
-            const AllocationInfo* dstAllocInfo = m_allocator->DecodeDataHandle(mlTensor.Get());
-            ID3D12Resource* dstData = dstAllocInfo->GetUavResource();
-            m_context->FillBufferWithPattern(dstData, rawValue);
+            auto dstBufferRegion = GetBufferForTensor(dst);
+            m_context->FillBufferWithPattern(dstBufferRegion.ResourceInUavState(), dstBufferRegion.Offset(), rawValue);
         }
 
         return S_OK;
@@ -747,6 +767,9 @@ namespace Dml
         std::vector<D3D12_RESOURCE_STATES> srcStates;
         srcStates.reserve(src_dst_pairs.size());
 
+        std::vector<uint64_t> srcOffsets;
+        srcOffsets.reserve(src_dst_pairs.size());
+
         std::vector<void*> dstDatas;
         dstDatas.reserve(src_dst_pairs.size());
 
@@ -790,19 +813,24 @@ namespace Dml
             ORT_THROW_HR_IF(E_INVALIDARG, dataSizesInBytes[i] != ComputeByteSizeFromTensor(srcWrapper)); // Tensors must be the same size
 
             dstDatas.push_back(dstWrapper.GetData());
-            const AllocationInfo* srcAllocInfo = m_allocator->DecodeDataHandle(MLOperatorTensor(&srcWrapper).GetDataInterface().Get());
 
-            auto srcData = srcAllocInfo->GetCopySrcResource();
-            auto srcState = srcAllocInfo->GetDefaultCopySrcState();
+            auto srcBufferRegion = GetBufferForTensor(&srcWrapper);
+
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
+
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
 
             srcDatas.push_back(srcData);
             srcStates.push_back(srcState);
+            srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
-        const uint64_t srcOffset = 0;
-
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcStates);
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
 
         return onnxruntime::common::Status::OK();
     }
@@ -815,7 +843,7 @@ namespace Dml
 
     void ExecutionProviderImpl::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
     {
-        m_allocator->SetDefaultRoundingMode(roundingMode);
+        m_subAllocator->SetDefaultRoundingMode(roundingMode);
     }
 
     void ExecutionProviderImpl::ReleaseCompletedReferences()
@@ -840,36 +868,22 @@ namespace Dml
         data->AddRef();
     }
 
-    void ExecutionProviderImpl::GetABIDataInterface(
-        bool isInternalOperator,
-        IUnknown* data,
-        IUnknown** abiData) const
+    void ExecutionProviderImpl::GetABIDataInterface(void* data, IUnknown** abiData) const
     {
         assert(!m_closed);
+        *abiData = m_subAllocator->GetAllocationInfo(data)->GetUavResource();
+    }
 
-        if (isInternalOperator)
-        {
-            *abiData = data;
-            data->AddRef();
-        }
-        else
-        {
-#ifdef _GAMING_XBOX
-            ComPtr<GraphicsUnknownWrapper> wrappedResource = Microsoft::WRL::Make<GraphicsUnknownWrapper>(m_allocator->DecodeDataHandle(data)->GetUavResource());
-            *abiData = wrappedResource.Detach();
-#else
-            ComPtr<ID3D12Resource> resource = m_allocator->DecodeDataHandle(data)->GetUavResource();
-            *abiData = resource.Detach();
-#endif
-        }
+    void ExecutionProviderImpl::GetManagedBufferRegion(void* data, uint64_t size, DmlManagedBufferRegion** abiData) const
+    {
+        auto managedBufferRegion = m_subAllocator->CreateManagedBufferRegion(data, size);
+        ORT_THROW_IF_FAILED(managedBufferRegion.CopyTo(abiData));
     }
 
-    uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(
-        IUnknown* data,
-        bool isInternalOperator)
+    uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(void* data, bool isInternalOperator)
     {
         assert(!isInternalOperator);
-        return m_allocator->DecodeDataHandle(data)->GetPooledResourceId();
+        return m_subAllocator->GetAllocationInfo(data)->GetPooledResourceId();
     }
 
     void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState(
@@ -957,7 +971,7 @@ namespace Dml
 
     std::shared_ptr<onnxruntime::IAllocator> ExecutionProviderImpl::GetGpuAllocator()
     {
-        return m_allocator;
+        return m_bfcAllocator;
     }
 
     std::shared_ptr<onnxruntime::IAllocator> ExecutionProviderImpl::GetCpuInputAllocator()
@@ -994,8 +1008,8 @@ namespace Dml
 
     ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr)
     {
-        Dml::BucketizedBufferAllocator* pAllocationInfo = static_cast<Dml::BucketizedBufferAllocator*>(allocator);
-        return pAllocationInfo->DecodeDataHandle(ptr)->GetUavResource();
+        Dml::DmlGpuAllocator* pAllocationInfo = static_cast<Dml::DmlGpuAllocator*>(allocator);
+        return pAllocationInfo->GetSubAllocator()->GetAllocationInfo(ptr)->GetUavResource();
     }
 
     void FlushContext(onnxruntime::IExecutionProvider* provider)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index 048230f12723a..22a9aed5dfd48 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -5,6 +5,7 @@
 
 #include "GraphTransformer.h"
 #include "core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h"
+#include "DmlBufferRegion.h"
 
 #include <wrl/client.h>
 #include <wrl/implements.h>
@@ -23,8 +24,10 @@ namespace Dml
     class ReadbackHeap;
     class ExecutionContext;
     class BucketizedBufferAllocator;
-    class CPUAllocator;
+    class DmlCpuAllocator;
     class ExecutionProvider;
+    class DmlManagedBufferRegion;
+    class DmlGpuAllocator;
 
     class ExecutionProviderImpl : public WRL::Base<Dml::IExecutionProvider,
                                   Windows::AI::MachineLearning::Adapter::IWinmlExecutionProvider>
@@ -100,13 +103,15 @@ namespace Dml
             IUnknown** dataCopy) const override;
 
         void GetABIDataInterface(
-            bool isInternalOperator,
-            IUnknown* data,
+            void* data,
             IUnknown** abiData) const override;
 
-       uint64_t TryGetPooledAllocationId(
-            IUnknown* data,
-            bool isInternalOperator) override;
+        void GetManagedBufferRegion(
+            void* data,
+            uint64_t size,
+            DmlManagedBufferRegion** abiData) const;
+
+       uint64_t TryGetPooledAllocationId(void* data, bool isInternalOperator) override;
 
         void GetABIExecutionInterfaceAndInvalidateState(
             bool isInternalOperator,
@@ -136,12 +141,10 @@ namespace Dml
         // Allocate a resource from pools.  Releasing pooledResource returns it to the pool.
         STDMETHOD(AllocatePooledResource)(
             size_t size,
-            AllocatorRoundingMode roundingMode,
-            ID3D12Resource **d3dResource,
-            IUnknown* *pooledResource
+            DmlManagedBufferRegion** managedBufferRegion
         ) const noexcept final;
 
-        STDMETHOD_(ID3D12Resource*, DecodeResource)(void* allocation) const noexcept final;
+        STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept final;
 
         std::shared_ptr<onnxruntime::KernelRegistry> GetKernelRegistry() const
         {
@@ -179,6 +182,8 @@ namespace Dml
             uint32_t supportedDeviceDataTypeMask // Each bit corresponds to each DML_TENSOR_DATA_TYPE.
         ) const;
 
+        D3D12BufferRegion GetBufferForTensor(IMLOperatorTensor* tensor) const;
+
         ComPtr<ID3D12Device> m_d3d12Device;
         ComPtr<IDMLDevice> m_dmlDevice;
         bool m_isMcdmDevice = false;
@@ -186,9 +191,11 @@ namespace Dml
         std::shared_ptr<ExecutionContext> m_context;
         std::unique_ptr<PooledUploadHeap> m_uploadHeap;
         std::unique_ptr<ReadbackHeap> m_readbackHeap;
-        std::shared_ptr<BucketizedBufferAllocator> m_allocator;
-        std::shared_ptr<CPUAllocator> m_cpuInputAllocator;
-        std::shared_ptr<CPUAllocator> m_cpuOutputAllocator;
+        std::shared_ptr<onnxruntime::IAllocator> m_bfcAllocator;
+        std::shared_ptr<BucketizedBufferAllocator> m_subAllocator;
+        std::shared_ptr<DmlGpuAllocator> m_gpuAllocator;
+        std::shared_ptr<DmlCpuAllocator> m_cpuInputAllocator;
+        std::shared_ptr<DmlCpuAllocator> m_cpuOutputAllocator;
         std::shared_ptr<onnxruntime::KernelRegistry> m_kernelRegistry;
         std::shared_ptr<const Windows::AI::MachineLearning::Adapter::InternalRegistrationInfoMap> m_internalRegInfoMap;
         mutable uint64_t m_partitionKernelPrefixVal = 0;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index e809a20cc0f4b..8ff33debe2474 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -6,6 +6,7 @@
 #include "MLOperatorAuthorImpl.h"
 #include "FusedGraphKernel.h"
 #include "DmlGraphFusionHelper.h"
+#include "DmlManagedBufferRegion.h"
 
 using namespace Windows::AI::MachineLearning::Adapter;
 
@@ -63,13 +64,14 @@ namespace Dml
             UINT64 persistentResourceSize = m_compiledExecutionPlanOperator->GetBindingProperties().PersistentResourceSize;
             if (persistentResourceSize > 0)
             {
+                ComPtr<DmlManagedBufferRegion> managedBufferRegion;
                 ORT_THROW_IF_FAILED(m_provider->AllocatePooledResource(
                     static_cast<size_t>(persistentResourceSize),
-                    AllocatorRoundingMode::Disabled,
-                    m_persistentResource.GetAddressOf(),
-                    m_persistentResourceAllocatorUnk.GetAddressOf()));
+                    managedBufferRegion.GetAddressOf()));
 
-                m_persistentResourceBinding = DML_BUFFER_BINDING { m_persistentResource.Get(), 0, persistentResourceSize };
+                managedBufferRegion.As(&m_persistentResourceAllocatorUnk);
+                m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState();
+                m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding();
             }
 
             ORT_THROW_IF_FAILED(m_provider->InitializeOperator(
@@ -128,7 +130,7 @@ namespace Dml
                     else if (!m_isInputsUploadedByDmlEP[i])
                     {
                         ORT_THROW_IF_FAILED(contextWrapper.GetInputTensor(i, inputTensors[i].GetAddressOf()));
-                        inputPtrs[i] = m_provider->DecodeResource(MLOperatorTensor(inputTensors[i].Get()).GetDataInterface().Get());
+                        inputPtrs[i] = m_provider->DecodeResource(inputTensors[i].Get());
                     }
                 }
 
@@ -166,7 +168,7 @@ namespace Dml
                     if (tensor)
                     {
                         assert(tensor->IsDataInterface());
-                        ID3D12Resource* resource = m_provider->DecodeResource(MLOperatorTensor(tensor).GetDataInterface().Get());
+                        ID3D12Resource* resource = m_provider->DecodeResource(tensor);
                         D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc();
                         bufferBindings.push_back({ resource, 0, resourceDesc.Width });
                         bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() });
@@ -363,13 +365,11 @@ namespace Dml
                 uint64_t tempAllocId = 0;
                 ORT_THROW_IF_FAILED(contextWrapper.AllocateTemporaryData(static_cast<size_t>(execBindingProps.TemporaryResourceSize), tempAlloc.GetAddressOf(), &tempAllocId));
 
-                ComPtr<IUnknown> tempResourceUnk;
-                m_winmlProvider->GetABIDataInterface(false, tempAlloc.Get(), &tempResourceUnk);
+                ComPtr<DmlManagedBufferRegion> managedBufferRegion;
+                m_winmlProvider->GetManagedBufferRegion(tempAlloc.Get(), execBindingProps.TemporaryResourceSize, &managedBufferRegion);
 
                 // Bind the temporary resource.
-                ComPtr<ID3D12Resource> tempResource;
-                ORT_THROW_IF_FAILED(tempResourceUnk->QueryInterface(tempResource.GetAddressOf()));
-                DML_BUFFER_BINDING tempBufferBinding = {tempResource.Get(), 0, execBindingProps.TemporaryResourceSize};
+                DML_BUFFER_BINDING tempBufferBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding();
                 DML_BINDING_DESC tempBindingDesc = { DML_BINDING_TYPE_BUFFER, &tempBufferBinding };
 
                 if (!tempAllocId || m_tempBindingAllocId != tempAllocId)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h
index b4baf62ab73f5..4bef0652763a9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h
@@ -6,6 +6,8 @@
 
 namespace Dml
 {
+    class DmlManagedBufferRegion;
+
     struct Binding
     {
         // Non-null if required at the stage where it is used, i.e. Initialization
@@ -37,7 +39,7 @@ namespace Dml
             _In_opt_ const DML_BUFFER_BINDING* persistentResourceBinding,
             gsl::span<const DML_BUFFER_BINDING> inputTensors
             ) const noexcept = 0;
-        
+
         STDMETHOD(ExecuteOperator)(
             IDMLCompiledOperator* op,
             _In_opt_ const DML_BUFFER_BINDING* persistentResourceBinding,
@@ -64,8 +66,8 @@ namespace Dml
         STDMETHOD_(D3D12_COMMAND_LIST_TYPE, GetCommandListTypeForQueue)() const noexcept = 0;
         STDMETHOD_(void, Flush)() const noexcept = 0;
 
-        STDMETHOD_(ID3D12Resource*, DecodeResource)(void* allocation) const noexcept = 0;
-        STDMETHOD(AllocatePooledResource(size_t size, AllocatorRoundingMode roundingMode, ID3D12Resource **d3dResource, IUnknown* *pooledResource)) const noexcept = 0;
+        STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept = 0;
+        STDMETHOD(AllocatePooledResource(size_t size, DmlManagedBufferRegion** pooledResource)) const noexcept = 0;
 
         STDMETHOD_(bool, IsMcdmDevice)() const noexcept = 0;
         STDMETHOD_(bool, MetacommandsEnabled)() const noexcept = 0;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 197c62283fba9..0e63f2c5be0f9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -104,20 +104,11 @@ namespace Windows::AI::MachineLearning::Adapter
     // kernels are registered.
     void TranslateAllocationDataToAbi(
         IWinmlExecutionProvider* winmlProvider,
-        bool isInternalOperator,
         const ::OrtMemoryInfo& allocInfo,
-        IUnknown* allocation,
+        void* opaqueData,
         IUnknown** abiAllocation)
     {
-        if (winmlProvider)
-        {
-            winmlProvider->GetABIDataInterface(isInternalOperator, allocation, abiAllocation);
-        }
-        else
-        {
-            ComPtr<IUnknown> tmp = allocation;
-            *abiAllocation = tmp.Detach();
-        }
+        winmlProvider->GetABIDataInterface(opaqueData, abiAllocation);
     }
 
     //
@@ -1143,7 +1134,7 @@ namespace Windows::AI::MachineLearning::Adapter
             if (operatorGraphDesc->nodesAsOpDesc)
             {
                 m_graphNodeCreateInfo->nodesAsOperatorDesc = std::vector<std::unique_ptr<AbstractOperatorDesc>>();
-                for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) 
+                for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++)
                 {
                     auto* node = operatorGraphDesc->nodesAsOpDesc[nodeIndex];
                     assert(node != nullptr);
@@ -1154,7 +1145,7 @@ namespace Windows::AI::MachineLearning::Adapter
             else
             {
                 m_graphNodeCreateInfo->nodesAsIDMLOperator = std::vector<Microsoft::WRL::ComPtr<IDMLOperator>>();
-                for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) 
+                for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++)
                 {
                     auto* node = operatorGraphDesc->nodesAsIDMLOperator[nodeIndex];
                     assert(node != nullptr);
@@ -1301,7 +1292,6 @@ namespace Windows::AI::MachineLearning::Adapter
                         // kernels (i.e. ID3D12Resource, versus something that tracks the layout).
                         TranslateAllocationDataToAbi(
                             m_winmlExecutionProvider.Get(),
-                            m_internalOperator,
                             m_impl->Location(),
                             m_dataInterfaceOrShadowCopy ? m_dataInterfaceOrShadowCopy.Get() : m_dataInterface.Get(),
                             m_abiDataInterface.GetAddressOf());
@@ -1667,7 +1657,7 @@ namespace Windows::AI::MachineLearning::Adapter
 
             *allocId = m_winmlProvider->TryGetPooledAllocationId(allocation.Get(), 0);
 
-            TranslateAllocationDataToAbi(m_winmlProvider.Get(), m_internalOperator, alloc->Info(), allocation.Get(), abiAllocation);
+            TranslateAllocationDataToAbi(m_winmlProvider.Get(), alloc->Info(), allocation.Get(), abiAllocation);
 
             if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
             {
@@ -2307,7 +2297,7 @@ namespace Windows::AI::MachineLearning::Adapter
     }
 
     std::tuple<std::unique_ptr<std::byte[]>, size_t> UnpackTensor(
-        const onnx::TensorProto& initializer, 
+        const onnx::TensorProto& initializer,
         const onnxruntime::Path& modelPath)
     {
         std::unique_ptr<std::byte[]> unpackedTensor;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp
index 3ae29629efbcd..2d99c8a6dd6df 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp
@@ -3,6 +3,7 @@
 
 #include "precomp.h"
 #include "DmlOperator.h"
+#include "../DmlManagedBufferRegion.h"
 
 namespace Dml
 {
@@ -93,13 +94,14 @@ namespace Dml
             UINT64 persistentResourceSize = m_compiledOperator->GetBindingProperties().PersistentResourceSize;
             if (persistentResourceSize > 0)
             {
+                ComPtr<DmlManagedBufferRegion> managedBufferRegion;
                 ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource(
                     static_cast<size_t>(persistentResourceSize),
-                    AllocatorRoundingMode::Enabled,
-                    m_persistentResource.GetAddressOf(),
-                    m_persistentResourcePoolingUnk.GetAddressOf()));
+                    managedBufferRegion.GetAddressOf()));
 
-                m_persistentResourceBinding = DML_BUFFER_BINDING{ m_persistentResource.Get(), 0, persistentResourceSize };
+                managedBufferRegion.As(&m_persistentResourcePoolingUnk);
+                m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState();
+                m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding();
             }
 
             std::vector<DML_BUFFER_BINDING> initializationInputBindings(m_kernelInputIndices.size());
@@ -192,13 +194,14 @@ namespace Dml
             UINT64 persistentResourceSize = m_compiledOperator->GetBindingProperties().PersistentResourceSize;
             if (persistentResourceSize > 0)
             {
+                ComPtr<DmlManagedBufferRegion> managedBufferRegion;
                 ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource(
                     static_cast<size_t>(persistentResourceSize),
-                    AllocatorRoundingMode::Enabled,
-                    m_persistentResource.GetAddressOf(),
-                    m_persistentResourcePoolingUnk.GetAddressOf()));
+                    managedBufferRegion.GetAddressOf()));
 
-                m_persistentResourceBinding = DML_BUFFER_BINDING{ m_persistentResource.Get(), 0, persistentResourceSize };
+                managedBufferRegion.As(&m_persistentResourcePoolingUnk);
+                m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState();
+                m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding();
             }
 
             std::vector<DML_BUFFER_BINDING> initializationInputBindings(m_kernelInputIndices.size());
@@ -229,14 +232,16 @@ namespace Dml
             if (!m_persistentResource || m_persistentResource->GetDesc().Width < persistentResourceSize)
             {
                 m_persistentResource = nullptr;
+
+                ComPtr<DmlManagedBufferRegion> managedBufferRegion;
                 ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource(
                     static_cast<size_t>(persistentResourceSize),
-                    AllocatorRoundingMode::Enabled,
-                    m_persistentResource.GetAddressOf(),
-                    m_persistentResourcePoolingUnk.GetAddressOf()));
-            }
+                    managedBufferRegion.GetAddressOf()));
 
-            m_persistentResourceBinding = DML_BUFFER_BINDING{ m_persistentResource.Get(), 0, persistentResourceSize };
+                managedBufferRegion.As(&m_persistentResourcePoolingUnk);
+                m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState();
+                m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding();
+            }
         }
 
         ORT_THROW_IF_FAILED(m_executionProvider->InitializeOperator(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
index 590dffef488e4..a91886c3b5863 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
@@ -105,6 +105,7 @@ namespace Dml
         gsl::span<void*> dst,
         gsl::span<const uint32_t > dstSizes,
         gsl::span<ID3D12Resource*> src,
+        gsl::span<uint64_t> srcOffsets,
         gsl::span<const D3D12_RESOURCE_STATES> srcStates)
     {
         assert(dst.size() == src.size());
@@ -132,7 +133,7 @@ namespace Dml
                 offset,
                 D3D12_RESOURCE_STATE_COPY_DEST,
                 src[i],
-                0,
+                srcOffsets[i],
                 srcStates[i],
                 dstSizes[i]);
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
index 9727dc6ac8752..f888f0a55ac48 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
@@ -27,6 +27,7 @@ namespace Dml
             gsl::span<void*> dst,
             gsl::span<const uint32_t > dstSizes,
             gsl::span<ID3D12Resource*> src,
+            gsl::span<uint64_t> srcOffsets,
             gsl::span<const D3D12_RESOURCE_STATES> srcStates);
 
     private:
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h
index d79b2fb4e7c2a..04b30f75b340e 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h
@@ -455,15 +455,11 @@ class MLOperatorTensor
     // needing to agnostically copy memory.
     const void* GetByteData() const
     {
-        ML_CHECK_BOOL(!IsDataInterface());
-
         return m_impl->GetData();
     }
 
     void* GetByteData()
     {
-        ML_CHECK_BOOL(!IsDataInterface());
-
         return m_impl->GetData();
     }
 

From 76328becb9e23d528c813553f2b9baa02c523995 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 24 Jan 2023 18:20:02 -0800
Subject: [PATCH 09/76] WIP: remove GetShadowCopyIfRequired; AddRef the UAV resource returned by GetABIDataInterface

---
 .../inc/IWinmlExecutionProvider.h             |  5 ---
 .../src/BucketizedBufferAllocator.cpp         | 12 +++---
 .../src/BucketizedBufferAllocator.h           |  7 ++--
 .../src/ExecutionProvider.cpp                 | 16 ++------
 .../src/ExecutionProvider.h                   |  7 +---
 .../src/MLOperatorAuthorImpl.cpp              | 40 +++++--------------
 .../src/MLOperatorAuthorImpl.h                |  2 -
 7 files changed, 26 insertions(+), 63 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
index 52f5a104b0379..a56f03e50a9e1 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
@@ -39,11 +39,6 @@ namespace Windows::AI::MachineLearning::Adapter
         // the provider's underlying queues.
         virtual void QueueReference(IUnknown *object) = 0;
 
-        virtual void GetShadowCopyIfRequired(
-            bool isInternalOperator,
-            IUnknown* data,
-            IUnknown** dataCopy) const = 0;
-
         virtual void GetABIDataInterface(
             void* data,
             IUnknown** abiData) const = 0;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index df12c1567d5be..66405dd5d2989 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -72,12 +72,14 @@ namespace Dml
 
     BucketizedBufferAllocator::BucketizedBufferAllocator(
         ID3D12Device* device,
+        std::shared_ptr<ExecutionContext> context,
         ID3D12CommandQueue* queue,
         const D3D12_HEAP_PROPERTIES& heap_props,
         D3D12_HEAP_FLAGS heap_flags,
         D3D12_RESOURCE_FLAGS resource_flags,
         D3D12_RESOURCE_STATES initial_state)
-        : device_(device),
+        : m_device(device),
+        m_context(context),
         queue_(queue),
         heap_properties_(heap_props),
         heap_flags_(heap_flags),
@@ -113,7 +115,7 @@ namespace Dml
 
         for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++)
         {
-            HRESULT create_resource_hr = device_->CreateReservedResource(
+            HRESULT create_resource_hr = m_device->CreateReservedResource(
                 &resource_desc,
                 states[i],
                 nullptr,
@@ -150,7 +152,7 @@ namespace Dml
                 heap_flags_);
 
             HRESULT create_heap_hr =
-                device_->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i]));
+                m_device->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i]));
             if (create_heap_hr == E_OUTOFMEMORY)
             {
                 return absl::nullopt;
@@ -215,7 +217,7 @@ namespace Dml
         allocation.heaps.resize(1);
         D3D12_HEAP_DESC heap_desc =
             CD3DX12_HEAP_DESC(size_in_bytes, heap_properties_, 0, heap_flags_);
-        HRESULT create_heap_hr = device_->CreateHeap(
+        HRESULT create_heap_hr = m_device->CreateHeap(
             &heap_desc,
             IID_PPV_ARGS(&allocation.heaps.front()));
         if (create_heap_hr == E_OUTOFMEMORY)
@@ -238,7 +240,7 @@ namespace Dml
 
         for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++)
         {
-            HRESULT create_resource_hr = device_->CreatePlacedResource(
+            HRESULT create_resource_hr = m_device->CreatePlacedResource(
                 allocation.heaps.front().Get(),
                 0,
                 &resource_desc,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index f21d174500fcb..b28bdba544766 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -45,6 +45,7 @@ namespace Dml
 
         BucketizedBufferAllocator(
             ID3D12Device* device,
+            std::shared_ptr<ExecutionContext> context,
             ID3D12CommandQueue* queue,
             const D3D12_HEAP_PROPERTIES& heap_props,
             D3D12_HEAP_FLAGS heap_flags,
@@ -107,13 +108,10 @@ namespace Dml
         friend class AllocationInfo;
         void FreeResource(void* p, uint64_t resourceId);
 
-        ComPtr<ID3D12Device> m_device;
-
         std::vector<Bucket> m_pool;
         size_t m_currentAllocationId = 0;
         uint64_t m_currentResourceId = 0;
         AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled;
-        std::shared_ptr<ExecutionContext> m_context;
         std::unique_ptr<BucketizedBufferAllocator> m_subAllocator;
 
     #if _DEBUG
@@ -123,7 +121,8 @@ namespace Dml
 
         std::mutex mutex_;
 
-        Microsoft::WRL::ComPtr<ID3D12Device> device_;
+        Microsoft::WRL::ComPtr<ID3D12Device> m_device;
+        std::shared_ptr<ExecutionContext> m_context;
         Microsoft::WRL::ComPtr<ID3D12CommandQueue> queue_;
         const D3D12_HEAP_PROPERTIES heap_properties_;
         const D3D12_HEAP_FLAGS heap_flags_;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index ca9080e4fe665..67027e64c5a7b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -184,6 +184,7 @@ namespace Dml
 
         m_subAllocator = std::make_shared<BucketizedBufferAllocator>(
             m_d3d12Device.Get(),
+            m_context,
             queue,
             CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
             D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS,
@@ -857,21 +858,12 @@ namespace Dml
         m_context->QueueReference(object);
     }
 
-    void ExecutionProviderImpl::GetShadowCopyIfRequired(
-        bool isInternalOperator,
-        IUnknown* data,
-        IUnknown** dataCopy) const
-    {
-        assert(!m_closed);
-
-        *dataCopy = data;
-        data->AddRef();
-    }
-
     void ExecutionProviderImpl::GetABIDataInterface(void* data, IUnknown** abiData) const
     {
         assert(!m_closed);
-        *abiData = m_subAllocator->GetAllocationInfo(data)->GetUavResource();
+        auto uavResource = m_subAllocator->GetAllocationInfo(data)->GetUavResource();
+        uavResource->AddRef();
+        *abiData = uavResource;
     }
 
     void ExecutionProviderImpl::GetManagedBufferRegion(void* data, uint64_t size, DmlManagedBufferRegion** abiData) const
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index 22a9aed5dfd48..eec8f08848833 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -97,11 +97,6 @@ namespace Dml
         // IWinmlExecutionProvider methods
         void QueueReference(IUnknown* object) override;
 
-        void GetShadowCopyIfRequired(
-            bool isInternalOperator,
-            IUnknown* data,
-            IUnknown** dataCopy) const override;
-
         void GetABIDataInterface(
             void* data,
             IUnknown** abiData) const override;
@@ -191,8 +186,8 @@ namespace Dml
         std::shared_ptr<ExecutionContext> m_context;
         std::unique_ptr<PooledUploadHeap> m_uploadHeap;
         std::unique_ptr<ReadbackHeap> m_readbackHeap;
-        std::shared_ptr<onnxruntime::IAllocator> m_bfcAllocator;
         std::shared_ptr<BucketizedBufferAllocator> m_subAllocator;
+        std::shared_ptr<onnxruntime::IAllocator> m_bfcAllocator;
         std::shared_ptr<DmlGpuAllocator> m_gpuAllocator;
         std::shared_ptr<DmlCpuAllocator> m_cpuInputAllocator;
         std::shared_ptr<DmlCpuAllocator> m_cpuOutputAllocator;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 0e63f2c5be0f9..d601a2b3b4025 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1273,39 +1273,21 @@ namespace Windows::AI::MachineLearning::Adapter
     {
         if (impl)
         {
+            m_tensorData = m_impl->MutableDataRaw();
+
             if (isDataInterface)
             {
-                // We assume that all data handles derive from IUnknown as their first base.
-                m_dataInterface = static_cast<IUnknown*>(m_impl->MutableDataRaw());
-
-                if (m_dataInterface)
+                if (m_tensorData)
                 {
-                    if (m_winmlExecutionProvider)
-                    {
-                        // The resource may require conversion to the layout expected according to the kernel options.
-                        // This will return either the original object or a shadow copy which uses a different layout.
-                        // This pattern assumes that Lotus is not re-using tensor allocations, so each output is
-                        // a fresh allocation which will not trigger a conversion in the provider.
-                        m_winmlExecutionProvider->GetShadowCopyIfRequired(m_internalOperator, m_dataInterface.Get(), m_dataInterfaceOrShadowCopy.GetAddressOf());
-
-                        // Get the actual object to be returned from the ABI, which varies for internal and external
-                        // kernels (i.e. ID3D12Resource, versus something that tracks the layout).
-                        TranslateAllocationDataToAbi(
-                            m_winmlExecutionProvider.Get(),
-                            m_impl->Location(),
-                            m_dataInterfaceOrShadowCopy ? m_dataInterfaceOrShadowCopy.Get() : m_dataInterface.Get(),
-                            m_abiDataInterface.GetAddressOf());
-                    }
-                    else
-                    {
-                        m_abiDataInterface = m_dataInterface;
-                    }
+                    // Get the actual object to be returned from the ABI, which varies for internal and external
+                    // kernels (i.e. ID3D12Resource, versus something that tracks the layout).
+                    TranslateAllocationDataToAbi(
+                        m_winmlExecutionProvider.Get(),
+                        m_impl->Location(),
+                        m_tensorData,
+                        m_abiDataInterface.GetAddressOf());
                 }
             }
-            else
-            {
-                m_tensorData = m_impl->MutableDataRaw();
-            }
         }
     }
 
@@ -1383,7 +1365,7 @@ namespace Windows::AI::MachineLearning::Adapter
             return nullptr;
         }
 
-        return m_isDataInterface ? nullptr : m_tensorData;
+        return m_tensorData;
     }
 
     void STDMETHODCALLTYPE TensorWrapper::GetDataInterface(IUnknown** dataInterface) noexcept
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index dd1b743587ab5..7e308989791f8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -283,12 +283,10 @@ class TensorWrapper : public WRL::Base<IMLOperatorTensor>, public Closable
     bool m_internalOperator = false;
 
     void* m_tensorData = nullptr;
-    ComPtr<IUnknown> m_dataInterface;
     bool m_isDataInterface = false;
 
     // The returned data may be a converted shadow copy, and the piece of it which
     // is returned may vary according to kernel registration options.
-    ComPtr<IUnknown> m_dataInterfaceOrShadowCopy;
     ComPtr<IUnknown> m_abiDataInterface;
 
 };

From 7bd0983b7b9e8ae174376638cb2e233d4a372581 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 24 Jan 2023 18:26:54 -0800
Subject: [PATCH 10/76] Pass m_gpuAllocator instead of m_bfcAllocator to ExecutionContext::SetAllocator

---
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 67027e64c5a7b..3a28ea6f7b47b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -202,7 +202,7 @@ namespace Dml
         // Wrap the BFC allocator into our own allocator
         m_gpuAllocator = std::make_shared<DmlGpuAllocator>(m_bfcAllocator.get(), m_subAllocator.get());
 
-        m_context->SetAllocator(m_bfcAllocator);
+        m_context->SetAllocator(m_gpuAllocator);
 
         m_uploadHeap = std::make_unique<PooledUploadHeap>(m_d3d12Device.Get(), m_context);
         m_readbackHeap = std::make_unique<ReadbackHeap>(m_d3d12Device.Get(), m_context);

From 0c35fc2f5df93b9ff196d5b6b82ca35a99d5b9c9 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 24 Jan 2023 18:51:19 -0800
Subject: [PATCH 11/76] Route sub-allocator calls in ExecutionProviderImpl through DmlGpuAllocator

---
 .../DmlExecutionProvider/src/DmlGpuAllocator.h  | 17 ++++++++++-------
 .../src/ExecutionProvider.cpp                   | 14 +++++++-------
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
index 554a4dca8e550..1d4b35506afcb 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
@@ -11,7 +11,7 @@ namespace Dml
     class DmlGpuAllocator : public onnxruntime::IAllocator
     {
     public:
-        DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, BucketizedBufferAllocator* subAllocator)
+        DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<BucketizedBufferAllocator> subAllocator)
         : onnxruntime::IAllocator(
             OrtMemoryInfo(
                 "DML",
@@ -20,12 +20,15 @@ namespace Dml
             )
         ),
         m_bfcAllocator(bfcAllocator),
-        m_subAllocator(subAllocator) {}
+        m_subAllocator(std::move(subAllocator)) {}
 
-        void* Alloc(size_t size_in_bytes) { return m_bfcAllocator->Alloc(size_in_bytes); }
-        void Free(void* ptr) { m_bfcAllocator->Free(ptr); }
-
-        BucketizedBufferAllocator* GetSubAllocator() const { return m_subAllocator; }
+        void* Alloc(size_t size_in_bytes) final { return m_bfcAllocator->Alloc(size_in_bytes); }
+        void Free(void* ptr) final { m_bfcAllocator->Free(ptr); }
+        D3D12BufferRegion CreateBufferRegion(const void* ptr, uint64_t size_in_bytes) { return m_subAllocator->CreateBufferRegion(ptr, size_in_bytes); }
+        ComPtr<DmlManagedBufferRegion> CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes) { return m_subAllocator->CreateManagedBufferRegion(ptr, size_in_bytes); }
+        AllocationInfo* GetAllocationInfo(const void* ptr) { return m_subAllocator->GetAllocationInfo(ptr); }
+        void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { m_subAllocator->SetDefaultRoundingMode(roundingMode); }
+        BucketizedBufferAllocator* GetSubAllocator() const { return m_subAllocator.get(); }
 
     private:
         // This allocator is managed by ORT and should be used to allocate/free memory in order
@@ -34,6 +37,6 @@ namespace Dml
 
         // This allocator is specific to DML and is used to decode the opaque data returned by the BFC
         // allocator into objects that DML understands
-        BucketizedBufferAllocator* m_subAllocator;
+        std::shared_ptr<BucketizedBufferAllocator> m_subAllocator;
     };
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 3a28ea6f7b47b..85ffbddb84989 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -121,7 +121,7 @@ namespace Dml
         ORT_TRY
         {
         void* opaqueData = m_bfcAllocator->Alloc(size);
-        auto bufferRegion = m_subAllocator->CreateManagedBufferRegion(opaqueData, size);
+        auto bufferRegion = m_gpuAllocator->CreateManagedBufferRegion(opaqueData, size);
         bufferRegion.CopyTo(managedBufferRegion);
         return S_OK;
         }
@@ -133,7 +133,7 @@ namespace Dml
         MLOperatorTensor mlOperatorTensor(tensor);
         void* data = mlOperatorTensor.GetByteData();
         auto sizeInBytes = mlOperatorTensor.GetUnalignedTensorByteSize();
-        return m_subAllocator->CreateBufferRegion(data, sizeInBytes);
+        return m_gpuAllocator->CreateBufferRegion(data, sizeInBytes);
     }
 
     ID3D12Resource* __stdcall ExecutionProviderImpl::DecodeResource(IMLOperatorTensor* tensor) const noexcept
@@ -200,7 +200,7 @@ namespace Dml
         m_bfcAllocator = onnxruntime::CreateAllocator(memoryInfo);
 
         // Wrap the BFC allocator into our own allocator
-        m_gpuAllocator = std::make_shared<DmlGpuAllocator>(m_bfcAllocator.get(), m_subAllocator.get());
+        m_gpuAllocator = std::make_shared<DmlGpuAllocator>(m_bfcAllocator.get(), m_subAllocator);
 
         m_context->SetAllocator(m_gpuAllocator);
 
@@ -844,7 +844,7 @@ namespace Dml
 
     void ExecutionProviderImpl::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
     {
-        m_subAllocator->SetDefaultRoundingMode(roundingMode);
+        m_gpuAllocator->SetDefaultRoundingMode(roundingMode);
     }
 
     void ExecutionProviderImpl::ReleaseCompletedReferences()
@@ -861,21 +861,21 @@ namespace Dml
     void ExecutionProviderImpl::GetABIDataInterface(void* data, IUnknown** abiData) const
     {
         assert(!m_closed);
-        auto uavResource = m_subAllocator->GetAllocationInfo(data)->GetUavResource();
+        auto uavResource = m_gpuAllocator->GetAllocationInfo(data)->GetUavResource();
         uavResource->AddRef();
         *abiData = uavResource;
     }
 
     void ExecutionProviderImpl::GetManagedBufferRegion(void* data, uint64_t size, DmlManagedBufferRegion** abiData) const
     {
-        auto managedBufferRegion = m_subAllocator->CreateManagedBufferRegion(data, size);
+        auto managedBufferRegion = m_gpuAllocator->CreateManagedBufferRegion(data, size);
         ORT_THROW_IF_FAILED(managedBufferRegion.CopyTo(abiData));
     }
 
     uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(void* data, bool isInternalOperator)
     {
         assert(!isInternalOperator);
-        return m_subAllocator->GetAllocationInfo(data)->GetPooledResourceId();
+        return m_gpuAllocator->GetAllocationInfo(data)->GetPooledResourceId();
     }
 
     void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState(

From 43c47b99d87831af751e1a367a8ea13dc253f7f4 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 24 Jan 2023 21:27:04 -0800
Subject: [PATCH 12/76] Move DmlGpuAllocator definitions to a .cpp and use it in DmlCommandRecorder

---
 .../src/DmlCommandRecorder.cpp                | 11 ++--
 .../src/DmlCommandRecorder.h                  |  7 ++-
 .../src/DmlGpuAllocator.cpp                   | 53 +++++++++++++++++++
 .../src/DmlGpuAllocator.h                     | 30 +++++------
 .../src/ExecutionContext.cpp                  |  3 +-
 .../src/ExecutionContext.h                    |  3 +-
 .../src/ExecutionProvider.cpp                 | 12 ++---
 7 files changed, 81 insertions(+), 38 deletions(-)
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index 22161a6a58cbf..f60d11fcebf4d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -4,7 +4,6 @@
 #include "precomp.h"
 #include "DmlCommandRecorder.h"
 #include "CommandQueue.h"
-#include "BucketizedBufferAllocator.h"
 #include "absl/cleanup/cleanup.h"
 
 using namespace Dml;
@@ -23,15 +22,11 @@ DmlCommandRecorder::DmlCommandRecorder(
     ORT_THROW_IF_FAILED(dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_recorder)));
 }
 
-void DmlCommandRecorder::SetAllocator(std::weak_ptr<onnxruntime::IAllocator> allocator)
+void DmlCommandRecorder::SetAllocator(std::weak_ptr<DmlGpuAllocator> allocator)
 {
     m_allocator = allocator;
 }
 
-void DmlCommandRecorder::SetSubAllocator(std::weak_ptr<BucketizedBufferAllocator> subAllocator)
-{
-    m_subAllocator = subAllocator;
-}
 
 void DmlCommandRecorder::InitializeOperator(
     IDMLCompiledOperator* op,
@@ -74,7 +69,7 @@ void DmlCommandRecorder::InitializeOperator(
         }
         absl::Cleanup([allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); });
 
-        auto subAllocator = m_subAllocator.lock();
+        auto subAllocator = m_allocator.lock();
         auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize);
 
         // Bind the temporary resource.
@@ -154,7 +149,7 @@ void DmlCommandRecorder::ExecuteOperator(
         }
         absl::Cleanup([allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); });
 
-        auto subAllocator = m_subAllocator.lock();
+        auto subAllocator = m_allocator.lock();
         auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize);
 
         // Bind the temporary resource.
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
index 2bf23062a49f7..e442df1f1df6c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
@@ -6,6 +6,7 @@
 #include "ICommandRecorder.h"
 #include "CommandAllocatorRing.h"
 #include "core/framework/allocator.h"
+#include "DmlGpuAllocator.h"
 
 namespace Dml
 {
@@ -56,8 +57,7 @@ namespace Dml
         void Open() final;
         void CloseAndExecute() final;
 
-        void SetAllocator(std::weak_ptr<onnxruntime::IAllocator> allocator);
-        void SetSubAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator);
+        void SetAllocator(std::weak_ptr<DmlGpuAllocator> allocator);
 
         bool HasUnsubmittedWork() override
         {
@@ -84,8 +84,7 @@ namespace Dml
         ID3D12DescriptorHeap* m_currentDescriptorHeap = nullptr;
 
         // The weak pointer avoids a circular reference from context->recorder->allocator->context
-        std::weak_ptr<onnxruntime::IAllocator> m_allocator;
-        std::weak_ptr<BucketizedBufferAllocator> m_subAllocator;
+        std::weak_ptr<DmlGpuAllocator> m_allocator;
 
         CommandAllocatorRing<2> m_commandAllocatorRing;
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
new file mode 100644
index 0000000000000..8e8db740b41de
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
@@ -0,0 +1,53 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "precomp.h"
+#include "DmlGpuAllocator.h"
+#include "core/framework/allocator.h"
+#include "BucketizedBufferAllocator.h"
+
+namespace Dml
+{
+    DmlGpuAllocator::DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<BucketizedBufferAllocator> subAllocator)
+    : onnxruntime::IAllocator(
+        OrtMemoryInfo(
+            "DML",
+            OrtAllocatorType::OrtDeviceAllocator,
+            OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)
+        )
+    ),
+    m_bfcAllocator(bfcAllocator),
+    m_subAllocator(std::move(subAllocator)) {}
+
+    void* DmlGpuAllocator::Alloc(size_t size_in_bytes)
+    {
+        return m_bfcAllocator->Alloc(size_in_bytes);
+    }
+
+    void DmlGpuAllocator::Free(void* ptr)
+    {
+        m_bfcAllocator->Free(ptr);
+    }
+
+    D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(const void* ptr, uint64_t size_in_bytes)
+    {
+        return m_subAllocator->CreateBufferRegion(ptr, size_in_bytes);
+    }
+
+    ComPtr<DmlManagedBufferRegion> DmlGpuAllocator::CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes)
+    {
+        return m_subAllocator->CreateManagedBufferRegion(ptr, size_in_bytes);
+    }
+
+    AllocationInfo* DmlGpuAllocator::GetAllocationInfo(const void* ptr)
+    {
+        return m_subAllocator->GetAllocationInfo(ptr);
+    }
+
+    void DmlGpuAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
+    {
+        m_subAllocator->SetDefaultRoundingMode(roundingMode);
+    }
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
index 1d4b35506afcb..b12c990d44565 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
@@ -4,31 +4,25 @@
 #pragma once
 
 #include "core/framework/allocator.h"
-#include "BucketizedBufferAllocator.h"
+#include "DmlBufferRegion.h"
+#include "DmlManagedBufferRegion.h"
 
 namespace Dml
 {
+    class BucketizedBufferAllocator;
+    class AllocationInfo;
+
     class DmlGpuAllocator : public onnxruntime::IAllocator
     {
     public:
-        DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<BucketizedBufferAllocator> subAllocator)
-        : onnxruntime::IAllocator(
-            OrtMemoryInfo(
-                "DML",
-                OrtAllocatorType::OrtDeviceAllocator,
-                OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)
-            )
-        ),
-        m_bfcAllocator(bfcAllocator),
-        m_subAllocator(std::move(subAllocator)) {}
+        DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<BucketizedBufferAllocator> subAllocator);
 
-        void* Alloc(size_t size_in_bytes) final { return m_bfcAllocator->Alloc(size_in_bytes); }
-        void Free(void* ptr) final { m_bfcAllocator->Free(ptr); }
-        D3D12BufferRegion CreateBufferRegion(const void* ptr, uint64_t size_in_bytes) { return m_subAllocator->CreateBufferRegion(ptr, size_in_bytes); }
-        ComPtr<DmlManagedBufferRegion> CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes) { return m_subAllocator->CreateManagedBufferRegion(ptr, size_in_bytes); }
-        AllocationInfo* GetAllocationInfo(const void* ptr) { return m_subAllocator->GetAllocationInfo(ptr); }
-        void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { m_subAllocator->SetDefaultRoundingMode(roundingMode); }
-        BucketizedBufferAllocator* GetSubAllocator() const { return m_subAllocator.get(); }
+        void* Alloc(size_t size_in_bytes) final;
+        void Free(void* ptr) final;
+        D3D12BufferRegion CreateBufferRegion(const void* ptr, uint64_t size_in_bytes);
+        ComPtr<DmlManagedBufferRegion> CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes);
+        AllocationInfo* GetAllocationInfo(const void* ptr);
+        void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);
 
     private:
         // This allocator is managed by ORT and should be used to allocate/free memory in order
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index c3415c4b9ea49..6a30e6cd1ad56 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -4,6 +4,7 @@
 #include "precomp.h"
 #include "ExecutionContext.h"
 #include "CommandQueue.h"
+#include "DmlGpuAllocator.h"
 
 namespace Dml
 {
@@ -18,7 +19,7 @@ namespace Dml
         ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf())));
     }
 
-    void ExecutionContext::SetAllocator(std::weak_ptr<onnxruntime::IAllocator> allocator)
+    void ExecutionContext::SetAllocator(std::weak_ptr<DmlGpuAllocator> allocator)
     {
         m_dmlRecorder.SetAllocator(allocator);
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
index 6625ae83ffd1e..6e2d205f48ebd 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
@@ -10,6 +10,7 @@
 namespace Dml
 {
     class CommandQueue;
+    class DmlGpuAllocator;
 
     // Asynchronously performs GPU work, and automatically manages command list recording and submission to queues.
     // Work submitted to the ExecutionContext is typically recorded onto a command list and may not immediately begin
@@ -24,7 +25,7 @@ namespace Dml
             IDMLDevice* dmlDevice,
             ID3D12CommandQueue* queue);
 
-        void SetAllocator(std::weak_ptr<onnxruntime::IAllocator> allocator);
+        void SetAllocator(std::weak_ptr<DmlGpuAllocator> allocator);
 
         // Waits for flushed work, discards unflushed work, and discards associated references to
         // prevent circular references.  Must be the last call on the object before destruction.
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 85ffbddb84989..1f6aafda1cc9b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -120,7 +120,7 @@ namespace Dml
     {
         ORT_TRY
         {
-        void* opaqueData = m_bfcAllocator->Alloc(size);
+        void* opaqueData = m_gpuAllocator->Alloc(size);
         auto bufferRegion = m_gpuAllocator->CreateManagedBufferRegion(opaqueData, size);
         bufferRegion.CopyTo(managedBufferRegion);
         return S_OK;
@@ -182,7 +182,7 @@ namespace Dml
 
         m_context = std::make_shared<ExecutionContext>(m_d3d12Device.Get(), m_dmlDevice.Get(), queue);
 
-        m_subAllocator = std::make_shared<BucketizedBufferAllocator>(
+        auto subAllocator = std::make_shared<BucketizedBufferAllocator>(
             m_d3d12Device.Get(),
             m_context,
             queue,
@@ -193,14 +193,14 @@ namespace Dml
 
         // Create a BFC allocator that encapsulates our allocator
         onnxruntime::AllocatorCreationInfo memoryInfo(
-            [this](OrtDevice::DeviceId id) {
-                return std::make_unique<DmlBfcAllocator>(m_subAllocator.get());
+            [subAllocator](OrtDevice::DeviceId id) {
+                return std::make_unique<DmlBfcAllocator>(subAllocator.get());
             });
 
         m_bfcAllocator = onnxruntime::CreateAllocator(memoryInfo);
 
         // Wrap the BFC allocator into our own allocator
-        m_gpuAllocator = std::make_shared<DmlGpuAllocator>(m_bfcAllocator.get(), m_subAllocator);
+        m_gpuAllocator = std::make_shared<DmlGpuAllocator>(m_bfcAllocator.get(), subAllocator);
 
         m_context->SetAllocator(m_gpuAllocator);
 
@@ -1001,7 +1001,7 @@ namespace Dml
     ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr)
     {
         Dml::DmlGpuAllocator* pAllocationInfo = static_cast<Dml::DmlGpuAllocator*>(allocator);
-        return pAllocationInfo->GetSubAllocator()->GetAllocationInfo(ptr)->GetUavResource();
+        return pAllocationInfo->GetAllocationInfo(ptr)->GetUavResource();
     }
 
     void FlushContext(onnxruntime::IExecutionProvider* provider)

From d0eb5da576ae4fb24241115a80cba7176bfc7c9e Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 24 Jan 2023 22:11:22 -0800
Subject: [PATCH 13/76] Remove BucketizedBufferAllocator::FreeResource; hold sub-allocator by shared_ptr in DmlBfcAllocator

---
 .../src/BucketizedBufferAllocator.cpp         | 27 -------------------
 .../src/BucketizedBufferAllocator.h           |  1 -
 .../src/DmlAllocationInfo.cpp                 |  4 ---
 .../src/DmlBfcAllocator.h                     |  6 ++---
 .../src/DmlGpuAllocator.cpp                   |  2 +-
 .../src/ExecutionProvider.cpp                 |  2 +-
 6 files changed, 5 insertions(+), 37 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 66405dd5d2989..d3dbe19599bb9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -328,33 +328,6 @@ namespace Dml
         allocations_by_id_.erase(it);
     }
 
-   void BucketizedBufferAllocator::FreeResource(void* p, uint64_t pooledResourceId)
-    {
-        AllocationInfo *allocInfo = static_cast<AllocationInfo*>(p);
-
-        assert(allocInfo != nullptr); // Can't free nullptr
-
-        if (allocInfo->GetOwner() != this)
-        {
-            // This allocation doesn't belong to this allocator!
-            ORT_THROW_HR(E_INVALIDARG);
-        }
-
-        // Free the underlying allocation once queued work has completed.
-#ifdef _GAMING_XBOX
-        m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->DetachResourceWrapper().Get()).Get());
-#else
-        m_context->QueueReference(allocInfo->DetachResourceWrapper().Get());
-#endif
-
-    #if _DEBUG
-        assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo);
-        m_outstandingAllocationsById.erase(allocInfo->GetId());
-    #endif
-
-        // The allocation info is already destructing at this point
-    }
-
     absl::optional<uint32_t> BucketizedBufferAllocator::TryReserveAllocationID()
     {
         // The mutex must already be held
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index b28bdba544766..16fc28049a583 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -106,7 +106,6 @@ namespace Dml
         static uint64_t GetBucketSizeFromIndex(gsl::index index);
 
         friend class AllocationInfo;
-        void FreeResource(void* p, uint64_t resourceId);
 
         std::vector<Bucket> m_pool;
         size_t m_currentAllocationId = 0;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
index 044e9e854d700..9af6933cd3ed7 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
@@ -10,10 +10,6 @@ namespace Dml
 
     AllocationInfo::~AllocationInfo()
     {
-        if (m_owner)
-        {
-            m_owner->FreeResource(this, m_pooledResourceId);
-        }
     }
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
index 458a65e63c0c4..f43aa769af0a9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
@@ -11,7 +11,7 @@ namespace Dml
     class DmlBfcAllocator : public onnxruntime::IAllocator
     {
     public:
-        DmlBfcAllocator(BucketizedBufferAllocator* subAllocator)
+        DmlBfcAllocator(std::shared_ptr<BucketizedBufferAllocator> subAllocator)
         : onnxruntime::IAllocator(
             OrtMemoryInfo(
                 "DML",
@@ -19,11 +19,11 @@ namespace Dml
                 OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)
             )
         ),
-        m_subAllocator(subAllocator) {}
+        m_subAllocator(std::move(subAllocator)) {}
 
         void* Alloc(size_t size_in_bytes) { return m_subAllocator->Alloc(size_in_bytes); }
         void Free(void* ptr) { m_subAllocator->Free(ptr); }
     private:
-        BucketizedBufferAllocator* m_subAllocator;
+        std::shared_ptr<BucketizedBufferAllocator> m_subAllocator;
     };
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
index 8e8db740b41de..44df1c79aacbe 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
@@ -19,7 +19,7 @@ namespace Dml
         )
     ),
     m_bfcAllocator(bfcAllocator),
-    m_subAllocator(std::move(subAllocator)) {}
+    m_subAllocator(subAllocator) {}
 
     void* DmlGpuAllocator::Alloc(size_t size_in_bytes)
     {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 1f6aafda1cc9b..a9046f91c76c8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -194,7 +194,7 @@ namespace Dml
         // Create a BFC allocator that encapsulates our allocator
         onnxruntime::AllocatorCreationInfo memoryInfo(
             [subAllocator](OrtDevice::DeviceId id) {
-                return std::make_unique<DmlBfcAllocator>(subAllocator.get());
+                return std::make_unique<DmlBfcAllocator>(subAllocator);
             });
 
         m_bfcAllocator = onnxruntime::CreateAllocator(memoryInfo);

From 3385d20a7027023be873e70ad653146256682649 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 26 Jan 2023 11:20:18 -0800
Subject: [PATCH 14/76] Add buffer region size alignment

---
 .../DmlExecutionProvider/src/BucketizedBufferAllocator.cpp   | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index d3dbe19599bb9..8438393544740 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -377,6 +377,11 @@ namespace Dml
         auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
         ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end());
 
+        // Make sure that we are aligned to 4 bytes to satisfy DML's requirements
+        constexpr uint64_t DML_ALIGNMENT = 4;
+        size_in_bytes =
+            (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT;
+
         return D3D12BufferRegion(
             tagged_ptr.offset,
             size_in_bytes,

From 7e5622d29ec62e6498d2b46c5dd139ea47e802ea Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 26 Jan 2023 15:14:00 -0800
Subject: [PATCH 15/76] WIP

---
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp      | 4 +++-
 .../DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp  | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index a9046f91c76c8..363f5897c98a9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -494,7 +494,9 @@ namespace Dml
                 ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
                 : D3D12_RESOURCE_STATE_COPY_DEST;
 
-            m_context->CopyBufferRegion(dstData, 0, dstState, srcData, 0, srcState, dataSizeInBytes);
+            const uint64_t srcOffset = srcBufferRegion.Offset();
+            const uint64_t dstOffset = dstBufferRegion.Offset();
+            m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes);
         }
         else
         {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
index af983b26772d9..002a8f9192b31 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
@@ -50,13 +50,13 @@ class DmlOperatorCopy : public DmlOperator
         MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0);
 
         // Avoid self copying.
-        if (inputTensor.GetDataInterface().Get() != outputTensor.GetDataInterface().Get())
-        {
+        // if (inputTensor.GetDataInterface().Get() != outputTensor.GetDataInterface().Get())
+        // {
             // Copy elements from input tensor to output tensor.
             ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor(
                 outputTensor.GetInterface().Get(),
                 inputTensor.GetInterface().Get()));
-        }
+        // }
     }
 };
 

From e6897c50e80038f1b610c9c8932af9e6d4a6078b Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 26 Jan 2023 15:14:41 -0800
Subject: [PATCH 16/76] WIP

---
 .../src/Operators/DmlOperatorCopy.cpp                | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
index 002a8f9192b31..4ca51633d23e7 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
@@ -49,14 +49,10 @@ class DmlOperatorCopy : public DmlOperator
         // Reshape the output tensor.
         MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0);
 
-        // Avoid self copying.
-        // if (inputTensor.GetDataInterface().Get() != outputTensor.GetDataInterface().Get())
-        // {
-            // Copy elements from input tensor to output tensor.
-            ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor(
-                outputTensor.GetInterface().Get(),
-                inputTensor.GetInterface().Get()));
-        // }
+        // Copy elements from input tensor to output tensor.
+        ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor(
+            outputTensor.GetInterface().Get(),
+            inputTensor.GetInterface().Get()));
     }
 };
 

From 2064baa0888443a49c5e6905ae38e76f8f8c02c6 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Fri, 27 Jan 2023 02:24:52 -0800
Subject: [PATCH 17/76] WIP

---
 .../inc/IWinmlExecutionProvider.h             |   9 +-
 .../src/BucketizedBufferAllocator.cpp         |  44 +++----
 .../src/BucketizedBufferAllocator.h           |   5 +-
 .../src/DmlAllocationInfo.cpp                 |   4 +
 .../DmlExecutionProvider/src/DmlBuffer.cpp    |  76 +++++++++++
 .../dml/DmlExecutionProvider/src/DmlBuffer.h  |  47 +++++++
 ...DmlBufferRegion.cc => DmlBufferRegion.cpp} |   0
 .../src/DmlCommandRecorder.cpp                |  26 +---
 .../src/DmlGpuAllocator.cpp                   |  11 +-
 .../src/DmlGpuAllocator.h                     |   4 +-
 .../src/DmlGraphFusionHelper.cpp              |   7 +-
 .../src/DmlManagedBuffer.h                    |  20 +++
 .../src/DmlManagedBufferRegion.h              |  26 ----
 .../src/ExecutionContext.h                    |   2 +
 .../src/ExecutionProvider.cpp                 |  31 +----
 .../src/ExecutionProvider.h                   |  18 +--
 .../src/FusedGraphKernel.cpp                  |  43 ++-----
 .../src/IExecutionProvider.h                  |   4 +-
 .../src/MLOperatorAuthorImpl.cpp              | 120 +++---------------
 .../src/MLOperatorAuthorImpl.h                |  11 +-
 .../src/Operators/DmlDFT.h                    |  32 -----
 .../src/Operators/DmlOperator.cpp             |  39 ++----
 .../src/Operators/DmlOperator.h               |   4 +-
 .../src/dml_buffer_region.h                   |  97 ++++++++++++++
 24 files changed, 342 insertions(+), 338 deletions(-)
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
 rename onnxruntime/core/providers/dml/DmlExecutionProvider/src/{DmlBufferRegion.cc => DmlBufferRegion.cpp} (100%)
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBuffer.h
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
index a56f03e50a9e1..ccde56e5d712d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
@@ -39,14 +39,7 @@ namespace Windows::AI::MachineLearning::Adapter
         // the provider's underlying queues.
         virtual void QueueReference(IUnknown *object) = 0;
 
-        virtual void GetABIDataInterface(
-            void* data,
-            IUnknown** abiData) const = 0;
-
-        virtual void GetManagedBufferRegion(
-            void* data,
-            uint64_t size,
-            Dml::DmlManagedBufferRegion** abiData) const = 0;
+        virtual ID3D12Resource* GetABIDataInterface(void* data) const = 0;
 
         virtual uint64_t TryGetPooledAllocationId(
             void* data,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 8438393544740..b0ddd29e6bf46 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -7,7 +7,6 @@
 #include "BucketizedBufferAllocator.h"
 #include "DmlReservedResourceWrapper.h"
 #include "DmlBufferRegion.h"
-#include "DmlManagedBufferRegion.h"
 
 namespace Dml
 {
@@ -328,6 +327,24 @@ namespace Dml
         allocations_by_id_.erase(it);
     }
 
+    void BucketizedBufferAllocator::FreeResource(AllocationInfo* allocInfo, uint64_t pooledResourceId)
+    {
+        // Since this allocator is wrapped by ORT's BFC allocator, it's possible that the context is already
+        // closed at this point if the application is winding down; in that case nothing is queued.
+        if (!m_context->Closed())
+        {
+            assert(allocInfo != nullptr); // Can't free nullptr
+
+            if (allocInfo->GetOwner() != this)
+            {
+                // This allocation doesn't belong to this allocator!
+                ORT_THROW_HR(E_INVALIDARG);
+            }
+
+            m_context->QueueReference(allocInfo); // NOTE(review): ~AllocationInfo passes `this` here; queueing a reference to an object already being destroyed looks unsafe - confirm
+        }
+    }
+
     absl::optional<uint32_t> BucketizedBufferAllocator::TryReserveAllocationID()
     {
         // The mutex must already be held
@@ -390,31 +407,6 @@ namespace Dml
             it->second->GetCopyDstResource());
     }
 
-    ComPtr<DmlManagedBufferRegion> BucketizedBufferAllocator::CreateManagedBufferRegion(
-        const void* ptr,
-        uint64_t size_in_bytes)
-    {
-        ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr);
-
-        TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr);
-
-        // We need to access (mutable) state after this point, so we need to lock
-        std::unique_lock<std::mutex> lock(mutex_);
-
-        // Find the allocation corresponding to this pointer
-        auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
-        ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end());
-
-        D3D12BufferRegion bufferRegion(
-            tagged_ptr.offset,
-            size_in_bytes,
-            it->second->GetUavResource(),
-            it->second->GetCopySrcResource(),
-            it->second->GetCopyDstResource());
-
-        return wil::MakeOrThrow<DmlManagedBufferRegion>(it->second, std::move(bufferRegion));
-    }
-
     AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(const void* ptr)
     {
         ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index 16fc28049a583..f2c09dfa0cfc4 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -63,14 +63,11 @@ namespace Dml
             const void* ptr,
             uint64_t size_in_bytes);
 
-        ComPtr<DmlManagedBufferRegion> CreateManagedBufferRegion(
-            const void* ptr,
-            uint64_t size_in_bytes);
-
         AllocationInfo* GetAllocationInfo(const void* ptr);
 
         void* Alloc(size_t size_in_bytes);
         void Free(void* ptr);
+        void FreeResource(AllocationInfo* allocInfo, uint64_t pooledResourceId);
         uint64_t ComputeRequiredSize(size_t size);
         bool TilingEnabled() const { return tiling_enabled_; };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
index 9af6933cd3ed7..044e9e854d700 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
@@ -10,6 +10,10 @@ namespace Dml
 
     AllocationInfo::~AllocationInfo()
     {
+        if (m_owner)
+        {
+            m_owner->FreeResource(this, m_pooledResourceId);
+        }
     }
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
new file mode 100644
index 0000000000000..6f587261553e6
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
@@ -0,0 +1,76 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "precomp.h"
+#include "DmlBuffer.h"
+#include "DmlGpuAllocator.h"
+
+namespace Dml
+{
+
+/*explicit*/ DmlBuffer::DmlBuffer(DmlGpuAllocator* allocator, uint64_t size_in_bytes)
+    : allocator_(allocator)
+{
+    m_opaqueData = allocator_->Alloc(size_in_bytes);
+    ORT_THROW_HR_IF(E_OUTOFMEMORY, m_opaqueData == nullptr);
+
+    buffer_region_ = allocator_->CreateBufferRegion(m_opaqueData, size_in_bytes);
+}
+
+DmlBuffer::~DmlBuffer()
+{
+    if (m_opaqueData != nullptr)
+    {
+        allocator_->Free(m_opaqueData);
+    }
+}
+
+DmlBuffer::DmlBuffer(DmlBuffer&& other)
+{
+    m_opaqueData = other.m_opaqueData;
+    allocator_ = other.allocator_;
+    buffer_region_ = std::move(other.buffer_region_);
+    other.m_opaqueData = nullptr;
+}
+
+DmlBuffer& DmlBuffer::operator=(DmlBuffer&& other)
+{
+    // Swap rather than overwrite: `other` takes our previous allocation and frees it in its destructor (no leak, self-move safe).
+    std::swap(m_opaqueData, other.m_opaqueData);
+    std::swap(allocator_, other.allocator_);
+    std::swap(buffer_region_, other.buffer_region_);
+    return *this;
+}
+
+ID3D12Resource* DmlBuffer::ResourceInUavState() const
+{
+    return buffer_region_.ResourceInUavState();
+}
+
+ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const
+{
+    return buffer_region_.ResourceInCopySrcState();
+}
+
+ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const
+{
+    return buffer_region_.ResourceInCopyDstState();
+}
+
+uint64_t DmlBuffer::Offset() const
+{
+    return buffer_region_ ? buffer_region_.Offset() : 0;
+}
+
+uint64_t DmlBuffer::SizeInBytes() const
+{
+    return buffer_region_ ? buffer_region_.SizeInBytes() : 0;
+}
+
+DML_BUFFER_BINDING DmlBuffer::GetBufferBinding() const
+{
+    return buffer_region_ ? buffer_region_.GetBufferBinding()
+                          : DML_BUFFER_BINDING{};
+}
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
new file mode 100644
index 0000000000000..b98ae727e1a65
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
@@ -0,0 +1,47 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "DmlBufferRegion.h"
+
+namespace Dml
+{
+
+class DmlGpuAllocator;
+class OpKernelContext;
+
+// Owns a D3D12 default heap buffer allocated using the DML device's
+// allocator. This is essentially a convenience wrapper over a device memory
+// allocation as well as the buffer region that spans it. When this object is
+// destructed, the device memory is freed to the allocator.
+class DmlBuffer
+{
+  public:
+    explicit DmlBuffer(DmlGpuAllocator* allocator, uint64_t size_in_bytes);
+    ~DmlBuffer();
+
+    // Move-only
+    DmlBuffer(const DmlBuffer&) = delete;
+    DmlBuffer& operator=(const DmlBuffer&) = delete;
+    DmlBuffer(DmlBuffer&&);
+    DmlBuffer& operator=(DmlBuffer&&);
+
+    ID3D12Resource* ResourceInUavState() const;
+    ID3D12Resource* ResourceInCopySrcState() const;
+    ID3D12Resource* ResourceInCopyDstState() const;
+    uint64_t Offset() const;
+    uint64_t SizeInBytes() const;
+    const D3D12BufferRegion& Region() const { return buffer_region_; }
+
+    DML_BUFFER_BINDING GetBufferBinding() const;
+
+    explicit operator bool() const { return !!buffer_region_; }
+
+  private:
+    DmlGpuAllocator* allocator_;
+    D3D12BufferRegion buffer_region_;
+    void* m_opaqueData;
+};
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
similarity index 100%
rename from onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cc
rename to onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index f60d11fcebf4d..af625334b7720 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -4,7 +4,6 @@
 #include "precomp.h"
 #include "DmlCommandRecorder.h"
 #include "CommandQueue.h"
-#include "absl/cleanup/cleanup.h"
 
 using namespace Dml;
 
@@ -62,21 +61,12 @@ void DmlCommandRecorder::InitializeOperator(
 
         // Allocate and immediately free a temporary buffer. The buffer resource will still be
         // alive (managed by the pool); freeing allows the resource to be shared with other operators.
-        void* tempResourceHandle = allocator->Alloc(static_cast<size_t>(temporaryResourceSize));
-        if (!tempResourceHandle)
-        {
-            ORT_THROW_HR(E_OUTOFMEMORY);
-        }
-        absl::Cleanup([allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); });
-
-        auto subAllocator = m_allocator.lock();
-        auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize);
+        auto buffer = allocator->AllocateDefaultBuffer(temporaryResourceSize);
 
         // Bind the temporary resource.
-        DML_BUFFER_BINDING bufferBinding = bufferRegion.GetBufferBinding();
+        DML_BUFFER_BINDING bufferBinding = buffer.GetBufferBinding();
         DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding };
         bindingTable->BindTemporaryResource(&bindingDesc);
-        allocator->Free(tempResourceHandle);
     }
 
     // Bind inputs, if provided.
@@ -142,18 +132,10 @@ void DmlCommandRecorder::ExecuteOperator(
 
         // Allocate and immediately free a temporary buffer. The buffer resource will still be
         // alive (managed by the pool); freeing allows the resource to be shared with other operators.
-        void* tempResourceHandle = allocator->Alloc(static_cast<size_t>(temporaryResourceSize));
-        if (!tempResourceHandle)
-        {
-            ORT_THROW_HR(E_OUTOFMEMORY);
-        }
-        absl::Cleanup([allocator, tempResourceHandle]() { allocator->Free(tempResourceHandle); });
-
-        auto subAllocator = m_allocator.lock();
-        auto bufferRegion = subAllocator->CreateBufferRegion(tempResourceHandle, temporaryResourceSize);
+        auto buffer = allocator->AllocateDefaultBuffer(temporaryResourceSize);
 
         // Bind the temporary resource.
-        DML_BUFFER_BINDING bufferBinding = bufferRegion.GetBufferBinding();
+        DML_BUFFER_BINDING bufferBinding = buffer.GetBufferBinding();
         DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &bufferBinding };
         bindingTable->BindTemporaryResource(&bindingDesc);
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
index 44df1c79aacbe..13e0d8dfe96f7 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
@@ -36,11 +36,6 @@ namespace Dml
         return m_subAllocator->CreateBufferRegion(ptr, size_in_bytes);
     }
 
-    ComPtr<DmlManagedBufferRegion> DmlGpuAllocator::CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes)
-    {
-        return m_subAllocator->CreateManagedBufferRegion(ptr, size_in_bytes);
-    }
-
     AllocationInfo* DmlGpuAllocator::GetAllocationInfo(const void* ptr)
     {
         return m_subAllocator->GetAllocationInfo(ptr);
@@ -50,4 +45,10 @@ namespace Dml
     {
         m_subAllocator->SetDefaultRoundingMode(roundingMode);
     }
+
+    DmlBuffer DmlGpuAllocator::AllocateDefaultBuffer(uint64_t num_bytes)
+    {
+        return DmlBuffer(this, num_bytes);
+    }
+
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
index b12c990d44565..5ef9ea855753f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
@@ -5,7 +5,7 @@
 
 #include "core/framework/allocator.h"
 #include "DmlBufferRegion.h"
-#include "DmlManagedBufferRegion.h"
+#include "DmlBuffer.h"
 
 namespace Dml
 {
@@ -20,9 +20,9 @@ namespace Dml
         void* Alloc(size_t size_in_bytes) final;
         void Free(void* ptr) final;
         D3D12BufferRegion CreateBufferRegion(const void* ptr, uint64_t size_in_bytes);
-        ComPtr<DmlManagedBufferRegion> CreateManagedBufferRegion(const void* ptr, uint64_t size_in_bytes);
         AllocationInfo* GetAllocationInfo(const void* ptr);
         void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);
+        DmlBuffer AllocateDefaultBuffer(uint64_t num_bytes);
 
     private:
         // This allocator is managed by ORT and should be used to allocate/free memory in order
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index 890c5aa1ae384..ffd388f91cace 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -95,12 +95,11 @@ namespace DmlGraphFusionHelper
         uint64_t* allocId)
     {
         void* opaqueData = const_cast<void*>(tensor->DataRaw());
-        Microsoft::WRL::ComPtr<IUnknown> resourceUnk;
-        winmlProvider->GetABIDataInterface(opaqueData, &resourceUnk);
+        ID3D12Resource* abiDataInterface = winmlProvider->GetABIDataInterface(opaqueData);
+        abiDataInterface->AddRef();
 
         *allocId = winmlProvider->TryGetPooledAllocationId(opaqueData, 0);
-
-        ORT_THROW_IF_FAILED(resourceUnk->QueryInterface(resource));
+        *resource = abiDataInterface;
     }
 
     void ProcessInputData(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBuffer.h
new file mode 100644
index 0000000000000..ced81af68e92e
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBuffer.h
@@ -0,0 +1,20 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "DmlBuffer.h"
+
+namespace Dml
+{
+    // Ref-counted (IUnknown) owner of a DmlBuffer, used with QueueReference to keep the buffer referenced until GPU work is completed
+    class DmlManagedBuffer : public Microsoft::WRL::RuntimeClass<Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, IUnknown>
+    {
+    public:
+        explicit DmlManagedBuffer(DmlBuffer&& buffer) : m_buffer(std::move(buffer)) {} // takes ownership of `buffer`
+        uint64_t SizeInBytes() const { return m_buffer.SizeInBytes(); }
+
+    private:
+        DmlBuffer m_buffer;
+    };
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h
deleted file mode 100644
index de39f0890f998..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlManagedBufferRegion.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include "DmlBufferRegion.h"
-#include "DmlAllocationInfo.h"
-
-namespace Dml
-{
-    class DmlManagedBufferRegion : public Microsoft::WRL::RuntimeClass<Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, IUnknown>
-    {
-    public:
-        DmlManagedBufferRegion(Microsoft::WRL::ComPtr<AllocationInfo> allocation, D3D12BufferRegion&& bufferRegion)
-            : m_allocation(std::move(allocation)),
-              m_bufferRegion(std::move(bufferRegion))
-        {
-        }
-
-        const D3D12BufferRegion& GetBufferRegion() const { return m_bufferRegion; }
-
-    private:
-        Microsoft::WRL::ComPtr<AllocationInfo> m_allocation;
-        D3D12BufferRegion m_bufferRegion;
-    };
-}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
index 6e2d205f48ebd..e4ef79081ad14 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
@@ -87,6 +87,8 @@ namespace Dml
 
         D3D12_COMMAND_LIST_TYPE GetCommandListTypeForQueue() const;
 
+        bool Closed() const { return m_closed; }
+
     private:
         ComPtr<ID3D12Device> m_d3dDevice;
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 363f5897c98a9..20f8feed12311 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -20,9 +20,9 @@
 #include "core/framework/fallback_cpu_capability.h"
 #include "DmlCommittedResourceWrapper.h"
 #include "DmlBufferRegion.h"
-#include "DmlManagedBufferRegion.h"
 #include "DmlBfcAllocator.h"
 #include "DmlGpuAllocator.h"
+#include "DmlBuffer.h"
 
 #ifdef ERROR
 #undef ERROR
@@ -113,19 +113,9 @@ namespace Dml
         m_context->GetCurrentCompletionEvent().WaitForSignal();
     }
 
-    HRESULT __stdcall ExecutionProviderImpl::AllocatePooledResource(
-        size_t size,
-        DmlManagedBufferRegion** managedBufferRegion
-    ) const noexcept
+    DmlBuffer ExecutionProviderImpl::AllocatePooledResource(size_t size) const
     {
-        ORT_TRY
-        {
-        void* opaqueData = m_gpuAllocator->Alloc(size);
-        auto bufferRegion = m_gpuAllocator->CreateManagedBufferRegion(opaqueData, size);
-        bufferRegion.CopyTo(managedBufferRegion);
-        return S_OK;
-        }
-        ORT_CATCH_RETURN
+        return m_gpuAllocator->AllocateDefaultBuffer(size);
     }
 
     D3D12BufferRegion ExecutionProviderImpl::GetBufferForTensor(IMLOperatorTensor* tensor) const
@@ -860,18 +850,9 @@ namespace Dml
         m_context->QueueReference(object);
     }
 
-    void ExecutionProviderImpl::GetABIDataInterface(void* data, IUnknown** abiData) const
-    {
-        assert(!m_closed);
-        auto uavResource = m_gpuAllocator->GetAllocationInfo(data)->GetUavResource();
-        uavResource->AddRef();
-        *abiData = uavResource;
-    }
-
-    void ExecutionProviderImpl::GetManagedBufferRegion(void* data, uint64_t size, DmlManagedBufferRegion** abiData) const
+    ID3D12Resource* ExecutionProviderImpl::GetABIDataInterface(void* data) const
     {
-        auto managedBufferRegion = m_gpuAllocator->CreateManagedBufferRegion(data, size);
-        ORT_THROW_IF_FAILED(managedBufferRegion.CopyTo(abiData));
+        return m_gpuAllocator->GetAllocationInfo(data)->GetUavResource();
     }
 
     uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(void* data, bool isInternalOperator)
@@ -965,7 +946,7 @@ namespace Dml
 
     std::shared_ptr<onnxruntime::IAllocator> ExecutionProviderImpl::GetGpuAllocator()
     {
-        return m_bfcAllocator;
+        return m_gpuAllocator;
     }
 
     std::shared_ptr<onnxruntime::IAllocator> ExecutionProviderImpl::GetCpuInputAllocator()
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index eec8f08848833..20cb307b1cdb4 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -6,6 +6,7 @@
 #include "GraphTransformer.h"
 #include "core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h"
 #include "DmlBufferRegion.h"
+#include "DmlBuffer.h"
 
 #include <wrl/client.h>
 #include <wrl/implements.h>
@@ -26,7 +27,6 @@ namespace Dml
     class BucketizedBufferAllocator;
     class DmlCpuAllocator;
     class ExecutionProvider;
-    class DmlManagedBufferRegion;
     class DmlGpuAllocator;
 
     class ExecutionProviderImpl : public WRL::Base<Dml::IExecutionProvider,
@@ -97,14 +97,7 @@ namespace Dml
         // IWinmlExecutionProvider methods
         void QueueReference(IUnknown* object) override;
 
-        void GetABIDataInterface(
-            void* data,
-            IUnknown** abiData) const override;
-
-        void GetManagedBufferRegion(
-            void* data,
-            uint64_t size,
-            DmlManagedBufferRegion** abiData) const;
+        ID3D12Resource* GetABIDataInterface(void* data) const override;
 
        uint64_t TryGetPooledAllocationId(void* data, bool isInternalOperator) override;
 
@@ -133,11 +126,8 @@ namespace Dml
 
         void WaitForOutstandingWork();
 
-        // Allocate a resource from pools.  Releasing pooledResource returns it to the pool.
-        STDMETHOD(AllocatePooledResource)(
-            size_t size,
-            DmlManagedBufferRegion** managedBufferRegion
-        ) const noexcept final;
+        // Allocate a resource from pools.  Destroying the returned DmlBuffer frees it back to the allocator.
+        DmlBuffer AllocatePooledResource(size_t size) const;
 
         STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept final;
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index 8ff33debe2474..9ecaae2c50394 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -6,7 +6,7 @@
 #include "MLOperatorAuthorImpl.h"
 #include "FusedGraphKernel.h"
 #include "DmlGraphFusionHelper.h"
-#include "DmlManagedBufferRegion.h"
+#include "DmlManagedBuffer.h"
 
 using namespace Windows::AI::MachineLearning::Adapter;
 
@@ -64,14 +64,10 @@ namespace Dml
             UINT64 persistentResourceSize = m_compiledExecutionPlanOperator->GetBindingProperties().PersistentResourceSize;
             if (persistentResourceSize > 0)
             {
-                ComPtr<DmlManagedBufferRegion> managedBufferRegion;
-                ORT_THROW_IF_FAILED(m_provider->AllocatePooledResource(
-                    static_cast<size_t>(persistentResourceSize),
-                    managedBufferRegion.GetAddressOf()));
-
-                managedBufferRegion.As(&m_persistentResourceAllocatorUnk);
-                m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState();
-                m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding();
+                auto buffer = m_provider->AllocatePooledResource(persistentResourceSize);
+                m_persistentResourceBinding = buffer.GetBufferBinding();
+                m_managedPersistentBuffer = wil::MakeOrThrow<DmlManagedBuffer>(std::move(buffer));
+                m_winmlProvider->QueueReference(m_managedPersistentBuffer.Get());
             }
 
             ORT_THROW_IF_FAILED(m_provider->InitializeOperator(
@@ -81,7 +77,6 @@ namespace Dml
 
             // Queue references to objects which must be kept alive until resulting GPU work completes
             m_winmlProvider->QueueReference(m_compiledExecutionPlanOperator.Get());
-            m_winmlProvider->QueueReference(m_persistentResourceAllocatorUnk.Get());
 
             std::for_each(
                 initializeResourceRefs.begin(),
@@ -145,7 +140,7 @@ namespace Dml
 
                 // Queue references to objects which must be kept alive until resulting GPU work completes
                 m_winmlProvider->QueueReference(m_compiledExecutionPlanOperator.Get());
-                m_winmlProvider->QueueReference(m_persistentResourceAllocatorUnk.Get());
+                m_winmlProvider->QueueReference(m_managedPersistentBuffer.Get());
             }
             else
             {
@@ -359,25 +354,10 @@ namespace Dml
 
             if (execBindingProps.TemporaryResourceSize > 0)
             {
-                // Allocate temporary data which will automatically be freed when the GPU work
-                // which is scheduled up to the point that this method returns has completed.
-                ComPtr<IUnknown> tempAlloc;
-                uint64_t tempAllocId = 0;
-                ORT_THROW_IF_FAILED(contextWrapper.AllocateTemporaryData(static_cast<size_t>(execBindingProps.TemporaryResourceSize), tempAlloc.GetAddressOf(), &tempAllocId));
-
-                ComPtr<DmlManagedBufferRegion> managedBufferRegion;
-                m_winmlProvider->GetManagedBufferRegion(tempAlloc.Get(), execBindingProps.TemporaryResourceSize, &managedBufferRegion);
-
-                // Bind the temporary resource.
-                DML_BUFFER_BINDING tempBufferBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding();
+                auto buffer = m_provider->AllocatePooledResource(execBindingProps.TemporaryResourceSize);
+                DML_BUFFER_BINDING tempBufferBinding = buffer.GetBufferBinding();
                 DML_BINDING_DESC tempBindingDesc = { DML_BINDING_TYPE_BUFFER, &tempBufferBinding };
-
-                if (!tempAllocId || m_tempBindingAllocId != tempAllocId)
-                {
-                    m_bindingTable->BindTemporaryResource(&tempBindingDesc);
-                }
-
-                m_tempBindingAllocId = tempAllocId;
+                m_bindingTable->BindTemporaryResource(&tempBindingDesc);
             }
 
             // Execute the command list and if it succeeds, update the fence value at which this command may be
@@ -401,7 +381,7 @@ namespace Dml
             m_winmlProvider->QueueReference(WRAP_GRAPHICS_UNKNOWN(m_graphicsCommandList).Get());
             m_winmlProvider->QueueReference(WRAP_GRAPHICS_UNKNOWN(m_heap).Get());
             m_winmlProvider->QueueReference(m_bindingTable.Get());
-            m_winmlProvider->QueueReference(m_persistentResourceAllocatorUnk.Get());
+            m_winmlProvider->QueueReference(m_managedPersistentBuffer.Get());
         }
 
         ComPtr<IDMLCompiledOperator> m_compiledExecutionPlanOperator;
@@ -418,12 +398,11 @@ namespace Dml
         ComPtr<IDMLBindingTable> m_bindingTable;
         std::optional<DML_BUFFER_BINDING> m_persistentResourceBinding;
         ComPtr<ID3D12Resource> m_persistentResource;
-        ComPtr<IUnknown> m_persistentResourceAllocatorUnk; // Controls when the persistent resource is returned to the allocator
+        ComPtr<DmlManagedBuffer> m_managedPersistentBuffer;
 
         // Bindings from previous executions of a re-used command list
         mutable std::vector<uint64_t> m_inputBindingAllocIds;
         mutable std::vector<uint64_t> m_outputBindingAllocIds;
-        mutable uint64_t m_tempBindingAllocId = 0;
 
         // Fence tracking the status of the command list's last execution, and whether its descriptor heap
         // can safely be updated.
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h
index 4bef0652763a9..61cd34339f04a 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h
@@ -3,6 +3,7 @@
 
 #pragma once
 #include "core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h"
+#include "DmlBuffer.h"
 
 namespace Dml
 {
@@ -67,9 +68,10 @@ namespace Dml
         STDMETHOD_(void, Flush)() const noexcept = 0;
 
         STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept = 0;
-        STDMETHOD(AllocatePooledResource(size_t size, DmlManagedBufferRegion** pooledResource)) const noexcept = 0;
 
         STDMETHOD_(bool, IsMcdmDevice)() const noexcept = 0;
         STDMETHOD_(bool, MetacommandsEnabled)() const noexcept = 0;
+
+        virtual DmlBuffer AllocatePooledResource(size_t size) const = 0;
     };
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index d601a2b3b4025..55a33cc8513b5 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -10,6 +10,7 @@
 
 #include "MLOperatorAuthorImpl.h"
 #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h"
+#include "core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h"
 
 using namespace Microsoft::WRL;
 
@@ -102,13 +103,9 @@ namespace Windows::AI::MachineLearning::Adapter
     // Translate the data object stored in a tensor to the type which will be returned through
     // the ABI. The translation is determined by the provider and based on options with which the
     // kernels are registered.
-    void TranslateAllocationDataToAbi(
-        IWinmlExecutionProvider* winmlProvider,
-        const ::OrtMemoryInfo& allocInfo,
-        void* opaqueData,
-        IUnknown** abiAllocation)
+    ID3D12Resource* TranslateAllocationDataToAbi(IWinmlExecutionProvider* winmlProvider, void* opaqueData)
     {
-        winmlProvider->GetABIDataInterface(opaqueData, abiAllocation);
+        return winmlProvider->GetABIDataInterface(opaqueData);
     }
 
     //
@@ -1281,11 +1278,7 @@ namespace Windows::AI::MachineLearning::Adapter
                 {
                     // Get the actual object to be returned from the ABI, which varies for internal and external
                     // kernels (i.e. ID3D12Resource, versus something that tracks the layout).
-                    TranslateAllocationDataToAbi(
-                        m_winmlExecutionProvider.Get(),
-                        m_impl->Location(),
-                        m_tensorData,
-                        m_abiDataInterface.GetAddressOf());
+                    m_abiDataInterface = TranslateAllocationDataToAbi(m_winmlExecutionProvider.Get(), m_tensorData);
                 }
             }
         }
@@ -1377,55 +1370,8 @@ namespace Windows::AI::MachineLearning::Adapter
         }
         else
         {
-            m_abiDataInterface.CopyTo(dataInterface);
-        }
-    }
-
-    void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp)
-    {
-        if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
-        {
-            std::vector<IUnknown*> resourcesToTransition;
-            resourcesToTransition.reserve(m_inputTensors.size() + m_outputTensors.size() + m_temporaryAllocations.size());
-
-            for (uint32_t i = 0; i < m_inputTensors.size(); ++i)
-            {
-                ComPtr<IMLOperatorTensor> tensor;
-                ORT_THROW_IF_FAILED(GetInputTensor(i, tensor.GetAddressOf()));
-
-                if (tensor)
-                {
-                    ComPtr<IUnknown> resource;
-                    tensor->GetDataInterface(resource.GetAddressOf());
-                    if (resource)
-                    {
-                        resourcesToTransition.push_back(resource.Get());
-                    }
-                }
-            }
-
-            for (uint32_t i = 0; i < m_outputTensors.size(); ++i)
-            {
-                ComPtr<IMLOperatorTensor> tensor;
-                ORT_THROW_IF_FAILED(GetOutputTensor(i, tensor.GetAddressOf()));
-
-                ComPtr<IUnknown> resource;
-                tensor->GetDataInterface(resource.GetAddressOf());
-                if (resource)
-                {
-                    resourcesToTransition.push_back(resource.Get());
-                }
-            }
-
-            for (auto& tempAlloc : m_temporaryAbiAllocations)
-            {
-                resourcesToTransition.push_back(tempAlloc.Get());
-            }
-
-            m_winmlProvider->TransitionResourcesForOperator(
-                isBeforeOp,
-                gsl::narrow_cast<uint32_t>(resourcesToTransition.size()),
-                resourcesToTransition.data());
+            m_abiDataInterface->AddRef();
+            *dataInterface = m_abiDataInterface;
         }
     }
 
@@ -1457,8 +1403,6 @@ namespace Windows::AI::MachineLearning::Adapter
             {
                 m_winmlProvider->GetABIExecutionInterfaceAndInvalidateState(isInternalOperator, m_abiExecutionObject.ReleaseAndGetAddressOf());
             }
-
-            TransitionResourcesForOperatorIfRequired(true);
         }
     }
 
@@ -1471,18 +1415,12 @@ namespace Windows::AI::MachineLearning::Adapter
     {
         if (m_winmlProvider)
         {
-            m_temporaryAllocations.clear();
-            m_temporaryAbiAllocations.clear();
+            m_temporaryBuffers.clear();
         }
     }
 
     void OpKernelContextWrapper::Close()
     {
-        if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
-        {
-            TransitionResourcesForOperatorIfRequired(false);
-        }
-
         for (auto& tensor : m_inputTensors)
         {
             if (tensor)
@@ -1610,16 +1548,6 @@ namespace Windows::AI::MachineLearning::Adapter
     }
 
     HRESULT STDMETHODCALLTYPE OpKernelContextWrapper::AllocateTemporaryData(size_t size, IUnknown** abiAllocation) const
-    {
-        ORT_TRY
-        {
-            uint64_t allocId;
-            return AllocateTemporaryData(size, abiAllocation, &allocId);
-        }
-        ORT_CATCH_RETURN
-    }
-
-    HRESULT STDMETHODCALLTYPE OpKernelContextWrapper::AllocateTemporaryData(size_t size, IUnknown** abiAllocation, uint64_t* allocId) const
     {
         ORT_TRY
         {
@@ -1634,21 +1562,13 @@ namespace Windows::AI::MachineLearning::Adapter
                 return E_FAIL;
             }
 
-            ComPtr<IUnknown> allocation;
-            allocation.Attach(static_cast<IUnknown*>(alloc->Alloc(size)));
-
-            *allocId = m_winmlProvider->TryGetPooledAllocationId(allocation.Get(), 0);
-
-            TranslateAllocationDataToAbi(m_winmlProvider.Get(), alloc->Info(), allocation.Get(), abiAllocation);
-
-            if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
-            {
-                m_winmlProvider->TransitionResourcesForOperator(true, 1, abiAllocation);
-            }
+            auto dml_gpu_allocator = static_cast<Dml::DmlGpuAllocator*>(alloc.get());
+            auto buffer = dml_gpu_allocator->AllocateDefaultBuffer(size);
+            buffer.ResourceInUavState()->AddRef();
+            *abiAllocation = buffer.ResourceInUavState();
 
             // Ensure the allocation is freed and transitioned when the context destructs
-            m_temporaryAllocations.push_back(allocation);
-            m_temporaryAbiAllocations.push_back(*abiAllocation);
+            m_temporaryBuffers.push_back(std::move(buffer));
 
             return S_OK;
         }
@@ -1953,14 +1873,16 @@ namespace Windows::AI::MachineLearning::Adapter
             }
         }
 
-        ComPtr<OpKernelContextWrapper> kernelContextWrapper = wil::MakeOrThrow<OpKernelContextWrapper>(
-            context,
-            Info().GetExecutionProvider(),
-            m_internalOperator,
-            m_requiresOutputShapesAtCreation ? &m_inferredOutputShapes : nullptr);
+        {
+            ComPtr<OpKernelContextWrapper> kernelContextWrapper = wil::MakeOrThrow<OpKernelContextWrapper>(
+                context,
+                Info().GetExecutionProvider(),
+                m_internalOperator,
+                m_requiresOutputShapesAtCreation ? &m_inferredOutputShapes : nullptr);
 
-        ORT_THROW_IF_FAILED(m_kernel->Compute(kernelContextWrapper.Get()));
-        kernelContextWrapper->Close();
+            ORT_THROW_IF_FAILED(m_kernel->Compute(kernelContextWrapper.Get()));
+            kernelContextWrapper->Close();
+        }
 
         // Ensure that scheduled work, if any, is completed before freeing the kernel if the execution
         // provider requires this.
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index 7e308989791f8..31f7e3fbeee8b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -9,6 +9,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include <wrl/client.h>
 #include <wrl/implements.h>
+#include "core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h"
 
 interface IDMLOperator;
 
@@ -285,10 +286,7 @@ class TensorWrapper : public WRL::Base<IMLOperatorTensor>, public Closable
     void* m_tensorData = nullptr;
     bool m_isDataInterface = false;
 
-    // The returned data may be a converted shadow copy, and the piece of it which
-    // is returned may vary according to kernel registration options.
-    ComPtr<IUnknown> m_abiDataInterface;
-
+    ID3D12Resource* m_abiDataInterface;
 };
 
 class OnnxTensorWrapper : public WRL::Base<IMLOperatorTensor>, public Closable
@@ -449,9 +447,7 @@ class OpKernelContextWrapper : public WRL::Base<IMLOperatorKernelContext>, publi
     HRESULT STDMETHODCALLTYPE GetInputTensor(uint32_t inputIndex, IMLOperatorTensor** tensor) const noexcept override;
     HRESULT STDMETHODCALLTYPE GetOutputTensor(uint32_t outputIndex, IMLOperatorTensor** tensor) noexcept override;
     HRESULT STDMETHODCALLTYPE GetOutputTensor(uint32_t outputIndex, uint32_t dimensions, const uint32_t* dimensionSizes, IMLOperatorTensor** tensor) noexcept override;
-
     HRESULT STDMETHODCALLTYPE AllocateTemporaryData(size_t size, IUnknown** data) const;
-    HRESULT STDMETHODCALLTYPE AllocateTemporaryData(size_t size, IUnknown** data, uint64_t* allocId) const;
 
     void STDMETHODCALLTYPE GetExecutionInterface(IUnknown** executionInterface) const noexcept override;
 
@@ -481,8 +477,7 @@ class OpKernelContextWrapper : public WRL::Base<IMLOperatorKernelContext>, publi
 
     // Temporary allocations created by the kernel.  These will be freed to the allocator following
     // Compute being called on the kernel.  This list is used to maintain their lifetime.
-    mutable std::vector<ComPtr<IUnknown>> m_temporaryAllocations;
-    mutable std::vector<ComPtr<IUnknown>> m_temporaryAbiAllocations;
+    mutable std::vector<Dml::DmlBuffer> m_temporaryBuffers;
 };
 
 class AbiOpKernel : public onnxruntime::OpKernel
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index be5b6a1fe9ada..e545f33fdb8d5 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -416,23 +416,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         auto inputResource = loopList[0].Resource.Get();
         auto outputResource = loopList[stockhamParams.OutputIndex].Resource.Get();
 
-        // Transition resources from common to UAV state
-        D3D12_RESOURCE_BARRIER barriers[2];
-
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputResource,
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputResource,
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        commandList->ResourceBarrier(2, barriers);
-
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_rootSignature.Get());
         commandList->SetPipelineState(m_pipelineState.Get());
@@ -471,21 +454,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             constants.DFTIteration = index + 1;
             Dispatch(in, out, constants, commandList);
         }
-
-        // Transition resources to common state
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-                inputResource,
-                D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-                D3D12_RESOURCE_STATE_COMMON
-                );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-                outputResource,
-                D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-                D3D12_RESOURCE_STATE_COMMON
-                );
-
-        commandList->ResourceBarrier(2, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp
index 2d99c8a6dd6df..3801ff6c28404 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp
@@ -3,7 +3,7 @@
 
 #include "precomp.h"
 #include "DmlOperator.h"
-#include "../DmlManagedBufferRegion.h"
+#include "../DmlManagedBuffer.h"
 
 namespace Dml
 {
@@ -94,14 +94,9 @@ namespace Dml
             UINT64 persistentResourceSize = m_compiledOperator->GetBindingProperties().PersistentResourceSize;
             if (persistentResourceSize > 0)
             {
-                ComPtr<DmlManagedBufferRegion> managedBufferRegion;
-                ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource(
-                    static_cast<size_t>(persistentResourceSize),
-                    managedBufferRegion.GetAddressOf()));
-
-                managedBufferRegion.As(&m_persistentResourcePoolingUnk);
-                m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState();
-                m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding();
+                auto buffer = m_executionProvider->AllocatePooledResource(persistentResourceSize);
+                m_persistentResourceBinding = buffer.GetBufferBinding();
+                m_managedPersistentBuffer = wil::MakeOrThrow<DmlManagedBuffer>(std::move(buffer));
             }
 
             std::vector<DML_BUFFER_BINDING> initializationInputBindings(m_kernelInputIndices.size());
@@ -194,14 +189,9 @@ namespace Dml
             UINT64 persistentResourceSize = m_compiledOperator->GetBindingProperties().PersistentResourceSize;
             if (persistentResourceSize > 0)
             {
-                ComPtr<DmlManagedBufferRegion> managedBufferRegion;
-                ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource(
-                    static_cast<size_t>(persistentResourceSize),
-                    managedBufferRegion.GetAddressOf()));
-
-                managedBufferRegion.As(&m_persistentResourcePoolingUnk);
-                m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState();
-                m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding();
+                auto buffer = m_executionProvider->AllocatePooledResource(persistentResourceSize);
+                m_persistentResourceBinding = buffer.GetBufferBinding();
+                m_managedPersistentBuffer = wil::MakeOrThrow<DmlManagedBuffer>(std::move(buffer));
             }
 
             std::vector<DML_BUFFER_BINDING> initializationInputBindings(m_kernelInputIndices.size());
@@ -229,18 +219,11 @@ namespace Dml
         UINT64 persistentResourceSize = m_compiledOperator->GetBindingProperties().PersistentResourceSize;
         if (persistentResourceSize > 0)
         {
-            if (!m_persistentResource || m_persistentResource->GetDesc().Width < persistentResourceSize)
+            if (!m_managedPersistentBuffer || m_managedPersistentBuffer->SizeInBytes() < persistentResourceSize)
             {
-                m_persistentResource = nullptr;
-
-                ComPtr<DmlManagedBufferRegion> managedBufferRegion;
-                ORT_THROW_IF_FAILED(m_executionProvider->AllocatePooledResource(
-                    static_cast<size_t>(persistentResourceSize),
-                    managedBufferRegion.GetAddressOf()));
-
-                managedBufferRegion.As(&m_persistentResourcePoolingUnk);
-                m_persistentResource = managedBufferRegion->GetBufferRegion().ResourceInUavState();
-                m_persistentResourceBinding = managedBufferRegion->GetBufferRegion().GetBufferBinding();
+                auto buffer = m_executionProvider->AllocatePooledResource(persistentResourceSize);
+                m_persistentResourceBinding = buffer.GetBufferBinding();
+                m_managedPersistentBuffer = wil::MakeOrThrow<DmlManagedBuffer>(std::move(buffer));
             }
         }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h
index 493cc2e44577a..a5f880dd0ec24 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "OperatorUtility.h"
+#include "../DmlManagedBuffer.h"
 
 namespace Dml
 {
@@ -25,8 +26,7 @@ namespace Dml
         std::vector<TensorDesc> m_outputTensorDescs;
 
         ComPtr<IDMLCompiledOperator> m_compiledOperator;
-        ComPtr<ID3D12Resource> m_persistentResource;
-        ComPtr<IUnknown> m_persistentResourcePoolingUnk; // Controls when the persistent resource is returned to the pool
+        ComPtr<DmlManagedBuffer> m_managedPersistentBuffer;
         std::optional<DML_BUFFER_BINDING> m_persistentResourceBinding;
 
         void Initialize(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h
new file mode 100644
index 0000000000000..bff622726219a
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h
@@ -0,0 +1,97 @@
+/* Copyright (c) Microsoft Corporation.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include "dml_common.h"
+#include "tfdml/runtime_adapter/macros.h"
+
+namespace tfdml
+{
+
+class D3D12HeapAllocator;
+
+// Represents a region of a D3D12 buffer resource. A buffer region has an
+// underlying ID3D12Resource* (of D3D12_RESOURCE_DIMENSION_BUFFER), an offset in
+// bytes from the beginning of that buffer, and a size in bytes of the region.
+class D3D12BufferRegion
+{
+  public:
+    D3D12BufferRegion() = default;
+
+    // References a region of a buffer. The respective ID3D12Resource objects
+    // must be in the appropriate states. Each resource is optional, but if more
+    // than one are provided they must map to the same region of memory.
+    D3D12BufferRegion(
+        uint64_t offset,
+        uint64_t size_in_bytes,
+        ID3D12Resource* resource_uav_state,
+        ID3D12Resource* resource_copy_src_state,
+        ID3D12Resource* resource_copy_dst_state);
+
+    // Move-only
+    D3D12BufferRegion(const D3D12BufferRegion&) = delete;
+    D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete;
+    D3D12BufferRegion(D3D12BufferRegion&&);
+    D3D12BufferRegion& operator=(D3D12BufferRegion&&);
+
+    ID3D12Resource* ResourceInUavState() const;
+
+    // NOTE: may be any state that is valid as a copy source (COPY_SRC,
+    // GENERIC_READ, or COMMON).
+    ID3D12Resource* ResourceInCopySrcState() const;
+
+    ID3D12Resource* ResourceInCopyDstState() const;
+
+    uint64_t Offset() const;
+    uint64_t SizeInBytes() const;
+
+    DML_BUFFER_BINDING GetBufferBinding() const;
+
+    explicit operator bool() const { return first_valid_resource_ != nullptr; }
+
+    // Creates a subregion at an offset from the start of this region. If no
+    // size is provided the region runs to the end of the current region.
+    inline D3D12BufferRegion Subregion(
+        uint64_t offset,
+        uint64_t size_in_bytes = 0) const
+    {
+        // start of subregion must be within current region
+        CHECK(offset < size_in_bytes_);
+        size_in_bytes =
+            size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
+        // end of subregion must be within current region
+        CHECK(size_in_bytes <= size_in_bytes_ - offset);
+
+        return D3D12BufferRegion(
+            offset_ + offset,
+            size_in_bytes,
+            resource_uav_state_,
+            resource_copy_src_state_,
+            resource_copy_dst_state_);
+    }
+
+  private:
+    ID3D12Resource* resource_uav_state_ = nullptr;
+    ID3D12Resource* resource_copy_src_state_ = nullptr;
+    ID3D12Resource* resource_copy_dst_state_ = nullptr;
+    uint64_t offset_ = 0;
+    uint64_t size_in_bytes_ = 0;
+
+    // Pointer to the first resource above that isn't null.
+    ID3D12Resource* first_valid_resource_ = nullptr;
+};
+
+} // namespace tfdml

From b71a5ffa16f4d0eecdccfca007e65744f51961d6 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Fri, 27 Jan 2023 14:16:33 -0800
Subject: [PATCH 18/76] WIP: Replace GetABIDataInterface with GetBufferRegion and remove dml_buffer_region.h

---
 .../inc/IWinmlExecutionProvider.h             |  8 +-
 .../src/DmlBufferRegion.h                     |  4 +-
 .../src/DmlGraphFusionHelper.cpp              |  9 +-
 .../src/DmlGraphFusionHelper.h                |  4 +-
 .../src/ExecutionProvider.cpp                 |  4 +-
 .../src/ExecutionProvider.h                   |  2 +-
 .../src/FusedGraphKernel.cpp                  | 11 +--
 .../src/MLOperatorAuthorImpl.cpp              | 47 +++++----
 .../src/MLOperatorAuthorImpl.h                |  5 +
 .../src/Operators/DmlDFT.h                    | 58 ++++-------
 .../src/dml_buffer_region.h                   | 97 -------------------
 11 files changed, 68 insertions(+), 181 deletions(-)
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
index ccde56e5d712d..198d38c348c87 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
@@ -9,6 +9,7 @@
 #include <optional>
 
 #include "core/framework/op_kernel.h"
+#include "core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h"
 
 struct AbstractOperatorDesc;
 interface IMLOperatorTensor;
@@ -22,11 +23,6 @@ namespace onnxruntime
     class Node;
 }
 
-namespace Dml
-{
-    class DmlManagedBufferRegion;
-}
-
 namespace Windows::AI::MachineLearning::Adapter
 {
     interface __declspec(uuid("5b19a18a-5ed5-4df2-a363-21b89380a698"))
@@ -39,7 +35,7 @@ namespace Windows::AI::MachineLearning::Adapter
         // the provider's underlying queues.
         virtual void QueueReference(IUnknown *object) = 0;
 
-        virtual ID3D12Resource* GetABIDataInterface(void* data) const = 0;
+        virtual Dml::D3D12BufferRegion GetBufferRegion(void* data, uint64_t size) const = 0;
 
         virtual uint64_t TryGetPooledAllocationId(
             void* data,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
index 29a6bf6f7c775..dee01a29fe55f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
@@ -24,8 +24,8 @@ namespace Dml
             ID3D12Resource* resource_copy_dst_state);
 
         // Move-only
-        D3D12BufferRegion(const D3D12BufferRegion&) = delete;
-        D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete;
+        D3D12BufferRegion(const D3D12BufferRegion&) = default;
+        D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default;
         D3D12BufferRegion(D3D12BufferRegion&&);
         D3D12BufferRegion& operator=(D3D12BufferRegion&&);
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index ffd388f91cace..58dd7314b929f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "DmlGraphFusionHelper.h"
+#include "DmlBufferRegion.h"
 
 
 namespace Dml
@@ -88,18 +89,14 @@ namespace DmlGraphFusionHelper
         return buffer;
     }
 
-    void UnwrapTensor(
+    D3D12BufferRegion UnwrapTensor(
         Windows::AI::MachineLearning::Adapter::IWinmlExecutionProvider* winmlProvider,
         const onnxruntime::Tensor* tensor,
-        ID3D12Resource** resource,
         uint64_t* allocId)
     {
         void* opaqueData = const_cast<void*>(tensor->DataRaw());
-        ID3D12Resource* abiDataInterface = winmlProvider->GetABIDataInterface(opaqueData);
-        abiDataInterface->AddRef();
-
         *allocId = winmlProvider->TryGetPooledAllocationId(opaqueData, 0);
-        *resource = abiDataInterface;
+        return winmlProvider->GetBufferRegion(opaqueData, tensor->SizeInBytes());
     }
 
     void ProcessInputData(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h
index f2533bb37bccb..593bd9b563ab6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h
@@ -6,6 +6,7 @@
 #include "GraphPartitioner.h"
 #include "FusedGraphKernel.h"
 #include "MLOperatorAuthorImpl.h"
+#include "DmlBufferRegion.h"
 
 
 namespace Dml
@@ -33,10 +34,9 @@ namespace DmlGraphFusionHelper
         const std::byte* tensorPtr,
         size_t tensorByteSize);
 
-    void UnwrapTensor(
+    D3D12BufferRegion UnwrapTensor(
         Windows::AI::MachineLearning::Adapter::IWinmlExecutionProvider* winmlProvider,
         const onnxruntime::Tensor* tensor,
-        ID3D12Resource** resource,
         uint64_t* allocId);
 
     std::unordered_map<const onnx::TensorProto*, std::vector<uint32_t>>
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 20f8feed12311..0f4fe3788cbd0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -850,9 +850,9 @@ namespace Dml
         m_context->QueueReference(object);
     }
 
-    ID3D12Resource* ExecutionProviderImpl::GetABIDataInterface(void* data) const
+    D3D12BufferRegion ExecutionProviderImpl::GetBufferRegion(void* data, uint64_t size) const
     {
-        return m_gpuAllocator->GetAllocationInfo(data)->GetUavResource();
+        return m_gpuAllocator->CreateBufferRegion(data, size);
     }
 
     uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(void* data, bool isInternalOperator)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index 20cb307b1cdb4..5ffced302d5c5 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -97,7 +97,7 @@ namespace Dml
         // IWinmlExecutionProvider methods
         void QueueReference(IUnknown* object) override;
 
-        ID3D12Resource* GetABIDataInterface(void* data) const override;
+        D3D12BufferRegion GetBufferRegion(void* data, uint64_t size) const override;
 
        uint64_t TryGetPooledAllocationId(void* data, bool isInternalOperator) override;
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index 9ecaae2c50394..5f29bae1b4fdc 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -302,10 +302,10 @@ namespace Dml
                         const onnxruntime::Tensor* tensor = kernelContext->Input<onnxruntime::Tensor>(i);
 
                         uint64_t allocId;
-                        DmlGraphFusionHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &inputBindings[i].Buffer, &allocId);
+                        auto bufferRegion = DmlGraphFusionHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &allocId);
+
+                        inputBindings[i] = bufferRegion.GetBufferBinding();
                         inputBindingsChanged = inputBindingsChanged || (!allocId || m_inputBindingAllocIds[i] != allocId);
-                        inputBindings[i].Buffer->Release(); // Avoid holding an additional reference
-                        inputBindings[i].SizeInBytes = DmlGraphFusionHelper::AlignToPow2<size_t>(tensor->SizeInBytes(), 4);
                         inputBindingDescs[i] = {DML_BINDING_TYPE_BUFFER, &inputBindings[i]};
                         m_inputBindingAllocIds[i] = allocId;
                     }
@@ -339,10 +339,9 @@ namespace Dml
                     );
 
                 uint64_t allocId;
-                DmlGraphFusionHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &outputBindings[i].Buffer, &allocId);
+                auto bufferRegion = DmlGraphFusionHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &allocId);
+                outputBindings[i] = bufferRegion.GetBufferBinding();
                 outputBindingsChanged = outputBindingsChanged || (!allocId || m_outputBindingAllocIds[i] != allocId);
-                outputBindings[i].Buffer->Release(); // Avoid holding an additional reference
-                outputBindings[i].SizeInBytes = DmlGraphFusionHelper::AlignToPow2<size_t>(tensor->SizeInBytes(), 4);
                 outputBindingDescs[i] = {DML_BINDING_TYPE_BUFFER, &outputBindings[i]};
                 m_outputBindingAllocIds[i] = allocId;
             }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 55a33cc8513b5..082b193840e94 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -7,10 +7,11 @@
 #include "core/framework/execution_frame.h"
 #include "core/session/onnxruntime_c_api.h"
 #include "core/providers/dml/DmlExecutionProvider/inc/MLOperatorAuthor.h"
+#include "DmlBufferRegion.h"
 
 #include "MLOperatorAuthorImpl.h"
 #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h"
-#include "core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h"
+#include "DmlGpuAllocator.h"
 
 using namespace Microsoft::WRL;
 
@@ -100,14 +101,6 @@ namespace Windows::AI::MachineLearning::Adapter
         return strcmp(info.name, onnxruntime::CPU) && !(info.mem_type == ::OrtMemType::OrtMemTypeCPUOutput || info.mem_type == ::OrtMemType::OrtMemTypeCPUInput);
     }
 
-    // Translate the data object stored in a tensor to the type which will be returned through
-    // the ABI. The translation is determined by the provider and based on options with which the
-    // kernels are registered.
-    ID3D12Resource* TranslateAllocationDataToAbi(IWinmlExecutionProvider* winmlProvider, void* opaqueData)
-    {
-        return winmlProvider->GetABIDataInterface(opaqueData);
-    }
-
     //
     // Traits for numeric attribute types
     //
@@ -1271,19 +1264,14 @@ namespace Windows::AI::MachineLearning::Adapter
         if (impl)
         {
             m_tensorData = m_impl->MutableDataRaw();
-
-            if (isDataInterface)
-            {
-                if (m_tensorData)
-                {
-                    // Get the actual object to be returned from the ABI, which varies for internal and external
-                    // kernels (i.e. ID3D12Resource, versus something that tracks the layout).
-                    m_abiDataInterface = TranslateAllocationDataToAbi(m_winmlExecutionProvider.Get(), m_tensorData);
-                }
-            }
         }
     }
 
+    Dml::D3D12BufferRegion TensorWrapper::GetBufferRegion() const
+    {
+        return m_winmlExecutionProvider->GetBufferRegion(m_tensorData, m_impl->SizeInBytes());
+    }
+
     uint32_t STDMETHODCALLTYPE TensorWrapper::GetDimensionCount() const noexcept
     {
         if (IsClosed())
@@ -1370,8 +1358,9 @@ namespace Windows::AI::MachineLearning::Adapter
         }
         else
         {
-            m_abiDataInterface->AddRef();
-            *dataInterface = m_abiDataInterface;
+            auto bufferRegion = GetBufferRegion();
+            bufferRegion.ResourceInUavState()->AddRef();
+            *dataInterface = bufferRegion.ResourceInUavState();
         }
     }
 
@@ -1575,6 +1564,22 @@ namespace Windows::AI::MachineLearning::Adapter
         ORT_CATCH_RETURN
     }
 
+    const Dml::D3D12BufferRegion& OpKernelContextWrapper::AllocateDefaultBuffer(uint64_t size)
+    {
+        VerifyNotClosed();
+        // Allocate `size` bytes through the kernel's temp-space allocator; throws if that allocator cannot be obtained
+        onnxruntime::AllocatorPtr alloc;
+        THROW_IF_NOT_OK(m_impl->GetTempSpaceAllocator(&alloc));
+        // Only downcast to DmlGpuAllocator once the allocator has passed IsAllocationInterface (E_FAIL otherwise)
+        ORT_THROW_HR_IF(E_FAIL, !IsAllocationInterface(alloc->Info()));
+        auto dml_gpu_allocator = static_cast<Dml::DmlGpuAllocator*>(alloc.get());
+        auto buffer = dml_gpu_allocator->AllocateDefaultBuffer(size);
+        // NOTE(review): presumably the returned region lives inside m_temporaryBuffers, so a later call could invalidate it; callers should copy it - confirm
+        // Ensure the allocation is freed and transitioned when the context destructs
+        m_temporaryBuffers.push_back(std::move(buffer));
+        return m_temporaryBuffers.back().Region();
+    }
+
     void STDMETHODCALLTYPE OpKernelContextWrapper::GetExecutionInterface(IUnknown** executionInterface) const noexcept
     {
         m_abiExecutionObject.CopyTo(executionInterface);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index 31f7e3fbeee8b..c8a11b01defda 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -10,6 +10,8 @@
 #include <wrl/client.h>
 #include <wrl/implements.h>
 #include "core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h"
+#include "DmlBufferRegion.h"
+#include "DmlBuffer.h"
 
 interface IDMLOperator;
 
@@ -265,6 +267,8 @@ class TensorWrapper : public WRL::Base<IMLOperatorTensor>, public Closable
 
     MLOperatorTensorDataType STDMETHODCALLTYPE GetTensorDataType() const noexcept override;
 
+    Dml::D3D12BufferRegion GetBufferRegion() const;
+
     bool STDMETHODCALLTYPE IsCpuData() const noexcept override;
 
     bool STDMETHODCALLTYPE IsDataInterface() const noexcept override;
@@ -455,6 +459,7 @@ class OpKernelContextWrapper : public WRL::Base<IMLOperatorKernelContext>, publi
 
     std::vector<IMLOperatorTensor*> GetInputTensors();
     std::vector<IMLOperatorTensor*> GetOutputTensors(const EdgeShapes& outputShapes);
+    const Dml::D3D12BufferRegion& AllocateDefaultBuffer(uint64_t size);
 
  protected:
     void ClearTempAllocations();
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index e545f33fdb8d5..5aabea1eeedf5 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -2,9 +2,10 @@
 
 #include "../MLOperatorAuthorImpl.h"
 #include "../../../OperatorAuthorHelper/OperatorHelper.h"
-
 #include "../External/D3DX12/d3dx12.h"
 
+#include "../DmlBufferRegion.h"
+
 // The shader header is produced using "fxc.exe dft_shader.hlsl -E DFT -T cs_5_0 -Zi /Fh"
 #include "GeneratedShaders/stockham.h"
 
@@ -76,7 +77,7 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         // Allocate temporary buffers if needed
         struct ResourceDesc
         {
-            ComPtr<ID3D12Resource> Resource;
+            Dml::D3D12BufferRegion BufferRegion;
             std::array<uint32_t, 4> Sizes;
             std::array<uint32_t, 4> Strides;
         };
@@ -266,7 +267,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         ComPtr<IMLOperatorTensor> outputTensor;
         ORT_THROW_IF_FAILED(context->GetOutputTensor(0, &outputTensor));
         auto outputDims = GetTensorDimensions(outputTensor.Get());
-
         ORT_THROW_HR_IF(E_FAIL, inputDims.size() != outputDims.size());
 
         // Get optional dft_length input
@@ -292,16 +292,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             params.Type = DFTType::Stockham;
             params.StockhamParams = {};
 
-            ComPtr<IUnknown> inputUnknown;
-            ComPtr<ID3D12Resource> inputResource;
-            inputTensor->GetDataInterface(inputUnknown.GetAddressOf());
-            inputUnknown.As(&inputResource);
-
-            ComPtr<IUnknown> outputUnknown;
-            ComPtr<ID3D12Resource> outputResource;
-            outputTensor->GetDataInterface(outputUnknown.GetAddressOf());
-            outputUnknown.As(&outputResource);
-
             // { before_dft_axis, axis, after_dft_axis, real_or_complex }
             std::array<uint32_t, 4> reshapedInputSize = { 1, 1, 1, inputDims.back() };
             std::array<uint32_t, 4> reshapedOutputSize = { 1, 1, 1, outputDims.back() };
@@ -349,11 +339,14 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
 
             // Create the resource loop list
             // Add the input resource to the loop list
+            auto inputTensorWrapper = static_cast<Windows::AI::MachineLearning::Adapter::TensorWrapper*>(inputTensor.Get());
             params.StockhamParams.ResourceLoopList.push_back({});
-            params.StockhamParams.ResourceLoopList.back().Resource = inputResource;
+            params.StockhamParams.ResourceLoopList.back().BufferRegion = inputTensorWrapper->GetBufferRegion();
             params.StockhamParams.ResourceLoopList.back().Sizes = reshapedInputSize;
             params.StockhamParams.ResourceLoopList.back().Strides = reshapedInputStrides;
 
+            auto kernelContext = static_cast<Windows::AI::MachineLearning::Adapter::OpKernelContextWrapper*>(context);
+
             // If 1 temporary should be placed first, or multiple temporaries, then
             // Add a temp in the list
             if (oscillateFirstTemporaryThenOutput || oscillateBetweenTwoTemporaries)
@@ -361,9 +354,7 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
                 params.StockhamParams.ResourceLoopList.push_back({});
                 params.StockhamParams.ResourceLoopList.back().Sizes = temporarySize;
                 params.StockhamParams.ResourceLoopList.back().Strides = temporaryStrides;
-
-                auto& resource = params.StockhamParams.ResourceLoopList.back().Resource;
-                ORT_THROW_IF_FAILED(context->AllocateTemporaryData(temporaryBufferByteSize, &resource));
+                auto& resource = params.StockhamParams.ResourceLoopList.back().BufferRegion = kernelContext->AllocateDefaultBuffer(temporaryBufferByteSize);
             }
 
             // If 2 temps, add another
@@ -372,14 +363,13 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
                 params.StockhamParams.ResourceLoopList.push_back({});
                 params.StockhamParams.ResourceLoopList.back().Sizes = temporarySize;
                 params.StockhamParams.ResourceLoopList.back().Strides = temporaryStrides;
-
-                auto& resource = params.StockhamParams.ResourceLoopList.back().Resource;
-                ORT_THROW_IF_FAILED(context->AllocateTemporaryData(temporaryBufferByteSize, &resource));
+                auto& resource = params.StockhamParams.ResourceLoopList.back().BufferRegion = kernelContext->AllocateDefaultBuffer(temporaryBufferByteSize);
             }
 
             // Add output resource
+            auto outputTensorWrapper = static_cast<Windows::AI::MachineLearning::Adapter::TensorWrapper*>(outputTensor.Get());
             params.StockhamParams.ResourceLoopList.push_back({});
-            params.StockhamParams.ResourceLoopList.back().Resource = outputResource;
+            params.StockhamParams.ResourceLoopList.back().BufferRegion = outputTensorWrapper->GetBufferRegion();
             params.StockhamParams.ResourceLoopList.back().Sizes = reshapedOutputSize;
             params.StockhamParams.ResourceLoopList.back().Strides = reshapedOutputStrides;
             params.StockhamParams.OutputIndex = static_cast<uint32_t>(params.StockhamParams.ResourceLoopList.size() - 1);
@@ -390,9 +380,7 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
                 params.StockhamParams.ResourceLoopList.push_back({});
                 params.StockhamParams.ResourceLoopList.back().Sizes = temporarySize;
                 params.StockhamParams.ResourceLoopList.back().Strides = temporaryStrides;
-
-                auto& resource = params.StockhamParams.ResourceLoopList.back().Resource;
-                ORT_THROW_IF_FAILED(context->AllocateTemporaryData(temporaryBufferByteSize, &resource));
+                auto& resource = params.StockhamParams.ResourceLoopList.back().BufferRegion = kernelContext->AllocateDefaultBuffer(temporaryBufferByteSize);
             }
 
             // Define the loop range
@@ -413,8 +401,8 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& loopList = stockhamParams.ResourceLoopList;
 
         // Get input and output resources
-        auto inputResource = loopList[0].Resource.Get();
-        auto outputResource = loopList[stockhamParams.OutputIndex].Resource.Get();
+        auto inputResource = loopList[0].BufferRegion.ResourceInUavState();
+        auto outputResource = loopList[stockhamParams.OutputIndex].BufferRegion.ResourceInUavState();
 
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_rootSignature.Get());
@@ -432,11 +420,11 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             auto inIdx = stockhamParams.LoopRange.CalculateIndex(index);
             auto outIdx = stockhamParams.LoopRange.CalculateIndex(index + 1);
 
-            auto in = loopList[inIdx].Resource.Get();
+            const auto& in = loopList[inIdx].BufferRegion;
             std::copy(loopList[inIdx].Sizes.begin(), loopList[inIdx].Sizes.end(), constants.InputSizes);
             std::copy(loopList[inIdx].Strides.begin(), loopList[inIdx].Strides.end(), constants.InputStrides);
 
-            auto out = loopList[outIdx].Resource.Get();
+            const auto& out = loopList[outIdx].BufferRegion;
             std::copy(loopList[outIdx].Sizes.begin(), loopList[outIdx].Sizes.end(), constants.OutputSizes);
             std::copy(loopList[outIdx].Strides.begin(), loopList[outIdx].Strides.end(), constants.OutputStrides);
 
@@ -465,24 +453,20 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
     }
 
     void Dispatch(
-        ID3D12Resource* inputResource,
-        ID3D12Resource* outputResource,
+        const Dml::D3D12BufferRegion& inputBufferRegion,
+        const Dml::D3D12BufferRegion& outputBufferRegion,
         DFTShaderConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
-        D3D12_RESOURCE_BARRIER uav_barriers[2];
-        uav_barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(inputResource);
-        uav_barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(outputResource);
-        commandList->ResourceBarrier(2, uav_barriers);
         // Set resource views
         commandList->SetComputeRootUnorderedAccessView(
             0, // root parameter index
-            inputResource->GetGPUVirtualAddress()
+            inputBufferRegion.ResourceInUavState()->GetGPUVirtualAddress() + inputBufferRegion.Offset()
         );
 
         commandList->SetComputeRootUnorderedAccessView(
             1, // root parameter index
-            outputResource->GetGPUVirtualAddress()
+            outputBufferRegion.ResourceInUavState()->GetGPUVirtualAddress() + outputBufferRegion.Offset()
         );
         auto pendingElementCount = constants.ElementCount;
 
@@ -512,8 +496,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
 
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
-
-        commandList->ResourceBarrier(2, uav_barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h
deleted file mode 100644
index bff622726219a..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/dml_buffer_region.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) Microsoft Corporation.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#pragma once
-
-#include "dml_common.h"
-#include "tfdml/runtime_adapter/macros.h"
-
-namespace tfdml
-{
-
-class D3D12HeapAllocator;
-
-// Represents a region of a D3D12 buffer resource. A buffer region has an
-// underlying ID3D12Resource* (of D3D12_RESOURCE_DIMENSION_BUFFER), an offset in
-// bytes from the beginning of that buffer, and a size in bytes of the region.
-class D3D12BufferRegion
-{
-  public:
-    D3D12BufferRegion() = default;
-
-    // References a region of a buffer. The respective ID3D12Resource objects
-    // must be in the appropriate states. Each resource is optional, but if more
-    // than one are provided they must map to the same region of memory.
-    D3D12BufferRegion(
-        uint64_t offset,
-        uint64_t size_in_bytes,
-        ID3D12Resource* resource_uav_state,
-        ID3D12Resource* resource_copy_src_state,
-        ID3D12Resource* resource_copy_dst_state);
-
-    // Move-only
-    D3D12BufferRegion(const D3D12BufferRegion&) = delete;
-    D3D12BufferRegion& operator=(const D3D12BufferRegion&) = delete;
-    D3D12BufferRegion(D3D12BufferRegion&&);
-    D3D12BufferRegion& operator=(D3D12BufferRegion&&);
-
-    ID3D12Resource* ResourceInUavState() const;
-
-    // NOTE: may be any state that is valid as a copy source (COPY_SRC,
-    // GENERIC_READ, or COMMON).
-    ID3D12Resource* ResourceInCopySrcState() const;
-
-    ID3D12Resource* ResourceInCopyDstState() const;
-
-    uint64_t Offset() const;
-    uint64_t SizeInBytes() const;
-
-    DML_BUFFER_BINDING GetBufferBinding() const;
-
-    explicit operator bool() const { return first_valid_resource_ != nullptr; }
-
-    // Creates a subregion at an offset from the start of this region. If no
-    // size is provided the region runs to the end of the current region.
-    inline D3D12BufferRegion Subregion(
-        uint64_t offset,
-        uint64_t size_in_bytes = 0) const
-    {
-        // start of subregion must be within current region
-        CHECK(offset < size_in_bytes_);
-        size_in_bytes =
-            size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
-        // end of subregion must be within current region
-        CHECK(size_in_bytes <= size_in_bytes_ - offset);
-
-        return D3D12BufferRegion(
-            offset_ + offset,
-            size_in_bytes,
-            resource_uav_state_,
-            resource_copy_src_state_,
-            resource_copy_dst_state_);
-    }
-
-  private:
-    ID3D12Resource* resource_uav_state_ = nullptr;
-    ID3D12Resource* resource_copy_src_state_ = nullptr;
-    ID3D12Resource* resource_copy_dst_state_ = nullptr;
-    uint64_t offset_ = 0;
-    uint64_t size_in_bytes_ = 0;
-
-    // Pointer to the first resource above that isn't null.
-    ID3D12Resource* first_valid_resource_ = nullptr;
-};
-
-} // namespace tfdml

From 06caff8a2a9767b4a1e3130c35a80982c8425ce4 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Fri, 27 Jan 2023 16:13:28 -0800
Subject: [PATCH 19/76] WIP

---
 .../dml/DmlExecutionProvider/src/Operators/DmlDFT.h        | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index 5aabea1eeedf5..403e660b0e08c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -458,6 +458,11 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         DFTShaderConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
+        D3D12_RESOURCE_BARRIER uav_barriers[2];
+        uav_barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(inputBufferRegion.ResourceInUavState());
+        uav_barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(outputBufferRegion.ResourceInUavState());
+        commandList->ResourceBarrier(2, uav_barriers);
+
         // Set resource views
         commandList->SetComputeRootUnorderedAccessView(
             0, // root parameter index
@@ -496,6 +501,8 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
 
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
+
+        commandList->ResourceBarrier(2, uav_barriers);
     }
 };
 

From e7667f1852c210dd04060780bc62f46b10181fa9 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Fri, 27 Jan 2023 16:31:33 -0800
Subject: [PATCH 20/76] WIP

---
 .../dml/DmlExecutionProvider/src/Operators/DmlDFT.h        | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index 403e660b0e08c..aead38c872e45 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -458,10 +458,11 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         DFTShaderConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
-        D3D12_RESOURCE_BARRIER uav_barriers[2];
+        D3D12_RESOURCE_BARRIER uav_barriers[3];
         uav_barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(inputBufferRegion.ResourceInUavState());
         uav_barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(outputBufferRegion.ResourceInUavState());
-        commandList->ResourceBarrier(2, uav_barriers);
+        uav_barriers[2] = CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr);
+        commandList->ResourceBarrier(3, uav_barriers);
 
         // Set resource views
         commandList->SetComputeRootUnorderedAccessView(
@@ -502,7 +503,7 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        commandList->ResourceBarrier(2, uav_barriers);
+        commandList->ResourceBarrier(3, uav_barriers);
     }
 };
 

From a95d434117af203bb41abdcf1b9f4c64b989f834 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Fri, 27 Jan 2023 23:44:28 -0800
Subject: [PATCH 21/76] WIP

---
 .../onnxruntime/core/framework/ortdevice.h    |  1 +
 onnxruntime/core/framework/allocator.cc       |  2 +-
 .../inc/DmlExecutionProvider.h                |  2 +-
 .../inc/IWinmlExecutionProvider.h             |  9 +++++++--
 .../src/BucketizedBufferAllocator.cpp         | 18 +++++-------------
 .../src/BucketizedBufferAllocator.h           |  5 +++--
 .../src/DmlBfcAllocator.h                     |  4 ++--
 .../DmlExecutionProvider/src/DmlBuffer.cpp    |  3 ++-
 .../src/DmlGpuAllocator.cpp                   |  9 +++++----
 .../src/DmlGpuAllocator.h                     |  5 +++--
 .../src/DmlGraphFusionHelper.cpp              | 16 ++++++++++++++--
 .../src/ExecutionProvider.cpp                 | 19 +++++++++----------
 .../src/ExecutionProvider.h                   |  5 +++--
 .../src/MLOperatorAuthorImpl.cpp              | 11 ++++++++++-
 .../providers/dml/dml_provider_factory.cc     |  5 +++--
 15 files changed, 69 insertions(+), 45 deletions(-)

diff --git a/include/onnxruntime/core/framework/ortdevice.h b/include/onnxruntime/core/framework/ortdevice.h
index 77f7c3e1743f0..962445182e6ee 100644
--- a/include/onnxruntime/core/framework/ortdevice.h
+++ b/include/onnxruntime/core/framework/ortdevice.h
@@ -23,6 +23,7 @@ struct OrtDevice {
     static const MemoryType CUDA_PINNED = 1;
     static const MemoryType HIP_PINNED = 2;
     static const MemoryType CANN_PINNED = 3;
+    static const MemoryType DML_EXTERNAL = 4;
   };
 
   constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_)
diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc
index a0d20974624be..08e1221cb7977 100644
--- a/onnxruntime/core/framework/allocator.cc
+++ b/onnxruntime/core/framework/allocator.cc
@@ -152,7 +152,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
         id1, mem_type1);
   } else if (strcmp(name1, onnxruntime::DML) == 0) {
     *out = new OrtMemoryInfo(
-        onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)),
+        onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, static_cast<OrtDevice::DeviceId>(id1)),
         id1, mem_type1);
   } else {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported.");
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
index fe07ccf08899e..c062ff81d1330 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
@@ -30,7 +30,7 @@ namespace Dml
         ID3D12CommandQueue* commandQueue,
         bool enableMetacommands = true);
 
-    ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr);
+    ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer);
     void FlushContext(onnxruntime::IExecutionProvider* provider);
     void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode);
     void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
index 198d38c348c87..a3acff6b2f4ae 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
@@ -23,6 +23,11 @@ namespace onnxruntime
     class Node;
 }
 
+namespace Dml
+{
+    struct TaggedPointer;
+}
+
 namespace Windows::AI::MachineLearning::Adapter
 {
     interface __declspec(uuid("5b19a18a-5ed5-4df2-a363-21b89380a698"))
@@ -35,10 +40,10 @@ namespace Windows::AI::MachineLearning::Adapter
         // the provider's underlying queues.
         virtual void QueueReference(IUnknown *object) = 0;
 
-        virtual Dml::D3D12BufferRegion GetBufferRegion(void* data, uint64_t size) const = 0;
+        virtual Dml::D3D12BufferRegion GetBufferRegion(const Dml::TaggedPointer& taggedPointer, uint64_t size) const = 0;
 
         virtual uint64_t TryGetPooledAllocationId(
-            void* data,
+            const Dml::TaggedPointer& taggedPointer,
             bool isInternalOperator) = 0;
 
         virtual void GetABIExecutionInterfaceAndInvalidateState(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index b0ddd29e6bf46..e2ae8f23a3744 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -380,18 +380,14 @@ namespace Dml
     }
 
     D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(
-        const void* ptr,
+        const TaggedPointer& taggedPointer,
         uint64_t size_in_bytes)
     {
-        ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr);
-
-        TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr);
-
         // We need to access (mutable) state after this point, so we need to lock
         std::unique_lock<std::mutex> lock(mutex_);
 
         // Find the allocation corresponding to this pointer
-        auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
+        auto it = allocations_by_id_.find(taggedPointer.allocation_id);
         ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end());
 
         // Make sure that we are aligned to 4 bytes to satisfy DML's requirements
@@ -400,24 +396,24 @@ namespace Dml
             (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT;
 
         return D3D12BufferRegion(
-            tagged_ptr.offset,
+            taggedPointer.offset,
             size_in_bytes,
             it->second->GetUavResource(),
             it->second->GetCopySrcResource(),
             it->second->GetCopyDstResource());
     }
 
-    AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(const void* ptr)
+    // Returns the allocation registered under taggedPointer.allocation_id.
+    // Throws E_INVALIDARG when no such allocation exists, matching CreateBufferRegion.
+    AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
     {
-        ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr);
-
-        TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr);
-
         // We need to access (mutable) state after this point, so we need to lock
         std::unique_lock<std::mutex> lock(mutex_);
 
         // Find the allocation corresponding to this pointer
-        auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
+        auto it = allocations_by_id_.find(taggedPointer.allocation_id);
+        // An unknown id yields end(); dereferencing it is undefined behavior, so reject it first
+        ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end());
         return it->second.Get();
     }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index f2c09dfa0cfc4..370a7a6ff1e8d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -11,6 +11,7 @@ namespace Dml
 {
     class BucketizedBufferAllocator;
     class BucketizedBufferAllocator;
+    struct TaggedPointer;
 
     // An allocator that makes logically contiguous allocations backed by D3D heaps.
     //
@@ -60,10 +61,10 @@ namespace Dml
         // than a call to ID3D12Device::CreatePlacedResource or
         // CreateReservedResource.
         D3D12BufferRegion CreateBufferRegion(
-            const void* ptr,
+            const TaggedPointer& taggedPointer,
             uint64_t size_in_bytes);
 
-        AllocationInfo* GetAllocationInfo(const void* ptr);
+        AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer);
 
         void* Alloc(size_t size_in_bytes);
         void Free(void* ptr);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
index f43aa769af0a9..c00b820434592 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
@@ -21,8 +21,8 @@ namespace Dml
         ),
         m_subAllocator(std::move(subAllocator)) {}
 
-        void* Alloc(size_t size_in_bytes) { return m_subAllocator->Alloc(size_in_bytes); }
-        void Free(void* ptr) { m_subAllocator->Free(ptr); }
+        void* Alloc(size_t size_in_bytes) final { return m_subAllocator->Alloc(size_in_bytes); }
+        void Free(void* ptr) final { m_subAllocator->Free(ptr); }
     private:
         std::shared_ptr<BucketizedBufferAllocator> m_subAllocator;
     };
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
index 6f587261553e6..c5fa576d24a0f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
@@ -4,6 +4,7 @@
 #include "precomp.h"
 #include "DmlBuffer.h"
 #include "DmlGpuAllocator.h"
+#include "DmlTaggedPointer.h"
 
 namespace Dml
 {
@@ -14,7 +15,7 @@ namespace Dml
     m_opaqueData = allocator_->Alloc(size_in_bytes);
     ORT_THROW_HR_IF(E_OUTOFMEMORY, m_opaqueData == nullptr);
 
-    buffer_region_ = allocator_->CreateBufferRegion(m_opaqueData, size_in_bytes);
+    buffer_region_ = allocator_->CreateBufferRegion(TaggedPointer::Unpack(m_opaqueData), size_in_bytes);
 }
 
 DmlBuffer::~DmlBuffer()
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
index 13e0d8dfe96f7..5370515afffd1 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
@@ -7,6 +7,7 @@
 #include "DmlGpuAllocator.h"
 #include "core/framework/allocator.h"
 #include "BucketizedBufferAllocator.h"
+#include "DmlTaggedPointer.h"
 
 namespace Dml
 {
@@ -31,14 +32,14 @@ namespace Dml
         m_bfcAllocator->Free(ptr);
     }
 
-    D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(const void* ptr, uint64_t size_in_bytes)
+    D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes)
     {
-        return m_subAllocator->CreateBufferRegion(ptr, size_in_bytes);
+        return m_subAllocator->CreateBufferRegion(taggedPointer, size_in_bytes);
     }
 
-    AllocationInfo* DmlGpuAllocator::GetAllocationInfo(const void* ptr)
+    AllocationInfo* DmlGpuAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
     {
-        return m_subAllocator->GetAllocationInfo(ptr);
+        return m_subAllocator->GetAllocationInfo(taggedPointer);
     }
 
     void DmlGpuAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
index 5ef9ea855753f..3bc8127598460 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
@@ -11,6 +11,7 @@ namespace Dml
 {
     class BucketizedBufferAllocator;
     class AllocationInfo;
+    struct TaggedPointer;
 
     class DmlGpuAllocator : public onnxruntime::IAllocator
     {
@@ -19,8 +20,8 @@ namespace Dml
 
         void* Alloc(size_t size_in_bytes) final;
         void Free(void* ptr) final;
-        D3D12BufferRegion CreateBufferRegion(const void* ptr, uint64_t size_in_bytes);
-        AllocationInfo* GetAllocationInfo(const void* ptr);
+        D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes);
+        AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer);
         void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);
         DmlBuffer AllocateDefaultBuffer(uint64_t num_bytes);
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index 58dd7314b929f..52e0f287e6594 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -2,6 +2,8 @@
 
 #include "DmlGraphFusionHelper.h"
 #include "DmlBufferRegion.h"
+#include "DmlTaggedPointer.h"
+#include "DmlAllocationInfo.h"
 
 
 namespace Dml
@@ -95,8 +97,18 @@ namespace DmlGraphFusionHelper
         uint64_t* allocId)
     {
         void* opaqueData = const_cast<void*>(tensor->DataRaw());
-        *allocId = winmlProvider->TryGetPooledAllocationId(opaqueData, 0);
-        return winmlProvider->GetBufferRegion(opaqueData, tensor->SizeInBytes());
+
+        if (tensor->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL)
+        {
+            // External allocations are not pooled: the opaque pointer is the AllocationInfo itself, not a packed TaggedPointer
+            auto allocInfo = static_cast<AllocationInfo*>(opaqueData);
+            *allocId = allocInfo->GetPooledResourceId();
+            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
+        }
+
+        auto taggedPointer = TaggedPointer::Unpack(opaqueData);
+        *allocId = winmlProvider->TryGetPooledAllocationId(taggedPointer, 0);
+        return winmlProvider->GetBufferRegion(taggedPointer, tensor->SizeInBytes());
     }
 
     void ProcessInputData(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 0f4fe3788cbd0..0a6859f7b615f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -23,6 +23,7 @@
 #include "DmlBfcAllocator.h"
 #include "DmlGpuAllocator.h"
 #include "DmlBuffer.h"
+#include "DmlTaggedPointer.h"
 
 #ifdef ERROR
 #undef ERROR
@@ -120,10 +121,8 @@ namespace Dml
 
     D3D12BufferRegion ExecutionProviderImpl::GetBufferForTensor(IMLOperatorTensor* tensor) const
     {
-        MLOperatorTensor mlOperatorTensor(tensor);
-        void* data = mlOperatorTensor.GetByteData();
-        auto sizeInBytes = mlOperatorTensor.GetUnalignedTensorByteSize();
-        return m_gpuAllocator->CreateBufferRegion(data, sizeInBytes);
+        auto tensorWrapper = static_cast<TensorWrapper*>(tensor);
+        return tensorWrapper->GetBufferRegion();
     }
 
     ID3D12Resource* __stdcall ExecutionProviderImpl::DecodeResource(IMLOperatorTensor* tensor) const noexcept
@@ -850,15 +849,15 @@ namespace Dml
         m_context->QueueReference(object);
     }
 
-    D3D12BufferRegion ExecutionProviderImpl::GetBufferRegion(void* data, uint64_t size) const
+    D3D12BufferRegion ExecutionProviderImpl::GetBufferRegion(const TaggedPointer& taggedPointer, uint64_t size) const
     {
-        return m_gpuAllocator->CreateBufferRegion(data, size);
+        return m_gpuAllocator->CreateBufferRegion(taggedPointer, size);
     }
 
-    uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(void* data, bool isInternalOperator)
+    uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(const TaggedPointer& taggedPointer, bool isInternalOperator)
     {
         assert(!isInternalOperator);
-        return m_gpuAllocator->GetAllocationInfo(data)->GetPooledResourceId();
+        return m_gpuAllocator->GetAllocationInfo(taggedPointer)->GetPooledResourceId();
     }
 
     void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState(
@@ -981,10 +980,10 @@ namespace Dml
         return std::make_unique<Dml::ExecutionProvider>(dmlDevice, commandQueue, enableMetacommands);
     }
 
-    ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr)
+    ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer)
     {
         Dml::DmlGpuAllocator* pAllocationInfo = static_cast<Dml::DmlGpuAllocator*>(allocator);
-        return pAllocationInfo->GetAllocationInfo(ptr)->GetUavResource();
+        return pAllocationInfo->GetAllocationInfo(taggedPointer)->GetUavResource();
     }
 
     void FlushContext(onnxruntime::IExecutionProvider* provider)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index 5ffced302d5c5..b9787335b6ea0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -28,6 +28,7 @@ namespace Dml
     class DmlCpuAllocator;
     class ExecutionProvider;
     class DmlGpuAllocator;
+    struct TaggedPointer;
 
     class ExecutionProviderImpl : public WRL::Base<Dml::IExecutionProvider,
                                   Windows::AI::MachineLearning::Adapter::IWinmlExecutionProvider>
@@ -97,9 +98,9 @@ namespace Dml
         // IWinmlExecutionProvider methods
         void QueueReference(IUnknown* object) override;
 
-        D3D12BufferRegion GetBufferRegion(void* data, uint64_t size) const override;
+        D3D12BufferRegion GetBufferRegion(const TaggedPointer& taggedPointer, uint64_t size) const override;
 
-       uint64_t TryGetPooledAllocationId(void* data, bool isInternalOperator) override;
+        uint64_t TryGetPooledAllocationId(const TaggedPointer& taggedPointer, bool isInternalOperator) override;
 
         void GetABIExecutionInterfaceAndInvalidateState(
             bool isInternalOperator,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 082b193840e94..d0d5d3a8b403e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -12,6 +12,8 @@
 #include "MLOperatorAuthorImpl.h"
 #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h"
 #include "DmlGpuAllocator.h"
+#include "DmlAllocationInfo.h"
+#include "DmlTaggedPointer.h"
 
 using namespace Microsoft::WRL;
 
@@ -1269,7 +1271,14 @@ namespace Windows::AI::MachineLearning::Adapter
 
     Dml::D3D12BufferRegion TensorWrapper::GetBufferRegion() const
     {
-        return m_winmlExecutionProvider->GetBufferRegion(m_tensorData, m_impl->SizeInBytes());
+        if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL)
+        {
+            auto allocInfo = static_cast<Dml::AllocationInfo*>(m_tensorData);
+            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
+        }
+
+        auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData);
+        return m_winmlExecutionProvider->GetBufferRegion(taggedPointer, m_impl->SizeInBytes());
     }
 
     uint32_t STDMETHODCALLTYPE TensorWrapper::GetDimensionCount() const noexcept
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index 9f589a0d3ad41..ebcbae5e799de 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -20,6 +20,7 @@ using Microsoft::WRL::ComPtr;
 #include "core/framework/error_code_helper.h"
 #include "DmlExecutionProvider/src/ErrorHandling.h"
 #include "DmlExecutionProvider/src/GraphicsUnknownHelper.h"
+#include "DmlExecutionProvider/src/DmlTaggedPointer.h"
 #include "DmlExecutionProvider/inc/DmlExecutionProvider.h"
 #include "core/platform/env.h"
 
@@ -100,7 +101,7 @@ bool IsSoftwareAdapter(IDXGIAdapter1* adapter) {
     auto isBasicRenderDriverVendorId = desc.VendorId == 0x1414;
     auto isBasicRenderDriverDeviceId = desc.DeviceId == 0x8c;
     auto isSoftwareAdapter = desc.Flags == DXGI_ADAPTER_FLAG_SOFTWARE;
-    
+
     return isSoftwareAdapter || (isBasicRenderDriverVendorId && isBasicRenderDriverDeviceId);
 }
 
@@ -217,7 +218,7 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc
   if (!allocator) {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available");
   }
-  *d3d_resource = Dml::GetD3D12ResourceFromAllocation(allocator.get(), allocation);
+  *d3d_resource = Dml::GetD3D12ResourceFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation));
   (*d3d_resource)->AddRef();
 #else
   *d3d_resource = nullptr;

From 0729ea294f289a03bf8cefa0ae3bf0532ea766c6 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Mon, 30 Jan 2023 14:17:19 -0800
Subject: [PATCH 22/76] Fix

---
 winml/lib/Api.Ort/OnnxruntimeEngine.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
index ee45ceac9493a..905a3e6866f02 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
+++ b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
@@ -632,11 +632,6 @@ HRESULT OnnxruntimeEngine::CreateTensorValueFromExternalD3DResource(ID3D12Resour
   RETURN_HR_IF_NOT_OK_MSG(ort_api->CreateMemoryInfo("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info),
                           ort_api);
 
-  OrtAllocator* ort_allocator;
-  RETURN_HR_IF_NOT_OK_MSG(ort_api->CreateAllocator(session_.get(), ort_memory_info, &ort_allocator),
-                          ort_api);
-  auto allocator = UniqueOrtAllocator(ort_allocator, ort_api->ReleaseAllocator);
-
   void* dml_allocator_resource;
   RETURN_HR_IF_NOT_OK_MSG(ort_dml_api->CreateGPUAllocationFromD3DResource(d3d_resource, &dml_allocator_resource),
                           engine_factory_->UseOrtApi());

From 544637f29588e8f83d2015cef57888b167d50a63 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Mon, 30 Jan 2023 15:16:45 -0800
Subject: [PATCH 23/76] Fix

---
 .../src/FusedGraphKernel.cpp                  | 39 +++++++++++--------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index 5f29bae1b4fdc..9dfc1672708c0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -109,7 +109,7 @@ namespace Dml
                 // Get input resources for execution, excluding those which were specified as owned by DML and provided
                 // at initialization instead.
                 std::vector<ComPtr<IMLOperatorTensor>> inputTensors(kernelContext->InputCount());
-                std::vector<ID3D12Resource*> inputPtrs(kernelContext->InputCount());
+                std::vector<D3D12BufferRegion> inputBufferRegions(kernelContext->InputCount());
 
                 for (int i = 0; i < kernelContext->InputCount(); ++i)
                 {
@@ -120,12 +120,18 @@ namespace Dml
 
                     if (m_nonOwnedGraphInputsFromInitializers[i])
                     {
-                        inputPtrs[i] = m_nonOwnedGraphInputsFromInitializers[i].Get();
+                        inputBufferRegions[i] = D3D12BufferRegion(
+                            0,
+                            m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width,
+                            m_nonOwnedGraphInputsFromInitializers[i].Get(),
+                            nullptr,
+                            nullptr);
                     }
                     else if (!m_isInputsUploadedByDmlEP[i])
                     {
                         ORT_THROW_IF_FAILED(contextWrapper.GetInputTensor(i, inputTensors[i].GetAddressOf()));
-                        inputPtrs[i] = m_provider->DecodeResource(inputTensors[i].Get());
+                        auto tensorWrapper = static_cast<TensorWrapper*>(inputTensors[i].Get());
+                        inputBufferRegions[i] = tensorWrapper->GetBufferRegion();
                     }
                 }
 
@@ -133,7 +139,7 @@ namespace Dml
                 ExecuteOperator(
                     m_compiledExecutionPlanOperator.Get(),
                     m_persistentResourceBinding ? &*m_persistentResourceBinding : nullptr,
-                    inputPtrs,
+                    inputBufferRegions,
                     aux);
 
                 ORT_THROW_IF_FAILED(m_provider->AddUAVBarrier());
@@ -153,7 +159,7 @@ namespace Dml
         void ExecuteOperator(
             IDMLCompiledOperator* op,
             _In_opt_ const DML_BUFFER_BINDING* persistentResourceBinding,
-            gsl::span<ID3D12Resource*> inputTensors,
+            gsl::span<D3D12BufferRegion> inputBufferRegions,
             gsl::span<IMLOperatorTensor*> outputTensors) const
         {
             auto FillBindingsFromTensors = [this](auto& bufferBindings, auto& bindingDescs,  gsl::span<IMLOperatorTensor*>& tensors)
@@ -162,10 +168,10 @@ namespace Dml
                 {
                     if (tensor)
                     {
+                        auto tensorWrapper = static_cast<TensorWrapper*>(tensor);
+
                         assert(tensor->IsDataInterface());
-                        ID3D12Resource* resource = m_provider->DecodeResource(tensor);
-                        D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc();
-                        bufferBindings.push_back({ resource, 0, resourceDesc.Width });
+                        bufferBindings.push_back(tensorWrapper->GetBufferRegion().GetBufferBinding());
                         bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() });
                     }
                     else
@@ -176,29 +182,28 @@ namespace Dml
                 }
             };
 
-            auto FillBindingsFromBuffers = [](auto& bufferBindings, auto& bindingDescs,  gsl::span<ID3D12Resource*>& resources)
+            auto FillBindingsFromBufferRegions = [](auto& bufferBindings, auto& bindingDescs,  gsl::span<D3D12BufferRegion>& bufferRegions)
             {
-                for (ID3D12Resource* resource : resources)
+                for (const D3D12BufferRegion& bufferRegion : bufferRegions)
                 {
-                    if (resource)
+                    bufferBindings.push_back(bufferRegion.GetBufferBinding());
+
+                    if (bufferRegion.ResourceInUavState() != nullptr)
                     {
-                        D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc();
-                        bufferBindings.push_back({ resource, 0, resourceDesc.Width });
                         bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() });
                     }
                     else
                     {
-                        bufferBindings.push_back({ nullptr, 0, 0 });
                         bindingDescs.push_back({ DML_BINDING_TYPE_NONE, nullptr });
                     }
                 }
             };
 
             std::vector<DML_BUFFER_BINDING> inputBufferBindings;
-            inputBufferBindings.reserve(inputTensors.size());
+            inputBufferBindings.reserve(inputBufferRegions.size());
             std::vector<DML_BINDING_DESC> inputBindings;
-            inputBindings.reserve(inputTensors.size());
-            FillBindingsFromBuffers(inputBufferBindings, inputBindings, inputTensors);
+            inputBindings.reserve(inputBufferRegions.size());
+            FillBindingsFromBufferRegions(inputBufferBindings, inputBindings, inputBufferRegions);
 
             std::vector<DML_BUFFER_BINDING> outputBufferBindings;
             outputBufferBindings.reserve(outputTensors.size());

From ea268552105499fed0c954acb9d3cad45d2f0289 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 31 Jan 2023 10:12:31 -0800
Subject: [PATCH 24/76] WIP

---
 .../src/Operators/DmlOperatorNonZero.cpp                  | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp
index 62374963fff1b..61623dfe2b4dd 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp
@@ -65,10 +65,6 @@ class DmlOperatorNonZero: public DmlOperator
             nonzeroCoordinatesDesc.OutputCountTensor = &intermediateDescs[0];
             nonzeroCoordinatesDesc.OutputCoordinatesTensor = &intermediateDescs[1];
 
-            // TODO: Remove this hack when DML supports native int64 for NonZero
-            // We use the int64/uint32 stride hack here, so zero out the data before writing to it
-            m_zeroOperator = InitializeZeroInt64Tensor(m_intermediateTensorDescs[1].GetBufferSizeInBytes());
-
             DML_OPERATOR_DESC opDesc = { DML_OPERATOR_NONZERO_COORDINATES, &nonzeroCoordinatesDesc };
             SetDmlOperatorDesc(opDesc, kernelCreationContext);
         }
@@ -126,7 +122,11 @@ class DmlOperatorNonZero: public DmlOperator
 
         if (!m_emptyInput && nonzeroElementCount > 0)
         {
+            std::vector<DimensionType> outputCoordinatesStrides = {nonzeroElementCount * 2, 2};
+            TensorDesc stridedOutputTensorDesc(DML_TENSOR_DATA_TYPE_UINT32, outputSizes, outputCoordinatesStrides);
+
             // TODO: Remove this hack when DML supports native int64 for NonZero
+            m_zeroOperator = InitializeZeroInt64Tensor(stridedOutputTensorDesc.GetBufferSizeInBytes());
             ExecuteZeroInt64Tensor(m_zeroOperator.Get(), outputTensor.GetInterface().Get());
 
             ComPtr<IDMLCompiledOperator> sliceOperator = InitializeSlice(m_intermediateTensorDescs[1], nonzeroElementCount);

From 61dce2e96d9dfb05d0db70795ea4ea86c06ab313 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 31 Jan 2023 14:18:13 -0800
Subject: [PATCH 25/76] WIP

---
 .../core/providers/dml/dml_provider_factory.h | 16 ++++-----
 onnxruntime/core/framework/allocator.cc       |  4 ++-
 .../inc/DmlExecutionProvider.h                |  1 -
 .../src/DmlBfcAllocator.h                     |  2 +-
 .../src/DmlExternalGpuAllocator.cpp           | 33 +++++++++++++++++++
 .../src/DmlExternalGpuAllocator.h             | 22 +++++++++++++
 .../src/DmlGpuAllocator.cpp                   |  5 +--
 .../src/ExecutionProvider.cpp                 |  8 ++---
 .../providers/dml/dml_provider_factory.cc     |  5 ++-
 winml/lib/Api.Ort/OnnxruntimeEngine.cpp       |  3 +-
 winml/test/common/SqueezeNetValidator.cpp     |  4 +--
 11 files changed, 79 insertions(+), 24 deletions(-)
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h

diff --git a/include/onnxruntime/core/providers/dml/dml_provider_factory.h b/include/onnxruntime/core/providers/dml/dml_provider_factory.h
index 6af7dac956560..a8f460b6d54d5 100644
--- a/include/onnxruntime/core/providers/dml/dml_provider_factory.h
+++ b/include/onnxruntime/core/providers/dml/dml_provider_factory.h
@@ -36,8 +36,8 @@ extern "C" {
  * The OrtSessionOptionsAppendExecutionProvider_DML export on the OrtDmlApi should be used instead.
  *
  * Creates a DirectML Execution Provider which executes on the hardware adapter with the given device_id, also known as
- * the adapter index. The device ID corresponds to the enumeration order of hardware adapters as given by 
- * IDXGIFactory::EnumAdapters. A device_id of 0 always corresponds to the default adapter, which is typically the 
+ * the adapter index. The device ID corresponds to the enumeration order of hardware adapters as given by
+ * IDXGIFactory::EnumAdapters. A device_id of 0 always corresponds to the default adapter, which is typically the
  * primary display GPU installed on the system. A negative device_id is invalid.
  */
 ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_DML, _In_ OrtSessionOptions* options, int device_id);
@@ -49,8 +49,8 @@ ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_DML, _In_ OrtSessionOpti
  *
  * Creates a DirectML Execution Provider using the given DirectML device, and which executes work on the supplied D3D12
  * command queue. The DirectML device and D3D12 command queue must have the same parent ID3D12Device, or an error will
- * be returned. The D3D12 command queue must be of type DIRECT or COMPUTE (see D3D12_COMMAND_LIST_TYPE). If this 
- * function succeeds, the inference session maintains a strong reference on both the dml_device and the command_queue 
+ * be returned. The D3D12 command queue must be of type DIRECT or COMPUTE (see D3D12_COMMAND_LIST_TYPE). If this
+ * function succeeds, the inference session maintains a strong reference on both the dml_device and the command_queue
  * objects.
  * See also: DMLCreateDevice
  * See also: ID3D12Device::CreateCommandQueue
@@ -65,8 +65,8 @@ typedef struct OrtDmlApi OrtDmlApi;
 struct OrtDmlApi {
   /**
    * Creates a DirectML Execution Provider which executes on the hardware adapter with the given device_id, also known as
-   * the adapter index. The device ID corresponds to the enumeration order of hardware adapters as given by 
-   * IDXGIFactory::EnumAdapters. A device_id of 0 always corresponds to the default adapter, which is typically the 
+   * the adapter index. The device ID corresponds to the enumeration order of hardware adapters as given by
+   * IDXGIFactory::EnumAdapters. A device_id of 0 always corresponds to the default adapter, which is typically the
    * primary display GPU installed on the system. A negative device_id is invalid.
   */
   ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_DML, _In_ OrtSessionOptions* options, int device_id);
@@ -74,8 +74,8 @@ struct OrtDmlApi {
   /**
    * Creates a DirectML Execution Provider using the given DirectML device, and which executes work on the supplied D3D12
    * command queue. The DirectML device and D3D12 command queue must have the same parent ID3D12Device, or an error will
-   * be returned. The D3D12 command queue must be of type DIRECT or COMPUTE (see D3D12_COMMAND_LIST_TYPE). If this 
-   * function succeeds, the inference session maintains a strong reference on both the dml_device and the command_queue 
+   * be returned. The D3D12 command queue must be of type DIRECT or COMPUTE (see D3D12_COMMAND_LIST_TYPE). If this
+   * function succeeds, the inference session maintains a strong reference on both the dml_device and the command_queue
    * objects.
    * See also: DMLCreateDevice
    * See also: ID3D12Device::CreateCommandQueue
diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc
index 08e1221cb7977..7613bb456f6f9 100644
--- a/onnxruntime/core/framework/allocator.cc
+++ b/onnxruntime/core/framework/allocator.cc
@@ -151,9 +151,11 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
         onnxruntime::OpenVINO_GPU, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)),
         id1, mem_type1);
   } else if (strcmp(name1, onnxruntime::DML) == 0) {
+    // Since EPs cannot have 2 allocators with the same OrtMemType and Memory ID,
+    // we use -1 as the memory ID to represent external allocations that don't have any allocator.
     *out = new OrtMemoryInfo(
         onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, static_cast<OrtDevice::DeviceId>(id1)),
-        id1, mem_type1);
+        -1, mem_type1);
   } else {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported.");
   }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
index c062ff81d1330..e31f59681b63f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
@@ -30,7 +30,6 @@ namespace Dml
         ID3D12CommandQueue* commandQueue,
         bool enableMetacommands = true);
 
-    ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer);
     void FlushContext(onnxruntime::IExecutionProvider* provider);
     void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode);
     void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
index c00b820434592..17ba37146bdc5 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
@@ -14,7 +14,7 @@ namespace Dml
         DmlBfcAllocator(std::shared_ptr<BucketizedBufferAllocator> subAllocator)
         : onnxruntime::IAllocator(
             OrtMemoryInfo(
-                "DML",
+                onnxruntime::DML,
                 OrtAllocatorType::OrtDeviceAllocator,
                 OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)
             )
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp
new file mode 100644
index 0000000000000..0ebe2c3d00e5e
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "precomp.h"
+#include "DmlExternalGpuAllocator.h"
+
+namespace Dml
+{
+    DmlExternalGpuAllocator::DmlExternalGpuAllocator()
+    : onnxruntime::IAllocator(
+        OrtMemoryInfo(
+            onnxruntime::DML,
+            OrtAllocatorType::OrtDeviceAllocator,
+            OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, 0),
+            -1
+        )
+    ) {}
+
+    void* DmlExternalGpuAllocator::Alloc(size_t size_in_bytes)
+    {
+        // This allocator should never be used to allocate memory; it should only be used to decode the opaque data pointer
+        THROW_HR(E_INVALIDARG);
+    }
+
+    void DmlExternalGpuAllocator::Free(void* ptr)
+    {
+        // This allocator should never be used to free memory; it should only be used to decode the opaque data pointer
+        THROW_HR(E_INVALIDARG);
+    }
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
new file mode 100644
index 0000000000000..6c5ee8cd29c6e
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
@@ -0,0 +1,22 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/allocator.h"
+
+namespace Dml
+{
+    class BucketizedBufferAllocator;
+    class AllocationInfo;
+    struct TaggedPointer;
+
+    class DmlExternalGpuAllocator : public onnxruntime::IAllocator
+    {
+    public:
+        DmlExternalGpuAllocator();
+
+        void* Alloc(size_t size_in_bytes) final;
+        void Free(void* ptr) final;
+    };
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
index 5370515afffd1..5bee9ee34ec4d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
@@ -14,9 +14,10 @@ namespace Dml
     DmlGpuAllocator::DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<BucketizedBufferAllocator> subAllocator)
     : onnxruntime::IAllocator(
         OrtMemoryInfo(
-            "DML",
+            onnxruntime::DML,
             OrtAllocatorType::OrtDeviceAllocator,
-            OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)
+            OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0),
+            0
         )
     ),
     m_bfcAllocator(bfcAllocator),
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 0a6859f7b615f..3c0874ba7e528 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -24,6 +24,7 @@
 #include "DmlGpuAllocator.h"
 #include "DmlBuffer.h"
 #include "DmlTaggedPointer.h"
+#include "DmlExternalGpuAllocator.h"
 
 #ifdef ERROR
 #undef ERROR
@@ -89,6 +90,7 @@ namespace Dml
         InsertAllocator(m_impl->GetGpuAllocator());
         InsertAllocator(m_impl->GetCpuInputAllocator());
         InsertAllocator(m_impl->GetCpuOutputAllocator());
+        InsertAllocator(std::make_shared<DmlExternalGpuAllocator>());
     }
 
     std::vector<std::unique_ptr<onnxruntime::ComputeCapability>>
@@ -980,12 +982,6 @@ namespace Dml
         return std::make_unique<Dml::ExecutionProvider>(dmlDevice, commandQueue, enableMetacommands);
     }
 
-    ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer)
-    {
-        Dml::DmlGpuAllocator* pAllocationInfo = static_cast<Dml::DmlGpuAllocator*>(allocator);
-        return pAllocationInfo->GetAllocationInfo(taggedPointer)->GetUavResource();
-    }
-
     void FlushContext(onnxruntime::IExecutionProvider* provider)
     {
         ExecutionProvider* dmlexecutionprovider = static_cast<Dml::ExecutionProvider*>(provider);
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index ebcbae5e799de..784e0101197d2 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -20,6 +20,7 @@ using Microsoft::WRL::ComPtr;
 #include "core/framework/error_code_helper.h"
 #include "DmlExecutionProvider/src/ErrorHandling.h"
 #include "DmlExecutionProvider/src/GraphicsUnknownHelper.h"
+#include "DmlExecutionProvider/src/DmlAllocationInfo.h"
 #include "DmlExecutionProvider/src/DmlTaggedPointer.h"
 #include "DmlExecutionProvider/inc/DmlExecutionProvider.h"
 #include "core/platform/env.h"
@@ -218,8 +219,10 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc
   if (!allocator) {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available");
   }
-  *d3d_resource = Dml::GetD3D12ResourceFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation));
+
+  *d3d_resource = static_cast<Dml::AllocationInfo*>(allocation)->GetUavResource();
   (*d3d_resource)->AddRef();
+
 #else
   *d3d_resource = nullptr;
 #endif  // USE_DML
diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
index 905a3e6866f02..e294c91afc079 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
+++ b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
@@ -188,8 +188,7 @@ HRESULT OnnxruntimeValue::GetResource(_winml::Resource& out) {
     auto allocator = UniqueOrtAllocator(ort_allocator, ort_api->ReleaseAllocator);
 
     winrt::com_ptr<ID3D12Resource> resource;
-    RETURN_HR_IF_NOT_OK_MSG(ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), mutable_data,
-                                                                                 resource.put()),
+    RETURN_HR_IF_NOT_OK_MSG(ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), mutable_data, resource.put()),
                             ort_api);
     out = _winml::Resource(resource.get(), [](void*) { /*do nothing, as this pointer is actually a com pointer! */ });
   } else {
diff --git a/winml/test/common/SqueezeNetValidator.cpp b/winml/test/common/SqueezeNetValidator.cpp
index 3c85ef2c351dc..182e1dc806432 100644
--- a/winml/test/common/SqueezeNetValidator.cpp
+++ b/winml/test/common/SqueezeNetValidator.cpp
@@ -211,11 +211,11 @@ void ModelValidator::SqueezeNet(
     auto modulePath = FileHelpers::GetModulePath();
     auto fullModelPath = modulePath + modelFileName;
     auto outputFileName = modulePath + outputDataFileName;
-    
+
     // WinML model creation
     LearningModel model = nullptr;
     model = LearningModel::LoadFromFilePath(fullModelPath);
-    
+
     LearningModelSession modelSession = nullptr;
     modelSession = LearningModelSession(model, LearningModelDevice(deviceKind));
 

From b9b3fb8e2836266e1d82d7d9dd0c5f39fda0ab5a Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 31 Jan 2023 14:34:44 -0800
Subject: [PATCH 26/76] WIP

---
 .../dml/DmlExecutionProvider/inc/DmlExecutionProvider.h   | 1 +
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp    | 6 ++++++
 onnxruntime/core/providers/dml/dml_provider_factory.cc    | 8 +++++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
index e31f59681b63f..c062ff81d1330 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
@@ -30,6 +30,7 @@ namespace Dml
         ID3D12CommandQueue* commandQueue,
         bool enableMetacommands = true);
 
+    ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer);
     void FlushContext(onnxruntime::IExecutionProvider* provider);
     void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode);
     void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 3c0874ba7e528..d5c74e8499f89 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -982,6 +982,12 @@ namespace Dml
         return std::make_unique<Dml::ExecutionProvider>(dmlDevice, commandQueue, enableMetacommands);
     }
 
+    ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer)
+    {
+        Dml::DmlGpuAllocator* pAllocationInfo = static_cast<Dml::DmlGpuAllocator*>(allocator);
+        return pAllocationInfo->GetAllocationInfo(taggedPointer)->GetUavResource();
+    }
+
     void FlushContext(onnxruntime::IExecutionProvider* provider)
     {
         ExecutionProvider* dmlexecutionprovider = static_cast<Dml::ExecutionProvider*>(provider);
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index 784e0101197d2..9a3cc3f739356 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -220,7 +220,13 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available");
   }
 
-  *d3d_resource = static_cast<Dml::AllocationInfo*>(allocation)->GetUavResource();
+  if (wrapping_allocator->Info()->device.MemType() == OrtDevice::MemType::DML_EXTERNAL) {
+    *d3d_resource = static_cast<Dml::AllocationInfo*>(allocation)->GetUavResource();
+  } else {
+    ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT);
+    *d3d_resource = Dml::GetD3D12ResourceFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation));
+  }
+
   (*d3d_resource)->AddRef();
 
 #else

From 385480786825745c06f7e72ba0d481139e5893b1 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 31 Jan 2023 20:42:02 -0800
Subject: [PATCH 27/76] WIP

---
 .../core/providers/dml/dml_provider_factory.h | 12 ++++++
 .../inc/DmlExecutionProvider.h                |  3 +-
 .../src/ExecutionProvider.cpp                 |  4 +-
 .../providers/dml/dml_provider_factory.cc     | 37 ++++++++++++++++++-
 winml/adapter/winml_adapter_dml.cpp           |  2 +-
 .../Api.Image/TensorToVideoFrameConverter.cpp | 29 +++++++++------
 .../Api.Image/VideoFrameToTensorConverter.cpp | 19 +++++-----
 .../inc/TensorToVideoFrameConverter.h         |  7 +++-
 .../inc/VideoFrameToTensorConverter.h         |  5 ++-
 winml/lib/Api.Ort/OnnxruntimeEngine.cpp       |  9 +++--
 winml/lib/Api.Ort/OnnxruntimeEngine.h         |  2 +-
 winml/lib/Api/ImageFeatureValue.cpp           | 34 ++++++++++-------
 winml/lib/Api/impl/TensorBase.h               | 14 +++----
 winml/lib/Common/inc/iengine.h                |  2 +-
 14 files changed, 123 insertions(+), 56 deletions(-)

diff --git a/include/onnxruntime/core/providers/dml/dml_provider_factory.h b/include/onnxruntime/core/providers/dml/dml_provider_factory.h
index a8f460b6d54d5..47b6e53bed0a0 100644
--- a/include/onnxruntime/core/providers/dml/dml_provider_factory.h
+++ b/include/onnxruntime/core/providers/dml/dml_provider_factory.h
@@ -100,6 +100,18 @@ struct OrtDmlApi {
     * This API gets the D3D12 resource when an OrtValue has been allocated by the DML EP.
     */
   ORT_API2_STATUS(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* provider, _In_ void* dml_resource, _Out_ ID3D12Resource** d3d_resource);
+
+  /**
+    * GetD3D12ResourceRegionFromAllocation
+    * This API gets the D3D12 resource backing an OrtValue that has been allocated by the DML EP, and writes to `offset` the byte offset at which the allocation starts inside that resource.
+    * Note: Only the subregion of the resource delimited by `offset` and `offset + size_in_bytes` should be accessed.
+    */
+  ORT_API2_STATUS(GetD3D12ResourceRegionFromAllocation,
+    _In_ OrtAllocator* provider,
+    _In_ void* dml_resource,
+    _In_ uint64_t size_in_bytes,
+    _Out_ ID3D12Resource** d3d_resource,
+    _Out_ uint64_t* offset);
 };
 
 #ifdef __cplusplus
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
index c062ff81d1330..7e1d9f80038d1 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
@@ -7,6 +7,7 @@ interface IMLOperatorRegistry;
 #include "core/common/status.h"
 #include "core/framework/data_transfer.h"
 #include "IWinmlExecutionProvider.h"
+#include "core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h"
 
 namespace onnxruntime
 {
@@ -30,7 +31,7 @@ namespace Dml
         ID3D12CommandQueue* commandQueue,
         bool enableMetacommands = true);
 
-    ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer);
+    D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer, uint64_t size_in_bytes);
     void FlushContext(onnxruntime::IExecutionProvider* provider);
     void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode);
     void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index d5c74e8499f89..d5f905819d0b9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -982,10 +982,10 @@ namespace Dml
         return std::make_unique<Dml::ExecutionProvider>(dmlDevice, commandQueue, enableMetacommands);
     }
 
-    ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer)
+    D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer, uint64_t sizeInBytes)
     {
         Dml::DmlGpuAllocator* pAllocationInfo = static_cast<Dml::DmlGpuAllocator*>(allocator);
-        return pAllocationInfo->GetAllocationInfo(taggedPointer)->GetUavResource();
+        return pAllocationInfo->CreateBufferRegion(taggedPointer, sizeInBytes);
     }
 
     void FlushContext(onnxruntime::IExecutionProvider* provider)
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index 9a3cc3f739356..2545c85d5fb14 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -220,17 +220,49 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available");
   }
 
+  // This should never happen since external users of the ORT API should only be able to create DML_EXTERNAL memory
+  if (wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DML_EXTERNAL) {
+    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "The allocation was not created with the DML_EXTERNAL memory type");
+  }
+
+  *d3d_resource = static_cast<Dml::AllocationInfo*>(allocation)->GetUavResource();
+  (*d3d_resource)->AddRef();
+
+#else
+  *d3d_resource = nullptr;
+#endif  // USE_DML
+  return nullptr;
+  API_IMPL_END
+}
+
+// Resolves `allocation` to its D3D12 resource and the byte offset at which the allocation starts inside it.
+// DML_EXTERNAL allocations are reported at offset 0; DEFAULT allocations are decoded from their tagged pointer.
+ORT_API_STATUS_IMPL(GetD3D12ResourceRegionFromAllocation, _In_ OrtAllocator* ort_allocator, _In_ void* allocation,
+    _In_ uint64_t size_in_bytes, _Out_ ID3D12Resource** d3d_resource, _Out_ uint64_t* offset) {
+  API_IMPL_BEGIN
+#ifdef USE_DML
+  auto wrapping_allocator = static_cast<onnxruntime::OrtAllocatorImplWrappingIAllocator*>(ort_allocator);
+  auto allocator = wrapping_allocator->GetWrappedIAllocator();
+  if (!allocator) {
+    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available");
+  }
+
   if (wrapping_allocator->Info()->device.MemType() == OrtDevice::MemType::DML_EXTERNAL) {
     *d3d_resource = static_cast<Dml::AllocationInfo*>(allocation)->GetUavResource();
+    *offset = 0;
   } else {
     ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT);
-    *d3d_resource = Dml::GetD3D12ResourceFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation));
+    auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation), size_in_bytes);
+    *d3d_resource = bufferRegion.ResourceInUavState();
+    // Without this the caller would read an uninitialized offset for DEFAULT (sub-allocated) memory.
+    *offset = bufferRegion.Offset();
   }
 
   (*d3d_resource)->AddRef();
 
 #else
   *d3d_resource = nullptr;
+  *offset = 0;
 #endif  // USE_DML
   return nullptr;
   API_IMPL_END
@@ -241,7 +273,8 @@ static constexpr OrtDmlApi ort_dml_api_10_to_x = {
   &OrtSessionOptionsAppendExecutionProviderEx_DML,
   &CreateGPUAllocationFromD3DResource,
   &FreeGPUAllocation,
-  &GetD3D12ResourceFromAllocation
+  &GetD3D12ResourceFromAllocation,
+  &GetD3D12ResourceRegionFromAllocation,
 };
 
 const OrtDmlApi* GetOrtDmlApi(_In_ uint32_t /*version*/) NO_EXCEPTION {
diff --git a/winml/adapter/winml_adapter_dml.cpp b/winml/adapter/winml_adapter_dml.cpp
index acda0d332b1cb..f3ffda496530f 100644
--- a/winml/adapter/winml_adapter_dml.cpp
+++ b/winml/adapter/winml_adapter_dml.cpp
@@ -89,7 +89,7 @@ ORT_API_STATUS_IMPL(winmla::OrtSessionOptionsAppendExecutionProviderEx_DML, _In_
   // lifetime and can be large, so shouldn't be rounded.
   // So we create the provider with rounding disabled, and expect the caller to enable it after.
   onnxruntime::DmlConfigureProviderFactoryDefaultRoundingMode(factory, AllocatorRoundingMode::Disabled);
-  
+
   onnxruntime::DmlConfigureProviderFactoryMetacommandsEnabled(factory, metacommands_enabled);
 #endif  // USE_DML
   return nullptr;
diff --git a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp
index b5d0becf638d0..f0a7c601f665e 100644
--- a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp
+++ b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp
@@ -122,6 +122,7 @@ class ConvertCPUTensorToVideoFrameWithSoftwareBitmapTelemetryEvent {
 };
 
 void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
+    _In_ uint64_t inputTensorOffset,
     _In_ UINT32 batchIdx,
     _In_ winml::LearningModelSession& session,
     _In_ ID3D12Resource* pInputTensor,
@@ -136,7 +137,7 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
   wgi::SoftwareBitmap softwareBitmap = destVideoFrame.SoftwareBitmap();
 
   if (softwareBitmap) {
-   ConvertGPUTensorToSoftwareBitmap(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, softwareBitmap);
+   ConvertGPUTensorToSoftwareBitmap(inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, softwareBitmap);
   } else if (spDestDirect3DSurface) {
     bool isUAVSupportedFormat = _winmli::FormatSupportedForUAV(
         pDeviceCache->GetD3D12Device(),
@@ -144,7 +145,7 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
 
     // UAV support for formats is device dependent
     if (!isUAVSupportedFormat) {
-      ConvertDX12TensorToUnsupportedVideoFrameFormat(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, destVideoFrame);
+      ConvertDX12TensorToUnsupportedVideoFrameFormat(inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, destVideoFrame);
     } else {
       ComPtr<ID3D11Texture2D> spVideoFrameTexture = _winmli::GetTextureFromDirect3DSurface(destVideoFrame.Direct3DSurface());
 
@@ -168,7 +169,7 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
         }
 
         // Detensorize
-        ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get());
+        ConvertGPUTensorToDX12Texture(inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get());
 
         // Make sure that detensorization is done
         SyncD3D12ToD3D11(*pDeviceCache, D3D11_cached_texture_.Get());
@@ -196,7 +197,7 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
         }
 
         // Detensorize
-        ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get());
+        ConvertGPUTensorToDX12Texture(inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get());
 
         // Make sure that detensorization is done
         SyncD3D12ToD3D11(*pDeviceCache, spSharedD3D11Texture.Get());
@@ -241,6 +242,7 @@ ComPtr<ID3D12Resource> TensorToVideoFrameConverter::CreateShareableD3D12Texture(
 }
 
 void TensorToVideoFrameConverter::ConvertDX12TensorToUnsupportedVideoFrameFormat(
+    _In_ uint64_t input_tensor_offset,
     _In_ UINT32 batchIdx,
     _In_ ID3D12Resource* pInputTensor,
     _In_ _winml::D3DDeviceCache& device_cache,
@@ -288,7 +290,7 @@ void TensorToVideoFrameConverter::ConvertDX12TensorToUnsupportedVideoFrameFormat
   converted_video_frame_ = wm::VideoFrame::CreateWithDirect3D11Surface(surface);
 
   // Detensorize
-  ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, device_cache, tensorDesc, output_resource_.Get());
+  ConvertGPUTensorToDX12Texture(input_tensor_offset, batchIdx, pInputTensor, device_cache, tensorDesc, output_resource_.Get());
 
   // Wait for the D3D12 work to complete before using the resource
   SyncD3D12ToD3D11(device_cache, spSharedD3D11Texture.Get());
@@ -387,6 +389,7 @@ void TensorToVideoFrameConverter::SoftwareTensorToVideoFrame(
 }
 
 void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
+    _In_ uint64_t inputTensorOffset,
     _In_ UINT32 batchIdx,
     _In_ ID3D12Resource* pInputResource,
     _In_ _winml::D3DDeviceCache& device_cache,
@@ -460,7 +463,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
 
   // Create SRV and UAV for input and output respectively
   {
-    D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = CreateSRVDescriptor(batchIdx, inputDesc, tensorDesc);
+    D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = CreateSRVDescriptor(inputTensorOffset, batchIdx, inputDesc, tensorDesc);
     CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), SrvBufferIdx, srvUavDescriptorSize);
     spDx12Device->CreateShaderResourceView(pInputResource, &srvDesc, srvHandle);
 
@@ -545,6 +548,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
 }
 
 void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap(
+    _In_ uint64_t inputTensorOffset,
     _In_ UINT32 batchIdx,
     _In_ ID3D12Resource* pInputTensor,
     _In_ _winml::D3DDeviceCache& device_cache,
@@ -579,7 +583,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap(
   auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pInputTensor, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
   command_list_->ResourceBarrier(1, &barrier);
 
-  command_list_->CopyBufferRegion(readback_heap_.Get(), 0, pInputTensor, singleVideoFramebufferSize * batchIdx, singleVideoFramebufferSize);
+  command_list_->CopyBufferRegion(readback_heap_.Get(), 0, pInputTensor, inputTensorOffset + singleVideoFramebufferSize * batchIdx, singleVideoFramebufferSize);
 
   WINML_THROW_IF_FAILED(command_list_->Close());
   ID3D12CommandList* ppCommandLists[] = {command_list_.Get()};
@@ -645,6 +649,7 @@ void TensorToVideoFrameConverter::ConvertBatchedDX12TensorToBuffers(
 }
 
 D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor(
+    uint64_t offset,
     const UINT32 batchIdx,
     const D3D12_RESOURCE_DESC& resourceDesc,
     const _winml::ImageTensorDescription& desc) {
@@ -655,7 +660,7 @@ D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor
   srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
   srvDesc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
   UINT singleImageSize = static_cast<UINT>(desc.sizes[1] * desc.sizes[2] * desc.sizes[3]);
-  srvDesc.Buffer.FirstElement = batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
+  srvDesc.Buffer.FirstElement = offset / (desc.dataType == kImageTensorDataTypeFloat32 ? sizeof(UINT) : sizeof(uint16_t)) + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
   srvDesc.Buffer.NumElements = singleImageSize;
   srvDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_NONE;
 
@@ -736,10 +741,10 @@ void TensorToVideoFrameConverter::ConvertCPUTensorToSoftwareBitmap(
   if (tensorDesc.dataType == kImageTensorDataTypeFloat32) {
     WINML_THROW_IF_FAILED(CpuDetensorizer::Detensorize<float>(
       tensorDesc.channelType,
-      targetChannelType, 
-      tensorDesc.pixelRange, 
+      targetChannelType,
+      tensorDesc.pixelRange,
       static_cast<float*>(pCPUTensor),
-      bufferWidth, 
+      bufferWidth,
       height,
       width,
       pData));
@@ -754,4 +759,4 @@ void TensorToVideoFrameConverter::ConvertCPUTensorToSoftwareBitmap(
       width,
       pData));
   }
-}
\ No newline at end of file
+}
diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
index 1215548d212c5..2ed8f04bbcb3b 100644
--- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
+++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
@@ -188,6 +188,7 @@ ComPtr<ID3D12Resource> VideoFrameToTensorConverter::ShareD3D11Texture(ID3D11Text
 }
 
 void VideoFrameToTensorConverter::VideoFrameToDX12Tensor(
+    _In_ uint64_t outputTensorOffset,
     _In_ const UINT32 batchIdx,
     _In_ winml::LearningModelSession& session,
     _In_ const wm::IVideoFrame& inputVideoFrame,
@@ -206,7 +207,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor(
   wgdx::Direct3D11::IDirect3DSurface spDirect3DSurface = inputVideoFrame.Direct3DSurface();
 
   if (inputVideoFrame.SoftwareBitmap()) {
-    ConvertSoftwareBitmapToGPUTensor(batchIdx, inputVideoFrame, *pDeviceCache, inputBounds, tensorDesc, pOutputTensor);
+    ConvertSoftwareBitmapToGPUTensor(batchIdx, inputVideoFrame, *pDeviceCache, inputBounds, tensorDesc, outputTensorOffset, pOutputTensor);
   } else if (spDirect3DSurface) {
     ComPtr<ID3D11Texture2D> spVideoFrameTexture;
     wgi::BitmapBounds scaledBounds = inputBounds;
@@ -278,7 +279,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor(
 
     // We cropped the texture, shared it and converted it to a known color format, so it's time to tensorize
     // TODO: merge all videoframes to a single DX12Texture Resource before call ConvertDX12TextureToGPUTensor.
-    ConvertDX12TextureToGPUTensor(batchIdx, input_D3D12_resource_.Get(), *pDeviceCache, tensorDesc, pOutputTensor);
+    ConvertDX12TextureToGPUTensor(outputTensorOffset, batchIdx, input_D3D12_resource_.Get(), *pDeviceCache, tensorDesc, pOutputTensor);
   } else {
     // Invalid video frame
     WINML_THROW_IF_FAILED(E_INVALIDARG);
@@ -286,6 +287,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor(
 }
 
 void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor(
+    _In_ uint64_t output_resource_offset,
     _In_ UINT32 batchIdx,
     _In_ ID3D12Resource* pInputResource,
     _In_ _winml::D3DDeviceCache& device_cache,
@@ -339,7 +341,7 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor(
     UINT64 ullTensorSize = 0;
     WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, uiTensorElementSize, &ullTensorSize));
 
-    if (outputDesc.Width < ullTensorSize ||
+    if (outputDesc.Width < output_resource_offset + ullTensorSize ||
         outputDesc.Height != 1 ||
         outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER ||
         !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)) {
@@ -381,7 +383,7 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor(
     CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), SrvBufferIdx, srvUavDescriptorSize);
     spDx12Device->CreateShaderResourceView(pInputResource, &srvDesc, srvHandle);
 
-    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = CreateUAVDescription(batchIdx, outputDesc, tensorDesc);
+    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = CreateUAVDescription(output_resource_offset, batchIdx, tensorDesc);
     CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), UavBufferIdx, srvUavDescriptorSize);
     spDx12Device->CreateUnorderedAccessView(pOutputResource, nullptr, &uavDesc, uavHandle);
   }
@@ -458,6 +460,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
     _In_ _winml::D3DDeviceCache& device_cache,
     _In_ const wgi::BitmapBounds& inputBounds,
     _In_ const ImageTensorDescription& tensorDesc,
+    _In_ uint64_t outputResourceOffset,
     _Inout_ ID3D12Resource* pOutputResource) {
   assert(pOutputResource != nullptr);
   assert(videoFrame.SoftwareBitmap() != nullptr);
@@ -495,8 +498,6 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
 
   assert(convertedSoftwareBitmap != nullptr);
 
-  D3D12_RESOURCE_DESC outputDesc = pOutputResource->GetDesc();
-
   uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2;
   uint32_t bufferSize = static_cast<uint32_t>(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize);
 
@@ -526,7 +527,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
   auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST);
   command_list_->ResourceBarrier(1, &barrier);
 
-  command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx, upload_heap_.Get(), 0, bufferSize);
+  command_list_->CopyBufferRegion(pOutputResource, outputResourceOffset + bufferSize * batchIdx, upload_heap_.Get(), 0, bufferSize);
 
   WINML_THROW_IF_FAILED(command_list_->Close());
   ID3D12CommandList* ppCommandLists[] = {command_list_.Get()};
@@ -578,8 +579,8 @@ void VideoFrameToTensorConverter::ConvertBuffersToBatchedGPUTensor(
 }
 
 D3D12_UNORDERED_ACCESS_VIEW_DESC VideoFrameToTensorConverter::CreateUAVDescription(
+    uint64_t offset,
     const UINT32 batchIdx,
-    const D3D12_RESOURCE_DESC& resourceDesc,
     const _winml::ImageTensorDescription& desc) {
   UINT uiTensorElementSize =
       desc.dataType == kImageTensorDataTypeFloat32 ? sizeof(UINT) : sizeof(uint16_t);
@@ -587,7 +588,7 @@ D3D12_UNORDERED_ACCESS_VIEW_DESC VideoFrameToTensorConverter::CreateUAVDescripti
   D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
   uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
   UINT singleImageSize = static_cast<UINT>(desc.sizes[1] * desc.sizes[2] * desc.sizes[3]);
-  uavDesc.Buffer.FirstElement = batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
+  uavDesc.Buffer.FirstElement = offset / uiTensorElementSize + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
   uavDesc.Buffer.NumElements = singleImageSize;
   uavDesc.Buffer.CounterOffsetInBytes = 0;
   uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
diff --git a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h
index b7b8333313054..8dac4cd9bd458 100644
--- a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h
+++ b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h
@@ -15,6 +15,7 @@ class TensorToVideoFrameConverter : public ImageConverter {
   // Function takes in a tensor DX12 Resource all compute ops should be completed
   // converts it to a VideoFrame backed by either a SoftwareBitmap or D3DSurface
   void DX12TensorToVideoFrame(
+      _In_ uint64_t inputTensorOffset,
       _In_ UINT32 batch_index,
       _In_ winml::LearningModelSession& session,
       _In_ ID3D12Resource* input_tensor,
@@ -47,6 +48,7 @@ class TensorToVideoFrameConverter : public ImageConverter {
   Microsoft::WRL::ComPtr<ID3D11Texture2D> ShareD3D12Texture(ID3D12Resource* pResource, ID3D11Device* pDevice);
 
   void ConvertGPUTensorToSoftwareBitmap(
+      _In_ uint64_t inputTensorOffset,
       _In_ UINT32 batch_index,
       _In_ ID3D12Resource* input_tensor,
       _In_ _winml::D3DDeviceCache& device_cache,
@@ -54,6 +56,7 @@ class TensorToVideoFrameConverter : public ImageConverter {
       _Inout_ wgi::SoftwareBitmap& software_bitmap);
 
   void ConvertGPUTensorToDX12Texture(
+      _In_ uint64_t inputTensorOffset,
       _In_ UINT32 batch_index,
       _In_ ID3D12Resource* input_resource,
       _In_ _winml::D3DDeviceCache& device_cache,
@@ -61,6 +64,7 @@ class TensorToVideoFrameConverter : public ImageConverter {
       _Inout_ ID3D12Resource* output_resource);
 
   void ConvertDX12TensorToUnsupportedVideoFrameFormat(
+      _In_ uint64_t input_tensor_offset,
       _In_ UINT32 batch_index,
       _In_ ID3D12Resource* input_tensor,
       _In_ _winml::D3DDeviceCache& device_cache,
@@ -68,6 +72,7 @@ class TensorToVideoFrameConverter : public ImageConverter {
       _Inout_ wm::VideoFrame& unsupported_video_frame);
 
   static D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor(
+      uint64_t offset,
       const UINT32 batch_index,
       const D3D12_RESOURCE_DESC& resource_description,
       const ImageTensorDescription& description);
@@ -81,4 +86,4 @@ class TensorToVideoFrameConverter : public ImageConverter {
       const D3D11_TEXTURE2D_DESC& d3d11Desc,
       ID3D12Device* d3d12Device);
 };
-}  // namespace _winml
\ No newline at end of file
+}  // namespace _winml
diff --git a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h
index 4f0a010cc367f..e69f929936f2e 100644
--- a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h
+++ b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h
@@ -21,6 +21,7 @@ class VideoFrameToTensorConverter : public ImageConverter {
   // {upperleft X, upperleft Y, width, height} to be turned into a tensor.
   // If the region of interest is the entire VideoFrame, the input BitmapBounds should describe the entire image.
   void VideoFrameToDX12Tensor(
+      _In_ uint64_t output_tensor_offset,
       _In_ const UINT32 batch_index,
       _In_ winml::LearningModelSession& session,
       _In_ const wm::IVideoFrame& input_video_frame,
@@ -61,9 +62,11 @@ class VideoFrameToTensorConverter : public ImageConverter {
       _In_ _winml::D3DDeviceCache& device_cache,
       _In_ const wgi::BitmapBounds& input_bounds,
       _In_ const ImageTensorDescription& tensor_description,
+      _In_ uint64_t outputResourceOffset,
       _Inout_ ID3D12Resource* pOutputResource);
 
   void ConvertDX12TextureToGPUTensor(
+      _In_ uint64_t output_resource_offset,
       _In_ const UINT32 batch_index,
       _In_ ID3D12Resource* pInputResource,
       _In_ _winml::D3DDeviceCache& device_cache,
@@ -71,8 +74,8 @@ class VideoFrameToTensorConverter : public ImageConverter {
       _Inout_ ID3D12Resource* output_resource);
 
   static D3D12_UNORDERED_ACCESS_VIEW_DESC CreateUAVDescription(
+      uint64_t offset,
       const UINT32 batch_index,
-      const D3D12_RESOURCE_DESC& resource_description,
       const ImageTensorDescription& description);
 
   static void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor(
diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
index e294c91afc079..c285ca3646b1d 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
+++ b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
@@ -165,7 +165,7 @@ static auto GetStrings(const OrtApi* ort_api, const OrtValue* ort_value,
   return std::make_shared<std::pair<decltype(strings), decltype(buffer)>>(std::move(strings), std::move(buffer));
 }
 
-HRESULT OnnxruntimeValue::GetResource(_winml::Resource& out) {
+HRESULT OnnxruntimeValue::GetResource(uint64_t size_in_bytes, _winml::Resource& out, uint64_t& offset) {
   auto ort_api = engine_->GetEngineFactory()->UseOrtApi();
 
   void* mutable_data = nullptr;
@@ -188,7 +188,7 @@ HRESULT OnnxruntimeValue::GetResource(_winml::Resource& out) {
     auto allocator = UniqueOrtAllocator(ort_allocator, ort_api->ReleaseAllocator);
 
     winrt::com_ptr<ID3D12Resource> resource;
-    RETURN_HR_IF_NOT_OK_MSG(ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), mutable_data, resource.put()),
+    RETURN_HR_IF_NOT_OK_MSG(ort_dml_api->GetD3D12ResourceRegionFromAllocation(allocator.get(), mutable_data, size_in_bytes, resource.put(), &offset),
                             ort_api);
     out = _winml::Resource(resource.get(), [](void*) { /*do nothing, as this pointer is actually a com pointer! */ });
   } else {
@@ -1296,10 +1296,11 @@ HRESULT OnnxruntimeEngine::FillFromMapValue(IInspectable* map, winml::TensorKind
   std::vector<int64_t> keys_shape;
   keys_value->GetTensorShape(keys_shape);
 
+  uint64_t offset = 0;
   _winml::Resource keys_data;
-  RETURN_IF_FAILED(keys_value->GetResource(keys_data));
+  RETURN_IF_FAILED(keys_value->GetResource(0, keys_data, offset));
   _winml::Resource values_data;
-  RETURN_IF_FAILED(values_value->GetResource(values_data));
+  RETURN_IF_FAILED(values_value->GetResource(0, values_data, offset));
 
   auto num_elements = static_cast<size_t>(ShapeSize(keys_shape.data(), keys_shape.size()));
   GetAbiMapFiller(key_kind, value_kind)(map, num_elements, keys_data.get(), values_data.get());
diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.h b/winml/lib/Api.Ort/OnnxruntimeEngine.h
index 097941e78f1a5..7c53886da9821 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngine.h
+++ b/winml/lib/Api.Ort/OnnxruntimeEngine.h
@@ -32,7 +32,7 @@ class OnnxruntimeValue : public Microsoft::WRL::RuntimeClass<
   STDMETHOD(IsCpu)
   (bool* out) override;
   STDMETHOD(GetResource)
-  (_winml::Resource& resource) override;
+  (uint64_t size_in_bytes, _winml::Resource& resource, uint64_t& offset) override;
   STDMETHOD(IsTensor)
   (bool* out) override;
   STDMETHOD(IsOfTensorType)
diff --git a/winml/lib/Api/ImageFeatureValue.cpp b/winml/lib/Api/ImageFeatureValue.cpp
index 1b8be103c0e5a..6622893b88a06 100644
--- a/winml/lib/Api/ImageFeatureValue.cpp
+++ b/winml/lib/Api/ImageFeatureValue.cpp
@@ -214,12 +214,12 @@ static _winml::ImageTensorDescription CreateImageTensorDescriptor(winml::TensorK
     THROW_HR(E_NOTIMPL);
   }
 
-  if (pixelRange != winml::LearningModelPixelRange::ZeroTo255 && 
+  if (pixelRange != winml::LearningModelPixelRange::ZeroTo255 &&
       pixelRange != winml::LearningModelPixelRange::ZeroToOne &&
       pixelRange != winml::LearningModelPixelRange::MinusOneToOne) {
     THROW_HR(E_NOTIMPL);
   }
-  
+
   tensorDescription.pixelRange = pixelRange;
   tensorDescription.sizes[2] = height;
   tensorDescription.sizes[3] = width;
@@ -275,6 +275,7 @@ static void GPUTensorize(
     _winml::ImageTensorDescription tensorDescriptor,
     com_ptr<LearningModelSession> spSession,
     ID3D12Resource* d3dResource,
+    uint64_t resourceOffset,
     _winml::BindingContext& context) {
   auto spDevice = spSession->Device().as<LearningModelDevice>();
 
@@ -291,6 +292,7 @@ static void GPUTensorize(
       // Apply tensorization
       auto session = spSession.as<winml::LearningModelSession>();
       pooledConverter->Get()->Tensorizer->VideoFrameToDX12Tensor(
+          resourceOffset,
           batchIdx,
           session,
           videoFrames.GetAt(batchIdx),
@@ -417,7 +419,7 @@ std::optional<ImageFeatureValue::ImageResourceMetadata> ImageFeatureValue::GetIn
   } else {
     THROW_HR(WINML_ERR_INVALID_BINDING);
   }
-  
+
   //NCHW layout
   auto imageTensorDescriptor = CreateImageTensorDescriptor(tensorKind, pixelFormat.value(), pixelRange.value(), m_batchSize, descriptorWidth, descriptorHeight);
 
@@ -447,21 +449,23 @@ HRESULT ImageFeatureValue::GetValue(_winml::BindingContext& context, _winml::IVa
         winml::TensorKind::Float : winml::TensorKind::Float16,
       value.put()));
 
+  auto bufferSize = std::accumulate(std::begin(resourceMetadata.TensorDescriptor.sizes), std::end(resourceMetadata.TensorDescriptor.sizes), static_cast<int64_t>(1), std::multiplies<int64_t>());
+  auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize;
+
   // Get the tensor raw data
   _winml::Resource void_resource;
-  RETURN_IF_FAILED(value->GetResource(void_resource));
+  uint64_t offset = 0;
+  RETURN_IF_FAILED(value->GetResource(bufferByteSize, void_resource, offset));
 
   if (context.type == _winml::BindingType::kInput) {
     // Only tensorize inputs
-    auto bufferSize = std::accumulate(std::begin(resourceMetadata.TensorDescriptor.sizes), std::end(resourceMetadata.TensorDescriptor.sizes), static_cast<int64_t>(1), std::multiplies<int64_t>());
-    auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize;
     auto singleFrameBufferSize = bufferByteSize / m_batchSize;
     if (spDevice->IsCpuDevice()) {
       auto resource = reinterpret_cast<BYTE*>(void_resource.get());
       CPUTensorize(m_videoFrames, resourceMetadata.Bounds, resourceMetadata.TensorDescriptor, spSession, resource, static_cast<unsigned int>(singleFrameBufferSize));
     } else {
       auto resource = reinterpret_cast<ID3D12Resource*>(void_resource.get());
-      GPUTensorize(m_videoFrames, resourceMetadata.Bounds, resourceMetadata.TensorDescriptor, spSession, resource, context);
+      GPUTensorize(m_videoFrames, resourceMetadata.Bounds, resourceMetadata.TensorDescriptor, spSession, resource, offset, context);
     }
   }
 
@@ -481,14 +485,18 @@ HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& cont
   auto spSession = context.session.as<LearningModelSession>();
   auto spDevice = spSession->Device().as<LearningModelDevice>();
 
-  // Get the output tensor raw data
-  _winml::Resource void_resource;
-  RETURN_IF_FAILED(value->GetResource(void_resource));
-
   // Get the run context
   auto metadata = GetInputMetadata(context);
   ImageResourceMetadata resourceMetadata = metadata.value();
 
+  auto bufferSize = std::accumulate(std::begin(resourceMetadata.TensorDescriptor.sizes), std::end(resourceMetadata.TensorDescriptor.sizes), static_cast<int64_t>(1), std::multiplies<int64_t>());
+  auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize / m_batchSize;
+
+  // Get the output tensor raw data
+  _winml::Resource void_resource;
+  uint64_t offset = 0;
+  RETURN_IF_FAILED(value->GetResource(bufferByteSize, void_resource, offset));
+
   _winml::ConverterResourceDescription descriptor = {};
   descriptor.width = static_cast<int>(resourceMetadata.TensorDescriptor.sizes[3]);
   descriptor.height = static_cast<int>(resourceMetadata.TensorDescriptor.sizes[2]);
@@ -500,9 +508,6 @@ HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& cont
 
     auto pooledConverter = _winml::PoolObjectWrapper::Create(spDevice->DetensorizerStore()->Fetch(descriptor));
 
-    auto bufferSize = std::accumulate(std::begin(resourceMetadata.TensorDescriptor.sizes), std::end(resourceMetadata.TensorDescriptor.sizes), static_cast<int64_t>(1), std::multiplies<int64_t>());
-    auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize / m_batchSize;
-
     BYTE* resource = reinterpret_cast<BYTE*>(void_resource.get());
     for (uint32_t batchIdx = 0; batchIdx < m_batchSize; ++batchIdx) {
       // Convert Software Tensor to VideoFrame one by one based on the buffer size.
@@ -521,6 +526,7 @@ HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& cont
     for (uint32_t batchIdx = 0; batchIdx < m_batchSize; ++batchIdx) {
       auto videoFrame = m_videoFrames.GetAt(batchIdx);
       pooledConverter->Get()->Detensorizer->DX12TensorToVideoFrame(
+          offset,
           batchIdx,
           context.session,
           d3dResource,
diff --git a/winml/lib/Api/impl/TensorBase.h b/winml/lib/Api/impl/TensorBase.h
index 0d210eb2d7694..02c837c494a7e 100644
--- a/winml/lib/Api/impl/TensorBase.h
+++ b/winml/lib/Api/impl/TensorBase.h
@@ -148,7 +148,7 @@ struct TensorBase : TBase {
 
     // If there is no matching gpu resource, then fallback to a cpu resource
     if (CpuTensor() != nullptr) {
-      auto num_backing_buffers = CpuTensor()->num_buffers(); 
+      auto num_backing_buffers = CpuTensor()->num_buffers();
       if (num_backing_buffers == 1) {
         // If we have a single backing cpu buffer, there is no need to create GPU resources.
         // The engine will use the buffer provided, and perform the needed copies into the GPU context as needed.
@@ -360,11 +360,13 @@ struct TensorBase : TBase {
         resources_,
         "The tensor has been closed and its resources have been detached during evaluation!");
 
-    _winml::Resource updated_resource;
-    RETURN_IF_FAILED(value->GetResource(updated_resource));
-
     // get the shape
     RETURN_IF_FAILED_MSG(value->GetTensorShape(shape_), "Failed to get the tensor shape from resource!");
+    auto buffer_size_in_bytes = static_cast<size_t>(ShapeSize(shape_)) * sizeof(T);
+
+    _winml::Resource updated_resource;
+    uint64_t offset = 0;
+    RETURN_IF_FAILED(value->GetResource(buffer_size_in_bytes, updated_resource, offset));
 
     bool is_cpu;
     bool isCpuOutput = SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu;
@@ -406,8 +408,6 @@ struct TensorBase : TBase {
                              "Failed to prepare buffer for copy back from device resource.");
         RETURN_IF_FAILED(engine->CopyValueAcrossDevices(value, dest.get()));
       } else {
-        auto buffer_size_in_bytes = static_cast<size_t>(ShapeSize(shape_)) * sizeof(T);
-
         _winml::ConverterResourceDescription descriptor = {};
         descriptor.pixel_format = static_cast<DWORD>(wgdx::DirectXPixelFormat::Unknown);
         descriptor.luid = device->GetD3DDevice()->GetAdapterLuid();  // Converted image on GPU
@@ -526,7 +526,7 @@ struct TensorBase : TBase {
   }
   WINML_CATCH_ALL
 
-  
+
   // ITensor<T>::CreateFromBatchedBuffersInternal
   static typename TBase::class_type CreateFromBatchedBuffersInternal(
       std::vector<int64_t> shape,
diff --git a/winml/lib/Common/inc/iengine.h b/winml/lib/Common/inc/iengine.h
index 1db8bc4568aac..a686b585841c3 100644
--- a/winml/lib/Common/inc/iengine.h
+++ b/winml/lib/Common/inc/iengine.h
@@ -19,7 +19,7 @@ IValue : IUnknown {
   (bool* out) PURE;
 
   STDMETHOD(GetResource)
-  (_winml::Resource & resource) PURE;
+  (uint64_t size_in_bytes, _winml::Resource & resource, uint64_t& offset) PURE;
 
   STDMETHOD(IsTensor)
   (bool* out) PURE;

From 93d931b5cb4e16ac36bf7457c40698cc3ffe4696 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 1 Feb 2023 15:29:10 -0800
Subject: [PATCH 28/76] WIP: set m_persistentResource from the pooled buffer in FusedGraphKernel

---
 .../providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index 9dfc1672708c0..502bb187bb0db 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -65,6 +65,7 @@ namespace Dml
             if (persistentResourceSize > 0)
             {
                 auto buffer = m_provider->AllocatePooledResource(persistentResourceSize);
+                m_persistentResource = buffer.ResourceInUavState();
                 m_persistentResourceBinding = buffer.GetBufferBinding();
                 m_managedPersistentBuffer = wil::MakeOrThrow<DmlManagedBuffer>(std::move(buffer));
                 m_winmlProvider->QueueReference(m_managedPersistentBuffer.Get());

From 9c03955a2bd2d80f26afe1eddc6639623110680b Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 26 Apr 2023 16:23:42 -0700
Subject: [PATCH 29/76] Add hack to work around OOM errors with upload heaps

---
 .../src/PooledUploadHeap.cpp                  | 44 +++++--------------
 1 file changed, 10 insertions(+), 34 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp
index db5fd301cfdd0..442b3e7ddf746 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp
@@ -99,36 +99,27 @@ namespace Dml
         auto heap = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD);
         auto buffer = CD3DX12_RESOURCE_DESC::Buffer(sizeInBytes);
 
-        ORT_THROW_IF_FAILED(device->CreateCommittedResource(
+        HRESULT hr = device->CreateCommittedResource(
             &heap,
             D3D12_HEAP_FLAG_NONE,
             &buffer,
             D3D12_RESOURCE_STATE_GENERIC_READ,
             nullptr,
-            IID_GRAPHICS_PPV_ARGS(uploadBuffer.ReleaseAndGetAddressOf())));
+            IID_GRAPHICS_PPV_ARGS(uploadBuffer.ReleaseAndGetAddressOf()));
+
+        if (hr == DXGI_ERROR_DEVICE_REMOVED)
+        {
+            ORT_THROW_IF_FAILED(device->GetDeviceRemovedReason());
+        }
+        ORT_THROW_IF_FAILED(hr);
 
         return Chunk{ sizeInBytes, std::move(uploadBuffer) };
     }
 
     std::pair<PooledUploadHeap::Chunk*, size_t> PooledUploadHeap::Reserve(size_t sizeInBytes)
     {
-        // Try to find a chunk with enough free space to accommodate the requested allocation size
-        for (Chunk& chunk : m_chunks)
-        {
-            std::optional<size_t> offsetForAllocation = FindOffsetForAllocation(chunk, sizeInBytes);
-            if (offsetForAllocation)
-            {
-                // There's enough space in this chunk - return
-                return std::make_pair(&chunk, *offsetForAllocation);
-            }
-        }
-
-        // No chunks were able to accommodate the allocation - create a new chunk and return that instead
-
         // At least double the capacity of the pool
-        const size_t newChunkSize = std::max({ m_totalCapacity, c_minChunkSize, sizeInBytes });
-        m_chunks.push_back(CreateChunk(m_device.Get(), newChunkSize));
-        m_totalCapacity += newChunkSize;
+        m_chunks.push_back(CreateChunk(m_device.Get(), sizeInBytes));
 
         // Allocate from the beginning of the new chunk
         return std::make_pair(&m_chunks.back(), 0);
@@ -206,13 +197,6 @@ namespace Dml
             return c.allocations.empty();
         });
         m_chunks.erase(it, m_chunks.end());
-
-        // Re-calculate total capacity
-        m_totalCapacity = 0;
-        for (const auto& chunk : m_chunks)
-        {
-            m_totalCapacity += chunk.capacityInBytes;
-        }
     }
 
     void PooledUploadHeap::AssertInvariants()
@@ -224,7 +208,7 @@ namespace Dml
         };
 
         // Chunks should be sorted by ascending capacity
-        assert(std::is_sorted(m_chunks.begin(), m_chunks.end(), chunkCapacityComparer));
+        // assert(std::is_sorted(m_chunks.begin(), m_chunks.end(), chunkCapacityComparer));
 
         // Allocations in a chunk should be sorted by ascending fence value
         for (const auto& chunk : m_chunks)
@@ -270,14 +254,6 @@ namespace Dml
             }
         }
 
-        // Validate total capacity of pool
-        size_t calculatedCapacity = 0;
-        for (const auto& chunk : m_chunks)
-        {
-            calculatedCapacity += chunk.capacityInBytes;
-        }
-        assert(calculatedCapacity == m_totalCapacity);
-
     #endif // #ifdef _DEBUG
     }
 } // namespace Dml

From e34abafb9015f3c55c32e951ad96001658a02f81 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 5 Jul 2023 17:00:43 -0700
Subject: [PATCH 30/76] Fix DFT: use aFFTBufferRegion for the Bluestein A_FFT passes and drop unused resource lookups

---
 .../src/Operators/DmlDFT.h                    | 28 ++++++++-----------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index c69fcdf035e21..33fef8c4c0d04 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -402,16 +402,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             auto outputDims = GetTensorDimensions(outputTensor.Get());
             ORT_THROW_HR_IF(E_FAIL, inputDims.size() != outputDims.size());
 
-            ComPtr<IUnknown> inputUnknown;
-            ComPtr<ID3D12Resource> inputResource;
-            inputTensor->GetDataInterface(inputUnknown.GetAddressOf());
-            ORT_THROW_IF_FAILED(inputUnknown.As(&inputResource));
-
-            ComPtr<IUnknown> outputUnknown;
-            ComPtr<ID3D12Resource> outputResource;
-            outputTensor->GetDataInterface(outputUnknown.GetAddressOf());
-            ORT_THROW_IF_FAILED(outputUnknown.As(&outputResource));
-
             // Get optional dft_length input
             uint32_t dftLength = inputDims[onnxruntime::narrow<size_t>(m_axis)];
             ComPtr<IMLOperatorTensor> dftLengthTensor;
@@ -685,8 +675,10 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             // Padding should be handled by the shader.
             PrepareStockhamFFTParams(
                 context,
-                inputBufferRegion, inputDims,
-                zChirpBufferRegion, params.BluesteinZChirpParams.AFFT.Sizes,
+                inputBufferRegion,
+                inputDims,
+                aFFTBufferRegion,
+                params.BluesteinZChirpParams.AFFT.Sizes,
                 M,
                 m_axis,
                 1,
@@ -698,8 +690,10 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             // Therefore the window function logic shold hangle complex multiplication, and B_FTT should be used like a window function.
             PrepareStockhamFFTParams(
                 context,
-                zChirpBufferRegion, params.BluesteinZChirpParams.AFFT.Sizes,
-                outputBufferRegion, outputDims,
+                aFFTBufferRegion,
+                params.BluesteinZChirpParams.AFFT.Sizes,
+                outputBufferRegion,
+                outputDims,
                 M,
                 1,
                 m_axis,
@@ -715,8 +709,10 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             // The BFFT call takes input B, and produces output B_FFT.
             PrepareStockhamFFTParams(
                 context,
-                bBufferRegion, params.BluesteinZChirpParams.B.Sizes,
-                bFFTBufferRegion, params.BluesteinZChirpParams.BFFT.Sizes,
+                bBufferRegion,
+                params.BluesteinZChirpParams.B.Sizes,
+                bFFTBufferRegion,
+                params.BluesteinZChirpParams.BFFT.Sizes,
                 M,
                 2,
                 2,

From 00708a6a34cf6e02a1906d053093ea11ed9065f3 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 5 Jul 2023 18:56:24 -0700
Subject: [PATCH 31/76] Register external allocator

---
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp         | 3 ++-
 .../providers/dml/DmlExecutionProvider/src/ExecutionProvider.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 871551597f468..33c2a99d19e79 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -212,9 +212,10 @@ namespace Dml
             m_context->SetAllocator(m_gpuAllocator);
             // CPU Allocator used to create buffers for the MemcpyFromHost, Shape and Size operators.
             m_cpuInputAllocator = std::make_shared<DmlCpuAllocator>(OrtMemType::OrtMemTypeCPUInput);
+            m_externalGpuAllocator = std::make_shared<DmlExternalGpuAllocator>();
         }
 
-        return std::vector<onnxruntime::AllocatorPtr>{m_gpuAllocator, m_cpuInputAllocator,};
+        return std::vector<onnxruntime::AllocatorPtr>{m_gpuAllocator, m_externalGpuAllocator, m_cpuInputAllocator};
     }
 
     HRESULT __stdcall ExecutionProviderImpl::GetD3DDevice(_COM_Outptr_ ID3D12Device** d3dDevice) const noexcept
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index 1ddf1ca5dfe30..a959042dab32c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -28,6 +28,7 @@ namespace Dml
     class DmlCpuAllocator;
     class ExecutionProvider;
     class DmlGpuAllocator;
+    class DmlExternalGpuAllocator;
     struct TaggedPointer;
 
     class ExecutionProviderImpl : public WRL::Base<Dml::IExecutionProvider,
@@ -183,6 +184,7 @@ namespace Dml
         std::shared_ptr<BucketizedBufferAllocator> m_subAllocator;
         std::shared_ptr<onnxruntime::IAllocator> m_bfcAllocator;
         std::shared_ptr<DmlGpuAllocator> m_gpuAllocator;
+        std::shared_ptr<DmlExternalGpuAllocator> m_externalGpuAllocator;
         std::shared_ptr<DmlCpuAllocator> m_cpuInputAllocator;
         std::shared_ptr<onnxruntime::KernelRegistry> m_kernelRegistry;
         std::shared_ptr<const Windows::AI::MachineLearning::Adapter::InternalRegistrationInfoMap> m_internalRegInfoMap;

From 9927336dd4526b3a346b6be8b98b9ee59eda6b70 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 6 Jul 2023 08:54:54 -0700
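Subject: [PATCH 32/76] Fix DFT and STFT: issue a global UAV + aliasing barrier after dispatch and hold the STFT persistent buffer as a DmlBuffer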

---
 .../src/Operators/DmlDFT.h                    | 14 ++---
 .../src/Operators/DmlSTFT.h                   | 59 ++++++++-----------
 2 files changed, 30 insertions(+), 43 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index 33fef8c4c0d04..ed1a6ebe49171 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -838,15 +838,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
-        D3D12_RESOURCE_BARRIER uav_barriers[TSize+1];
-
-        std::transform(
-            bufferRegions.begin(), bufferRegions.end(),
-            uav_barriers,
-            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
-        uav_barriers[TSize] = CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr);
-        commandList->ResourceBarrier(TSize, uav_barriers);
-
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -895,7 +886,10 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        commandList->ResourceBarrier(3, uav_barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
index fc7684242b290..cd1f78e2a23a6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
@@ -192,9 +192,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
         ComPtr<ID3D12DescriptorHeap> descriptorHeap;
         ComPtr<IDMLBindingTable> bindingTable;
         ComPtr<IDMLCommandRecorder> commandRecorder;
-        ComPtr<ID3D12Resource> persistentResource;
-        ComPtr<IUnknown> persistentResourcePoolingUnk;
-        std::optional<DML_BUFFER_BINDING> persistentResourceBinding;
+        std::optional<Dml::DmlBuffer> persistentBufferRegion;
         bool hasWindowTensor = false;
         uint64_t signalBufferSizeInBytes = 0;
         uint64_t windowBufferSizeInBytes = 0;
@@ -315,20 +313,29 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
 
         // Initialize
         {
+            std::vector<DML_BUFFER_BINDING> initializationInputBindings(params.hasWindowTensor ? 2 : 1);
+
             uint64_t persistentResourceSize = m_framingOperator.op->GetBindingProperties().PersistentResourceSize;
             if (persistentResourceSize > 0)
             {
-                auto buffer = m_dmlProvider->AllocatePooledResource(persistentResourceSize);
-                m_framingOperator.persistentResource = buffer.ResourceInUavState();
-                m_framingOperator.persistentResourceBinding = buffer.GetBufferBinding();
+                m_framingOperator.persistentBufferRegion = m_dmlProvider->AllocatePooledResource(persistentResourceSize);
+                auto binding = m_framingOperator.persistentBufferRegion->GetBufferBinding();
+                ORT_THROW_IF_FAILED(m_dmlProvider->InitializeOperator(
+                    m_framingOperator.op.Get(),
+                    &binding,
+                    gsl::make_span(initializationInputBindings)
+                ));
+            }
+            else
+            {
+                ORT_THROW_IF_FAILED(m_dmlProvider->InitializeOperator(
+                    m_framingOperator.op.Get(),
+                    nullptr,
+                    gsl::make_span(initializationInputBindings)
+                ));
             }
 
-            std::vector<DML_BUFFER_BINDING> initializationInputBindings(params.hasWindowTensor ? 2 : 1);
-            ORT_THROW_IF_FAILED(m_dmlProvider->InitializeOperator(
-                m_framingOperator.op.Get(),
-                m_framingOperator.persistentResourceBinding ? &*m_framingOperator.persistentResourceBinding : nullptr,
-                gsl::make_span(initializationInputBindings)
-            ));
+
         }
 
         auto execBindingProps = m_framingOperator.op->GetBindingProperties();
@@ -398,11 +405,6 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
         std::array<DML_BINDING_DESC, 2> inputBindings;
         uint32_t inputBindingsCount = 1;
 
-        // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking
-        // barrierCount is outside the valid range.
-        D3D12_RESOURCE_BARRIER barriers[3];
-        uint32_t barrierCount = 0;
-
         Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal);
         inputBuffers[0] = signalBufferRegion.GetBufferBinding();
         inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] };
@@ -435,31 +437,22 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
         auto persistentBufferSize = bindingProps.PersistentResourceSize;
         if (persistentBufferSize > 0)
         {
-            DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &*m_framingOperator.persistentResourceBinding };
+            assert(m_framingOperator.persistentBufferRegion.has_value());
+            auto persistentResourceBinding = m_framingOperator.persistentBufferRegion->GetBufferBinding();
+            DML_BINDING_DESC bindingDesc = { DML_BINDING_TYPE_BUFFER, &persistentResourceBinding };
             m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc);
         }
 
-        // Transition resources COMMON -> UAV
-        D3D12_RESOURCE_BARRIER uav_barriers[4];
-        uav_barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(signalBufferRegion.ResourceInUavState());
-        uav_barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(windowBufferRegion.ResourceInUavState());
-        uav_barriers[2] = CD3DX12_RESOURCE_BARRIER::UAV(outputBufferRegion.ResourceInUavState());
-        uav_barriers[3] = CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr);
-        commandList->ResourceBarrier(barrierCount, barriers);
-
         m_framingOperator.commandRecorder->RecordDispatch(
             commandList,
             m_framingOperator.op.Get(),
             m_framingOperator.bindingTable.Get()
         );
 
-        // Transition resources UAV -> COMMON
-        for (uint32_t barrierIndex = 0; barrierIndex < barrierCount; barrierIndex++)
-        {
-            std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter);
-        }
-
-        commandList->ResourceBarrier(barrierCount, barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 };
 

From c20690fee00160e5a8a39374605e8574cbf983a9 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 6 Jul 2023 09:47:49 -0700
Subject: [PATCH 33/76] Grid sample

---
 .../src/Operators/DmlGridSample.h             | 125 ++++--------------
 1 file changed, 29 insertions(+), 96 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
index c63863853fb4e..8863bd5362d27 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
@@ -4,6 +4,7 @@
 #include "../MLOperatorAuthorImpl.h"
 
 #include "../External/D3DX12/d3dx12.h"
+#include "core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h"
 #include <d3d12.h>
 
 // NOTE: When this operator's implementation is moved into DML, the associated FP16 fallback
@@ -329,15 +330,6 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
     ComPtr<ID3D12PipelineState> m_gridSamplePipelineState;
     DmlGridSampleParameters m_params = {};
 
-
-    // Allocate temporary buffers if needed
-    struct ResourceDesc
-    {
-        ComPtr<ID3D12Resource> Resource;
-        std::array<uint32_t, 4> Sizes;
-        std::array<uint32_t, 4> Strides;
-    };
-
     struct GridSampleShaderConstants
     {
         uint32_t StartIndex;
@@ -623,29 +615,18 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
             auto gridDims = GetTensorDimensions(gridTensor.Get());
             auto outputDims = GetTensorDimensions(outputTensor.Get());
 
-            ComPtr<IUnknown> inputUnknown;
-            ComPtr<ID3D12Resource> inputResource;
-            inputTensor->GetDataInterface(inputUnknown.GetAddressOf());
-            ORT_THROW_IF_FAILED(inputUnknown.As(&inputResource));
-
-            ComPtr<IUnknown> gridUnknown;
-            ComPtr<ID3D12Resource> gridResource;
-            gridTensor->GetDataInterface(gridUnknown.GetAddressOf());
-            ORT_THROW_IF_FAILED(gridUnknown.As(&gridResource));
-
-            ComPtr<IUnknown> outputUnknown;
-            ComPtr<ID3D12Resource> outputResource;
-            outputTensor->GetDataInterface(outputUnknown.GetAddressOf());
-            ORT_THROW_IF_FAILED(outputUnknown.As(&outputResource));
+            auto inputTensorWrapper = static_cast<Windows::AI::MachineLearning::Adapter::TensorWrapper*>(inputTensor.Get());
+            auto gridTensorWrapper = static_cast<Windows::AI::MachineLearning::Adapter::TensorWrapper*>(gridTensor.Get());
+            auto outputTensorWrapper = static_cast<Windows::AI::MachineLearning::Adapter::TensorWrapper*>(outputTensor.Get());
 
             return Compute(
                 commandList.Get(),
                 context,
-                inputResource.Get(),
+                inputTensorWrapper->GetBufferRegion(),
                 inputDims,
-                gridResource.Get(),
+                gridTensorWrapper->GetBufferRegion(),
                 gridDims,
-                outputResource.Get(),
+                outputTensorWrapper->GetBufferRegion(),
                 outputDims
             );
         }
@@ -660,21 +641,21 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
     HRESULT Compute(
         ID3D12GraphicsCommandList* commandList,
         IMLOperatorKernelContext* context,
-        ID3D12Resource* inputResource,
+        const Dml::D3D12BufferRegion& inputBufferRegion,
         gsl::span<const uint32_t> inputDims,
-        ID3D12Resource* gridResource,
+        const Dml::D3D12BufferRegion& gridBufferRegion,
         gsl::span<const uint32_t> gridDims,
-        ID3D12Resource* outputResource,
+        const Dml::D3D12BufferRegion& outputBufferRegion,
         gsl::span<const uint32_t> outputDims)
     {
         try
         {
             GridSample(
-                inputResource,
+                inputBufferRegion,
                 inputDims,
-                gridResource,
+                gridBufferRegion,
                 gridDims,
-                outputResource,
+                outputBufferRegion,
                 outputDims,
                 commandList);
         }
@@ -687,11 +668,11 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
     }
 
     void GridSample(
-        ID3D12Resource* inputResource,
+        const Dml::D3D12BufferRegion& inputBufferRegion,
         gsl::span<const uint32_t> inputDims,
-        ID3D12Resource* gridResource,
+        const Dml::D3D12BufferRegion& gridBufferRegion,
         gsl::span<const uint32_t> gridDims,
-        ID3D12Resource* outputResource,
+        const Dml::D3D12BufferRegion& outputBufferRegion,
         gsl::span<const uint32_t> outputDims,
         ID3D12GraphicsCommandList* commandList)
     {
@@ -702,33 +683,6 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         Dml::GetDescendingPackedStrides(gridDims, gridStrides);
         Dml::GetDescendingPackedStrides(outputDims, outputStrides);
 
-        // Transition resources from common to UAV state
-        D3D12_RESOURCE_BARRIER barriers[3];
-
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputResource,
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            gridResource,
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputResource,
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        inputResource->SetName(L"InputResource");
-        outputResource->SetName(L"OutputResource");
-        gridResource->SetName(L"GridResource");
-
-        commandList->ResourceBarrier(3, barriers);
-
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get());
         commandList->SetPipelineState(m_gridSamplePipelineState.Get());
@@ -747,29 +701,13 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         std::copy(outputStrides.begin(), outputStrides.end(), constants.OutputStrides);
 
         constants.ElementCount = ComputeElementCountFromDimensions(constants.OutputSizes);
-        std::array<ID3D12Resource*, 3> uav_resources = { inputResource, gridResource, outputResource };
-        Dispatch(uav_resources, constants, commandList);
-
-        // Transition resources to common state
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-                inputResource,
-                D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-                D3D12_RESOURCE_STATE_COMMON
-                );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-                gridResource,
-                D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-                D3D12_RESOURCE_STATE_COMMON
-                );
+        std::array<Dml::D3D12BufferRegion, 3> uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion };
+        Dispatch(uavBufferRegions, constants, commandList);
 
-        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
-                outputResource,
-                D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-                D3D12_RESOURCE_STATE_COMMON
-                );
-
-        commandList->ResourceBarrier(3, barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -782,25 +720,17 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
 
     template <typename TConstants, uint32_t TSize>
     void Dispatch(
-        std::array<ID3D12Resource*, TSize>& resources,
+        std::array<Dml::D3D12BufferRegion, TSize>& bufferRegions,
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
-        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
-
-        std::transform(
-            resources.begin(), resources.end(),
-            uav_barriers,
-            [](auto& resource) { return CD3DX12_RESOURCE_BARRIER::UAV(resource); } );
-        commandList->ResourceBarrier(TSize, uav_barriers);
-
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
-            if (resources[i]) {
+            if (bufferRegions[i]) {
                 commandList->SetComputeRootUnorderedAccessView(
                     i, // root parameter index
-                    resources[i]->GetGPUVirtualAddress()
+                    bufferRegions[i].ResourceInUavState()->GetGPUVirtualAddress() + bufferRegions[i].Offset()
                 );
             }
             else
@@ -842,7 +772,10 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        commandList->ResourceBarrier(2, uav_barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 };
 

From 0bb51245fcb75467e151daf30dc4c5193e9707f8 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 6 Jul 2023 18:31:34 -0700
Subject: [PATCH 34/76] Fix WinML API

---
 onnxruntime/core/providers/dml/dml_provider_factory.cc | 1 +
 winml/lib/Api.Image/VideoFrameToTensorConverter.cpp    | 2 +-
 winml/lib/Api/LearningModelBinding.cpp                 | 6 +++---
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index 2545c85d5fb14..91279be185ba9 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -255,6 +255,7 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceRegionFromAllocation,
   } else {
     ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT);
     auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation), size_in_bytes);
+    *offset = bufferRegion.Offset();
     *d3d_resource = bufferRegion.ResourceInUavState();
   }
 
diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
index 00aff0b740dd4..c223f44f1282b 100644
--- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
+++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
@@ -527,7 +527,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
   auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST);
   command_list_->ResourceBarrier(1, &barrier);
 
-  command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx, upload_heap_.Get(), outputResourceOffset, bufferSize);
+  command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx + outputResourceOffset, upload_heap_.Get(), 0, bufferSize);
 
   WINML_THROW_IF_FAILED(command_list_->Close());
   ID3D12CommandList* ppCommandLists[] = {command_list_.Get()};
diff --git a/winml/lib/Api/LearningModelBinding.cpp b/winml/lib/Api/LearningModelBinding.cpp
index 0c79117a345dd..dda127298530b 100644
--- a/winml/lib/Api/LearningModelBinding.cpp
+++ b/winml/lib/Api/LearningModelBinding.cpp
@@ -138,7 +138,7 @@ std::tuple<std::string, winrt::com_ptr<_winml::IValue>, _winml::BindingType> Lea
   // Hold onto the input output providers so that our memory doesnt get destroyed!
   auto providerInfo = ProviderInfo{inspectable, spLotusValueProvider, context};
   CacheProvider(name, providerInfo);
-  
+
   return std::make_tuple(name, value, bindingType);
 }
 
@@ -480,7 +480,7 @@ STDMETHODIMP LearningModelBinding::Bind(
     auto session = m_session.as<winmlp::LearningModelSession>();
     auto device = m_session.Device().as<winmlp::LearningModelDevice>();
     CWinMLAutoLock lock(!device->IsCpuDevice() ? session->GetDMLEPLock() : nullptr);
-    
+
     _winmlt::TelemetryEvent binding_event(_winmlt::EventCategory::kBinding);
     _winml::BindingType binding_type;
     std::string binding_name;
@@ -613,4 +613,4 @@ void LearningModelBinding::BindUnboundOutputs() {
   }
 }
 
-}  // namespace WINMLP
\ No newline at end of file
+}  // namespace WINMLP

From 6bc50491a28d7838a90c06355a7ca0235923f4e8 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Fri, 7 Jul 2023 09:04:29 -0700
Subject: [PATCH 35/76] Fix ImageTests.SynchronizeGPUWorkloads test failure

---
 .../dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp | 3 +++
 winml/lib/Api.Image/VideoFrameToTensorConverter.cpp            | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index e2ae8f23a3744..9b39fe9758876 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -395,6 +395,9 @@ namespace Dml
         size_in_bytes =
             (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT;
 
+        // Make sure the region we're trying to create fits entirely in the resource
+        assert(it->second->GetUavResource()->GetDesc().Width >= taggedPointer.offset + size_in_bytes);
+
         return D3D12BufferRegion(
             taggedPointer.offset,
             size_in_bytes,
diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
index c223f44f1282b..5536f7df203b7 100644
--- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
+++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
@@ -588,7 +588,7 @@ D3D12_UNORDERED_ACCESS_VIEW_DESC VideoFrameToTensorConverter::CreateUAVDescripti
   D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
   uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
   UINT singleImageSize = static_cast<UINT>(desc.sizes[1] * desc.sizes[2] * desc.sizes[3]);
-  uavDesc.Buffer.FirstElement = offset + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
+  uavDesc.Buffer.FirstElement = offset / uiTensorElementSize + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
   uavDesc.Buffer.NumElements = singleImageSize;
   uavDesc.Buffer.CounterOffsetInBytes = 0;
   uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;

From 14d1c9658095898cc6cace897915b71601c79d81 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Fri, 7 Jul 2023 19:21:07 -0700
Subject: [PATCH 36/76] Fix ConcurrencyTests.MultiThreadSingleSessionGpu

---
 .../src/BucketizedBufferAllocator.cpp                     | 3 +--
 .../DmlExecutionProvider/src/BucketizedBufferAllocator.h  | 2 +-
 .../dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp    | 2 +-
 .../dml/DmlExecutionProvider/src/DmlAllocationInfo.h      | 8 --------
 .../dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp | 2 +-
 .../dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp     | 6 ++++++
 .../dml/DmlExecutionProvider/src/DmlTaggedPointer.h       | 1 +
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp    | 6 ++----
 8 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 9b39fe9758876..5c488e4376733 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -289,7 +289,6 @@ namespace Dml
         ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
             this,
             ++m_currentAllocationId,
-            ++m_currentResourceId,
             resourceWrapper.Get(),
             size_in_bytes
         );
@@ -327,7 +326,7 @@ namespace Dml
         allocations_by_id_.erase(it);
     }
 
-    void BucketizedBufferAllocator::FreeResource(AllocationInfo* allocInfo, uint64_t pooledResourceId)
+    void BucketizedBufferAllocator::FreeResource(AllocationInfo* allocInfo)
     {
         // Since this allocator is warapped by ORT's BFC allocator, it's possible that the context is already
         // close at this point if the application is winding down.
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index 5af97b1cb53c7..73e7a0a317984 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -66,7 +66,7 @@ namespace Dml
 
         AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer);
 
-        void FreeResource(AllocationInfo* allocInfo, uint64_t pooledResourceId);
+        void FreeResource(AllocationInfo* allocInfo);
         uint64_t ComputeRequiredSize(size_t size);
         bool TilingEnabled() const { return tiling_enabled_; };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
index 044e9e854d700..a9560c0bd3c9a 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
@@ -12,7 +12,7 @@ namespace Dml
     {
         if (m_owner)
         {
-            m_owner->FreeResource(this, m_pooledResourceId);
+            m_owner->FreeResource(this);
         }
     }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
index 977de7c4887e2..492f87c77f1d0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
@@ -16,12 +16,10 @@ namespace Dml
         AllocationInfo(
             BucketizedBufferAllocator* owner,
             size_t id,
-            uint64_t pooledResourceId,
             DmlResourceWrapper* resourceWrapper,
             size_t requestedSize)
             : m_owner(owner)
             , m_allocationId(id)
-            , m_pooledResourceId(pooledResourceId)
             , m_resourceWrapper(resourceWrapper)
             , m_requestedSize(requestedSize)
         {}
@@ -63,15 +61,9 @@ namespace Dml
             return m_allocationId;
         }
 
-        uint64_t GetPooledResourceId() const
-        {
-            return m_pooledResourceId;
-        }
-
     private:
         BucketizedBufferAllocator* m_owner;
         size_t m_allocationId; // For debugging purposes
-        uint64_t m_pooledResourceId = 0;
         Microsoft::WRL::ComPtr<DmlResourceWrapper> m_resourceWrapper;
 
         // The size requested during Alloc(), which may be smaller than the physical resource size
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index e7d5b9d13fd78..cd6b241e70d48 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -102,7 +102,7 @@ namespace DmlGraphFusionHelper
         {
             // The allocation is not pooled
             auto allocInfo = static_cast<AllocationInfo*>(opaqueData);
-            *allocId = allocInfo->GetPooledResourceId();
+            *allocId = 0;
             return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
         }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
index da5ed6df2ff4c..8f503566768a1 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
@@ -38,4 +38,10 @@ namespace Dml
 
     return reinterpret_cast<void*>(ptr);
 }
+
+uint64_t TaggedPointer::GetUniqueId() const
+{
+    return reinterpret_cast<uint64_t>(TaggedPointer::Pack(device_id, allocation_id, offset));
+}
+
 } // namespace tfdml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
index 96b0eb318ad48..ee58e23a6396f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
@@ -29,6 +29,7 @@ struct TaggedPointer
         uint32_t allocation_id,
         uint64_t offset);
     static TaggedPointer Unpack(const void* ptr);
+    uint64_t GetUniqueId() const;
 };
 
 static_assert(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 33c2a99d19e79..831266cdd5bff 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -1040,7 +1040,7 @@ namespace Dml
     uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(const TaggedPointer& taggedPointer, bool isInternalOperator)
     {
         assert(!isInternalOperator);
-        return m_gpuAllocator->GetAllocationInfo(taggedPointer)->GetPooledResourceId();
+        return taggedPointer.GetUniqueId();
     }
 
     void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState(
@@ -1193,12 +1193,10 @@ namespace Dml
 
     void* CreateGPUAllocationFromD3DResource(ID3D12Resource* pResource)
     {
-        uint64_t pooledResourceId = 0; // Not a pooled resource
-
         ComPtr<DmlResourceWrapper> resourceWrapper;
         wil::MakeOrThrow<DmlCommittedResourceWrapper>(pResource).As(&resourceWrapper);
 
-        ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(nullptr, 0, pooledResourceId, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width);
+        ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(nullptr, 0, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width);
         return allocInfo.Detach();
     }
     void FreeGPUAllocation(void* ptr)

From a2809af25df6f81618a112af20597056e3d31205 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 11 Jul 2023 06:51:00 -0700
Subject: [PATCH 37/76] Add debug print statement to DmlCommandRecorder::CopyBufferRegion

---
 .../dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp          | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index af625334b7720..ab4d4f29abed9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -173,6 +173,7 @@ void DmlCommandRecorder::CopyBufferRegion(
     uint64_t srcOffset,
     uint64_t byteCount)
 {
+    printf("*****************DmlCommandRecorder::CopyBufferRegion\n");
     m_currentCommandList->CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount);
     m_operationsRecordedInCurrentCommandList = true;
 }

From 2f8bff801822ff0d21263f29d691f79099ecfadd Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 11 Jul 2023 06:52:48 -0700
Subject: [PATCH 38/76] Move CopyBufferRegion debug print to ExecutionProvider GPU -> GPU copy path

---
 .../dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp         | 1 -
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp          | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index ab4d4f29abed9..af625334b7720 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -173,7 +173,6 @@ void DmlCommandRecorder::CopyBufferRegion(
     uint64_t srcOffset,
     uint64_t byteCount)
 {
-    printf("*****************DmlCommandRecorder::CopyBufferRegion\n");
     m_currentCommandList->CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount);
     m_operationsRecordedInCurrentCommandList = true;
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 831266cdd5bff..afe90d9a25f1a 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -476,6 +476,8 @@ namespace Dml
         }
         else if (!src->IsCpuData() && !dst->IsCpuData())
         {
+            printf("*****************DmlCommandRecorder::CopyBufferRegion\n");
+
             //
             // GPU -> GPU copy
             //

From c024d0ad0e37d819bc248e40b6b50762ae73b5ff Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 11 Jul 2023 10:38:13 -0700
Subject: [PATCH 39/76] Use Identity for the copy operator

---
 .../src/Operators/DmlOperatorCopy.cpp         | 28 ++++---------------
 1 file changed, 6 insertions(+), 22 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
index 4ca51633d23e7..f8ef496b74d9b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
@@ -29,30 +29,14 @@ class DmlOperatorCopy : public DmlOperator
         ComPtr<IMLOperatorKernelCreationContextPrivate> contextPrivate;
         ORT_THROW_IF_FAILED(kernelInfo.GetInterface()->QueryInterface(contextPrivate.GetAddressOf()));
 
-        if (contextPrivate->IsDmlGraphNode())
-        {
-            std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
-            std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
+        std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
+        std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
 
-            DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {};
-            opDesc.InputTensor = inputDescs.data();
-            opDesc.OutputTensor = outputDescs.data();
+        DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {};
+        opDesc.InputTensor = inputDescs.data();
+        opDesc.OutputTensor = outputDescs.data();
 
-            SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo);
-        }
-    }
-
-    void Compute(const MLOperatorKernelContext& kernelContext)
-    {
-        MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0);
-
-        // Reshape the output tensor.
-        MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0);
-
-        // Copy elements from input tensor to output tensor.
-        ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor(
-            outputTensor.GetInterface().Get(),
-            inputTensor.GetInterface().Get()));
+        SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo);
     }
 };
 

From fef7df27290799bf33a480c054123e3ff25cdd82 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 11 Jul 2023 14:37:45 -0700
Subject: [PATCH 40/76] Add intermediate buffer for copying

---
 .../core/providers/dml/dml_provider_factory.h | 18 ++---
 .../DmlExecutionProvider/src/CommandQueue.cpp | 14 ++--
 .../src/DmlAllocationInfo.cpp                 |  2 +-
 .../src/DmlAllocationInfo.h                   |  8 +-
 .../src/DmlBfcAllocator.h                     |  6 +-
 .../src/DmlCommandRecorder.h                  |  2 +-
 .../src/DmlExternalGpuAllocator.h             |  2 +-
 .../src/DmlGpuAllocator.cpp                   |  4 +-
 .../src/DmlGpuAllocator.h                     |  6 +-
 ...pp => DmlReservedResourceSubAllocator.cpp} | 36 ++++-----
 ...or.h => DmlReservedResourceSubAllocator.h} | 18 ++---
 .../src/ExecutionContext.cpp                  | 73 +++++++++++++++++++
 .../src/ExecutionProvider.cpp                 |  4 +-
 .../src/ExecutionProvider.h                   |  4 +-
 14 files changed, 135 insertions(+), 62 deletions(-)
 rename onnxruntime/core/providers/dml/DmlExecutionProvider/src/{BucketizedBufferAllocator.cpp => DmlReservedResourceSubAllocator.cpp} (90%)
 rename onnxruntime/core/providers/dml/DmlExecutionProvider/src/{BucketizedBufferAllocator.h => DmlReservedResourceSubAllocator.h} (92%)

diff --git a/include/onnxruntime/core/providers/dml/dml_provider_factory.h b/include/onnxruntime/core/providers/dml/dml_provider_factory.h
index cbb6c6d2c9198..2ec3a10b08aed 100644
--- a/include/onnxruntime/core/providers/dml/dml_provider_factory.h
+++ b/include/onnxruntime/core/providers/dml/dml_provider_factory.h
@@ -101,16 +101,16 @@ struct OrtDmlApi {
   ORT_API2_STATUS(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* provider, _In_ void* dml_resource, _Out_ ID3D12Resource** d3d_resource);
 
   /**
-    * GetD3D12ResourceRegionFromAllocation
-    * This API gets the region of a D3D12 resource at a given offset when an OrtValue has been allocated by the DML EP.
-    * Note: Only the subregion of the resource delimited by `offset` and `offset + size_in_bytes` should be accessed
-    */
+   * GetD3D12ResourceRegionFromAllocation
+   * This API gets the region of a D3D12 resource at a given offset when an OrtValue has been allocated by the DML EP.
+   * Note: Only the subregion of the resource delimited by `offset` and `offset + size_in_bytes` should be accessed
+   */
   ORT_API2_STATUS(GetD3D12ResourceRegionFromAllocation,
-    _In_ OrtAllocator* provider,
-    _In_ void* dml_resource,
-    _In_ uint64_t size_in_bytes,
-    _Out_ ID3D12Resource** d3d_resource,
-    _Out_ uint64_t* offset);
+                  _In_ OrtAllocator* provider,
+                  _In_ void* dml_resource,
+                  _In_ uint64_t size_in_bytes,
+                  _Out_ ID3D12Resource** d3d_resource,
+                  _Out_ uint64_t* offset);
 };
 
 #ifdef __cplusplus
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
index 5516fc62cdda0..e5084772d4063 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
@@ -46,12 +46,12 @@ namespace Dml
         return GpuEvent{ m_lastFenceValue + 1, m_fence };
     }
 
-    void CommandQueue::QueueReference(IUnknown* object, bool waitForUnsubmittedWork) 
+    void CommandQueue::QueueReference(IUnknown* object, bool waitForUnsubmittedWork)
     {
-        // If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK 
-        // to queue additional references at this time, since those references would be leaked. This 
-        // affects any objects in m_queuedReferences whose destructors indirectly call QueueReference; 
-        // for example, an allocation from BucketizedBufferAllocator attempts to queue a reference 
+        // If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK
+        // to queue additional references at this time, since those references would be leaked. This
+        // affects any objects in m_queuedReferences whose destructors indirectly call QueueReference;
+        // for example, an allocation from DmlReservedResourceSubAllocator attempts to queue a reference
         // to its underlying D3D resource when freed. Furthermore, these references are unnecessary
         // since Close() already blocks for scheduled GPU work before clearing m_queuedReferences.
         if (!m_closing)
@@ -68,7 +68,7 @@ namespace Dml
             m_queuedReferences.push_back(queuedReference);
         }
     }
-    
+
     void CommandQueue::Close()
     {
         // Wait for flushed work:
@@ -79,7 +79,7 @@ namespace Dml
         m_queuedReferences.clear();
         m_closing = false;
     }
-    
+
     void CommandQueue::ReleaseCompletedReferences()
     {
         uint64_t completedValue = GetFence()->GetCompletedValue();
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
index a9560c0bd3c9a..52944d2c8b96a 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
@@ -3,7 +3,7 @@
 
 #include "precomp.h"
 #include "DmlAllocationInfo.h"
-#include "BucketizedBufferAllocator.h"
+#include "DmlReservedResourceSubAllocator.h"
 
 namespace Dml
 {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
index 492f87c77f1d0..546a42342a2a0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
@@ -7,14 +7,14 @@
 
 namespace Dml
 {
-    class BucketizedBufferAllocator;
+    class DmlReservedResourceSubAllocator;
 
     class AllocationInfo : public Microsoft::WRL::RuntimeClass<
         Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, IUnknown>
     {
     public:
         AllocationInfo(
-            BucketizedBufferAllocator* owner,
+            DmlReservedResourceSubAllocator* owner,
             size_t id,
             DmlResourceWrapper* resourceWrapper,
             size_t requestedSize)
@@ -26,7 +26,7 @@ namespace Dml
 
         ~AllocationInfo();
 
-        BucketizedBufferAllocator* GetOwner() const
+        DmlReservedResourceSubAllocator* GetOwner() const
         {
             return m_owner;
         }
@@ -62,7 +62,7 @@ namespace Dml
         }
 
     private:
-        BucketizedBufferAllocator* m_owner;
+        DmlReservedResourceSubAllocator* m_owner;
         size_t m_allocationId; // For debugging purposes
         Microsoft::WRL::ComPtr<DmlResourceWrapper> m_resourceWrapper;
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
index 17ba37146bdc5..43e093538fcb6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
@@ -4,14 +4,14 @@
 #pragma once
 
 #include "core/framework/allocator.h"
-#include "BucketizedBufferAllocator.h"
+#include "DmlReservedResourceSubAllocator.h"
 
 namespace Dml
 {
     class DmlBfcAllocator : public onnxruntime::IAllocator
     {
     public:
-        DmlBfcAllocator(std::shared_ptr<BucketizedBufferAllocator> subAllocator)
+        DmlBfcAllocator(std::shared_ptr<DmlReservedResourceSubAllocator> subAllocator)
         : onnxruntime::IAllocator(
             OrtMemoryInfo(
                 onnxruntime::DML,
@@ -24,6 +24,6 @@ namespace Dml
         void* Alloc(size_t size_in_bytes) final { return m_subAllocator->Alloc(size_in_bytes); }
         void Free(void* ptr) final { m_subAllocator->Free(ptr); }
     private:
-        std::shared_ptr<BucketizedBufferAllocator> m_subAllocator;
+        std::shared_ptr<DmlReservedResourceSubAllocator> m_subAllocator;
     };
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
index e442df1f1df6c..090166f296ffd 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
@@ -11,7 +11,7 @@
 namespace Dml
 {
     class CommandQueue;
-    class BucketizedBufferAllocator;
+    class DmlReservedResourceSubAllocator;
 
     class DmlCommandRecorder : public ICommandRecorder
     {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
index 6c5ee8cd29c6e..1c4d4b36628eb 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
@@ -7,7 +7,7 @@
 
 namespace Dml
 {
-    class BucketizedBufferAllocator;
+    class DmlReservedResourceSubAllocator;
     class AllocationInfo;
     struct TaggedPointer;
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
index 5bee9ee34ec4d..f2b62f2d41e64 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
@@ -6,12 +6,12 @@
 #include "precomp.h"
 #include "DmlGpuAllocator.h"
 #include "core/framework/allocator.h"
-#include "BucketizedBufferAllocator.h"
+#include "DmlReservedResourceSubAllocator.h"
 #include "DmlTaggedPointer.h"
 
 namespace Dml
 {
-    DmlGpuAllocator::DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<BucketizedBufferAllocator> subAllocator)
+    DmlGpuAllocator::DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<DmlReservedResourceSubAllocator> subAllocator)
     : onnxruntime::IAllocator(
         OrtMemoryInfo(
             onnxruntime::DML,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
index 3bc8127598460..39311055503d2 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
@@ -9,14 +9,14 @@
 
 namespace Dml
 {
-    class BucketizedBufferAllocator;
+    class DmlReservedResourceSubAllocator;
     class AllocationInfo;
     struct TaggedPointer;
 
     class DmlGpuAllocator : public onnxruntime::IAllocator
     {
     public:
-        DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<BucketizedBufferAllocator> subAllocator);
+        DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<DmlReservedResourceSubAllocator> subAllocator);
 
         void* Alloc(size_t size_in_bytes) final;
         void Free(void* ptr) final;
@@ -32,6 +32,6 @@ namespace Dml
 
         // This allocator is specific to DML and is used to decode the opaque data returned by the BFC
         // allocator into objects that DML understands
-        std::shared_ptr<BucketizedBufferAllocator> m_subAllocator;
+        std::shared_ptr<DmlReservedResourceSubAllocator> m_subAllocator;
     };
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
similarity index 90%
rename from onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
rename to onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index 5c488e4376733..c82e0a4f5d722 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -4,18 +4,18 @@
 #include "precomp.h"
 
 #include "core/session/onnxruntime_c_api.h"
-#include "BucketizedBufferAllocator.h"
+#include "DmlReservedResourceSubAllocator.h"
 #include "DmlReservedResourceWrapper.h"
 #include "DmlBufferRegion.h"
 
 namespace Dml
 {
-    BucketizedBufferAllocator::~BucketizedBufferAllocator()
+    DmlReservedResourceSubAllocator::~DmlReservedResourceSubAllocator()
     {
 #ifdef PRINT_OUTSTANDING_ALLOCATIONS
         if (!m_outstandingAllocationsById.empty())
         {
-            printf("BucketizedBufferAllocator outstanding allocation indices:\n");
+            printf("DmlReservedResourceSubAllocator outstanding allocation indices:\n");
             for (auto& entry : m_outstandingAllocationsById)
             {
                 printf("%u\n", static_cast<int>(entry.first));
@@ -25,7 +25,7 @@ namespace Dml
 #endif
     }
 
-    /*static*/ gsl::index BucketizedBufferAllocator::GetBucketIndexFromSize(uint64_t size)
+    /*static*/ gsl::index DmlReservedResourceSubAllocator::GetBucketIndexFromSize(uint64_t size)
     {
         assert(size != 0);
 
@@ -40,12 +40,12 @@ namespace Dml
         return index;
     }
 
-    /*static*/ uint64_t BucketizedBufferAllocator::GetBucketSizeFromIndex(gsl::index index)
+    /*static*/ uint64_t DmlReservedResourceSubAllocator::GetBucketSizeFromIndex(gsl::index index)
     {
         return (1ull << (index + c_minResourceSizeExponent));
     }
 
-    void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
+    void DmlReservedResourceSubAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
     {
         m_defaultRoundingMode = roundingMode;
     }
@@ -66,10 +66,10 @@ namespace Dml
 
     static uint64_t GetMaxHeapSizeInTiles()
     {
-        return BucketizedBufferAllocator::kDefaultMaxHeapSizeInTiles;
+        return DmlReservedResourceSubAllocator::kDefaultMaxHeapSizeInTiles;
     }
 
-    BucketizedBufferAllocator::BucketizedBufferAllocator(
+    DmlReservedResourceSubAllocator::DmlReservedResourceSubAllocator(
         ID3D12Device* device,
         std::shared_ptr<ExecutionContext> context,
         ID3D12CommandQueue* queue,
@@ -89,7 +89,7 @@ namespace Dml
     {
     }
 
-    absl::optional<DmlHeapAllocation> BucketizedBufferAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes)
+    absl::optional<DmlHeapAllocation> DmlReservedResourceSubAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes)
     {
         DmlHeapAllocation allocation = {};
 
@@ -207,7 +207,7 @@ namespace Dml
         return allocation;
     }
 
-    absl::optional<DmlHeapAllocation> BucketizedBufferAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes)
+    absl::optional<DmlHeapAllocation> DmlReservedResourceSubAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes)
     {
         DmlHeapAllocation allocation = {};
 
@@ -256,7 +256,7 @@ namespace Dml
         return allocation;
     }
 
-    uint64_t BucketizedBufferAllocator::ComputeRequiredSize(size_t size)
+    uint64_t DmlReservedResourceSubAllocator::ComputeRequiredSize(size_t size)
     {
         const uint64_t resource_size_in_tiles =
             1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
@@ -266,7 +266,7 @@ namespace Dml
         return resource_size_in_bytes;
     }
 
-    void* BucketizedBufferAllocator::Alloc(size_t size_in_bytes)
+    void* DmlReservedResourceSubAllocator::Alloc(size_t size_in_bytes)
     {
         // For some reason lotus likes requesting 0 bytes of memory
         size_in_bytes = std::max<size_t>(1, size_in_bytes);
@@ -307,7 +307,7 @@ namespace Dml
         return TaggedPointer::Pack(device_id, *allocationId, offset);
     }
 
-    void BucketizedBufferAllocator::Free(void* ptr)
+    void DmlReservedResourceSubAllocator::Free(void* ptr)
     {
         ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr);
 
@@ -326,7 +326,7 @@ namespace Dml
         allocations_by_id_.erase(it);
     }
 
-    void BucketizedBufferAllocator::FreeResource(AllocationInfo* allocInfo)
+    void DmlReservedResourceSubAllocator::FreeResource(AllocationInfo* allocInfo)
     {
         // Since this allocator is warapped by ORT's BFC allocator, it's possible that the context is already
         // close at this point if the application is winding down.
@@ -344,7 +344,7 @@ namespace Dml
         }
     }
 
-    absl::optional<uint32_t> BucketizedBufferAllocator::TryReserveAllocationID()
+    absl::optional<uint32_t> DmlReservedResourceSubAllocator::TryReserveAllocationID()
     {
         // The mutex must already be held
         assert(!mutex_.try_lock());
@@ -369,7 +369,7 @@ namespace Dml
         return current_allocation_id_;
     }
 
-    void BucketizedBufferAllocator::ReleaseAllocationID(uint32_t id)
+    void DmlReservedResourceSubAllocator::ReleaseAllocationID(uint32_t id)
     {
         // The mutex must already be held
         assert(!mutex_.try_lock());
@@ -378,7 +378,7 @@ namespace Dml
         free_allocation_ids_.push_back(id);
     }
 
-    D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(
+    D3D12BufferRegion DmlReservedResourceSubAllocator::CreateBufferRegion(
         const TaggedPointer& taggedPointer,
         uint64_t size_in_bytes)
     {
@@ -405,7 +405,7 @@ namespace Dml
             it->second->GetCopyDstResource());
     }
 
-    AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
+    AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
     {
         // We need to access (mutable) state after this point, so we need to lock
         std::unique_lock<std::mutex> lock(mutex_);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
similarity index 92%
rename from onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
rename to onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index 73e7a0a317984..1d7c8704ab7da 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -9,8 +9,8 @@
 
 namespace Dml
 {
-    class BucketizedBufferAllocator;
-    class BucketizedBufferAllocator;
+    class DmlReservedResourceSubAllocator;
+    class DmlReservedResourceSubAllocator;
     struct TaggedPointer;
 
     // An allocator that makes logically contiguous allocations backed by D3D heaps.
@@ -36,7 +36,7 @@ namespace Dml
     // this case it is better make more but smaller allocations (resulting in
     // smaller heaps); this fallback path is only retained as a last resort for
     // older hardware.
-    class BucketizedBufferAllocator
+    class DmlReservedResourceSubAllocator
     {
     public:
         // Maximum size of a heap (in tiles) when allocations are tiled. Each tile
@@ -44,7 +44,7 @@ namespace Dml
         // local video memory fragmentation without requiring lots of heaps.
         static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512;
 
-        BucketizedBufferAllocator(
+        DmlReservedResourceSubAllocator(
             ID3D12Device* device,
             std::shared_ptr<ExecutionContext> context,
             ID3D12CommandQueue* queue,
@@ -70,14 +70,14 @@ namespace Dml
         uint64_t ComputeRequiredSize(size_t size);
         bool TilingEnabled() const { return tiling_enabled_; };
 
-        ~BucketizedBufferAllocator();
+        ~DmlReservedResourceSubAllocator();
 
-        // Constructs a BucketizedBufferAllocator which allocates D3D12 committed resources with the specified heap properties,
+        // Constructs a DmlReservedResourceSubAllocator which allocates D3D12 committed resources with the specified heap properties,
         // resource flags, and initial resource state.
-        BucketizedBufferAllocator(
+        DmlReservedResourceSubAllocator(
             ID3D12Device* device,
             std::shared_ptr<ExecutionContext> context,
-            std::unique_ptr<BucketizedBufferAllocator>&& subAllocator);
+            std::unique_ptr<DmlReservedResourceSubAllocator>&& subAllocator);
 
         void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);
         void* Alloc(size_t size);
@@ -109,7 +109,7 @@ namespace Dml
         size_t m_currentAllocationId = 0;
         uint64_t m_currentResourceId = 0;
         AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled;
-        std::unique_ptr<BucketizedBufferAllocator> m_subAllocator;
+        std::unique_ptr<DmlReservedResourceSubAllocator> m_subAllocator;
 
     #if _DEBUG
         // Useful for debugging; keeps track of all allocations that haven't been freed yet
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index 6a30e6cd1ad56..d6a46e354c769 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -37,6 +37,78 @@ namespace Dml
 
         SetCommandRecorder(&m_dmlRecorder);
 
+        // This type of copy is not common and is only used in rare circumstances. Because a resource
+        // cannot be both in a source and destination state at the same time (without aliasing), we copy
+        // the source resource to an intermediate one, and then copy the intermediate resource to the
+        // destination resource.
+        // TODO (pavignol): Only do the intermediate copy when both resources are the same
+
+        D3D12_HEAP_PROPERTIES heapProperties = {
+            D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0};
+
+        D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER,
+                                            0,
+                                            byteCount,
+                                            1,
+                                            1,
+                                            1,
+                                            DXGI_FORMAT_UNKNOWN,
+                                            {1, 0},
+                                            D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
+                                            D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS};
+
+        ComPtr<ID3D12Resource> intermediateBuffer;
+        ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommittedResource(
+            &heapProperties,
+            D3D12_HEAP_FLAG_NONE,
+            &resourceDesc,
+            D3D12_RESOURCE_STATE_COPY_DEST,
+            nullptr,
+            IID_GRAPHICS_PPV_ARGS(intermediateBuffer.GetAddressOf())));
+
+        std::vector<D3D12_RESOURCE_BARRIER> barriers;
+
+        if (!(srcState & D3D12_RESOURCE_STATE_COPY_SOURCE))
+        {
+            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(srcBuffer, srcState, D3D12_RESOURCE_STATE_COPY_SOURCE));
+            m_dmlRecorder.ResourceBarrier(barriers);
+        }
+
+        m_dmlRecorder.CopyBufferRegion(intermediateBuffer.Get(), 0, srcBuffer, srcOffset, byteCount);
+
+        // Reset src barrier state
+        for (auto& barrier : barriers)
+        {
+            std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter);
+        }
+
+        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(intermediateBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE));
+
+        if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST))
+        {
+            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, dstState, D3D12_RESOURCE_STATE_COPY_DEST));
+        }
+
+        m_dmlRecorder.ResourceBarrier(barriers);
+        m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, intermediateBuffer.Get(), 0, byteCount);
+
+        barriers.clear();
+
+        // Reset dst barrier state
+        if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST))
+        {
+            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState));
+        }
+
+        // Since this copy may write to GPU memory, we also need to perform an aliasing barrier
+        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
+        m_dmlRecorder.ResourceBarrier(barriers);
+
+        // Keep the intermediate buffer alive until we're done with it
+        QueueReference(intermediateBuffer.Get());
+
+
+/*
         std::vector<D3D12_RESOURCE_BARRIER> barriers;
 
         if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST))
@@ -65,6 +137,7 @@ namespace Dml
         // aliasing barrier
         barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
+*/
     }
 
     void ExecutionContext::FillBufferWithPattern(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index afe90d9a25f1a..f72035f5e5fda 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -8,7 +8,7 @@
 #include "PooledUploadHeap.h"
 #include "ReadbackHeap.h"
 #include "ExecutionContext.h"
-#include "BucketizedBufferAllocator.h"
+#include "DmlReservedResourceSubAllocator.h"
 #include "DmlCpuAllocator.h"
 #include "MLOperatorAuthorImpl.h"
 #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h"
@@ -190,7 +190,7 @@ namespace Dml
     std::vector<onnxruntime::AllocatorPtr> ExecutionProviderImpl::CreatePreferredAllocators() {
         if (!m_gpuAllocator)
         {
-            auto subAllocator = std::make_shared<BucketizedBufferAllocator>(
+            auto subAllocator = std::make_shared<DmlReservedResourceSubAllocator>(
                 m_d3d12Device.Get(),
                 m_context, // TODO(leca): REVIEW: Will it cause memory issue when m_context is released in EP while alloc is released in sessionState?
                 m_queue.Get(),
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index a959042dab32c..ad208ea830ae5 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -24,7 +24,7 @@ namespace Dml
     class PooledUploadHeap;
     class ReadbackHeap;
     class ExecutionContext;
-    class BucketizedBufferAllocator;
+    class DmlReservedResourceSubAllocator;
     class DmlCpuAllocator;
     class ExecutionProvider;
     class DmlGpuAllocator;
@@ -181,7 +181,7 @@ namespace Dml
         std::shared_ptr<ExecutionContext> m_context;
         std::unique_ptr<PooledUploadHeap> m_uploadHeap;
         std::unique_ptr<ReadbackHeap> m_readbackHeap;
-        std::shared_ptr<BucketizedBufferAllocator> m_subAllocator;
+        std::shared_ptr<DmlReservedResourceSubAllocator> m_subAllocator;
         std::shared_ptr<onnxruntime::IAllocator> m_bfcAllocator;
         std::shared_ptr<DmlGpuAllocator> m_gpuAllocator;
         std::shared_ptr<DmlExternalGpuAllocator> m_externalGpuAllocator;

From e0569c593e11ff577c27f8b1d620ac0b5ab919b3 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 11 Jul 2023 17:58:29 -0700
Subject: [PATCH 41/76] Remove aliasing

---
 .../src/DmlAllocationInfo.h                   | 10 ---
 .../DmlExecutionProvider/src/DmlBuffer.cpp    | 10 ---
 .../dml/DmlExecutionProvider/src/DmlBuffer.h  |  3 +-
 .../src/DmlBufferRegion.cpp                   | 76 +++--------------
 .../src/DmlBufferRegion.h                     | 34 ++------
 .../src/DmlCommandRecorder.cpp                | 15 +---
 .../src/DmlCommittedResourceWrapper.h         |  6 --
 .../src/DmlGraphFusionHelper.cpp              |  2 +-
 .../src/DmlReservedResourceSubAllocator.cpp   |  4 +-
 .../src/DmlReservedResourceSubAllocator.h     |  4 +-
 .../src/DmlReservedResourceWrapper.h          |  6 --
 .../src/DmlResourceWrapper.h                  |  6 +-
 .../src/ExecutionContext.cpp                  |  3 -
 .../src/ExecutionProvider.cpp                 | 78 +++++-------------
 .../src/FusedGraphKernel.cpp                  |  4 +-
 .../src/MLOperatorAuthorImpl.cpp              | 12 +--
 .../src/MLOperatorAuthorImpl.h                |  4 +-
 .../src/Operators/DmlDFT.h                    | 81 ++++++++++++++++++-
 .../src/Operators/DmlGridSample.h             | 60 ++++++++++++--
 .../src/Operators/DmlSTFT.h                   | 22 ++++-
 .../DmlExecutionProvider/src/ReadbackHeap.cpp |  4 +-
 .../DmlExecutionProvider/src/ReadbackHeap.h   |  2 +-
 22 files changed, 201 insertions(+), 245 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
index 546a42342a2a0..7c11358bb106d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
@@ -36,16 +36,6 @@ namespace Dml
             return m_resourceWrapper->GetUavResource();
         }
 
-        ID3D12Resource* GetCopySrcResource() const
-        {
-            return m_resourceWrapper->GetCopySrcResource();
-        }
-
-        ID3D12Resource* GetCopyDstResource() const
-        {
-            return m_resourceWrapper->GetCopyDstResource();
-        }
-
         ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
         {
             return std::move(m_resourceWrapper);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
index c5fa576d24a0f..464ce26c16f54 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
@@ -48,16 +48,6 @@ ID3D12Resource* DmlBuffer::ResourceInUavState() const
     return buffer_region_.ResourceInUavState();
 }
 
-ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const
-{
-    return buffer_region_.ResourceInCopySrcState();
-}
-
-ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const
-{
-    return buffer_region_.ResourceInCopyDstState();
-}
-
 uint64_t DmlBuffer::Offset() const
 {
     return buffer_region_ ? buffer_region_.Offset() : 0;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
index b98ae727e1a65..4b0dd58ce4467 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
@@ -27,9 +27,8 @@ class DmlBuffer
     DmlBuffer(DmlBuffer&&);
     DmlBuffer& operator=(DmlBuffer&&);
 
+    // TODO (pavignol): Rename to Resource()
     ID3D12Resource* ResourceInUavState() const;
-    ID3D12Resource* ResourceInCopySrcState() const;
-    ID3D12Resource* ResourceInCopyDstState() const;
     uint64_t Offset() const;
     uint64_t SizeInBytes() const;
     const D3D12BufferRegion& Region() const { return buffer_region_; }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
index 3240042b5b6a6..c33cc5491c7f0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
@@ -7,114 +7,66 @@
 namespace Dml
 {
 
-    D3D12BufferRegion::D3D12BufferRegion(
-        uint64_t offset,
-        uint64_t size_in_bytes,
-        ID3D12Resource* resource_uav_state,
-        ID3D12Resource* resource_copy_src_state,
-        ID3D12Resource* resource_copy_dst_state)
-        : resource_uav_state_(resource_uav_state),
-        resource_copy_src_state_(resource_copy_src_state),
-        resource_copy_dst_state_(resource_copy_dst_state),
+    // Describes `size_in_bytes` bytes of `resource`, starting at `offset`.
+    // Fails with E_INVALIDARG when `resource` is null, the size is zero, or the
+    // range does not fit inside the resource.
+    D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource)
+        : m_resource(resource),
         offset_(offset),
         size_in_bytes_(size_in_bytes)
     {
-        // Get a raw pointer to the first non-null resource passed in. At least one
-        // resource must be provided.
-        first_valid_resource_ = resource_uav_state_;
-        if (!first_valid_resource_)
-        {
-            first_valid_resource_ = resource_copy_src_state_;
-        }
-        if (!first_valid_resource_)
-        {
-            first_valid_resource_ = resource_copy_dst_state_;
-        }
-        ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr);
+        ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr);
 
         // Regions cannot be empty.
         ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0);
 
         // Regions cannot extend beyond the size of the resource.
-        uint64_t buffer_size = first_valid_resource_->GetDesc().Width;
+        uint64_t buffer_size = m_resource->GetDesc().Width;
         ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size);
         ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset);
 
-        // All three resources, if provided, must be identical aside from state.
-        assert(
-            first_valid_resource_->GetDesc().Dimension ==
-            D3D12_RESOURCE_DIMENSION_BUFFER);
-        assert(
-            !resource_uav_state ||
-            (resource_uav_state->GetDesc().Dimension ==
-                D3D12_RESOURCE_DIMENSION_BUFFER &&
-            resource_uav_state->GetDesc().Width == buffer_size));
-        assert(
-            !resource_copy_src_state_ ||
-            (resource_copy_src_state_->GetDesc().Dimension ==
-                D3D12_RESOURCE_DIMENSION_BUFFER &&
-            resource_copy_src_state_->GetDesc().Width == buffer_size));
-        assert(
-            !resource_copy_dst_state_ ||
-            (resource_copy_dst_state_->GetDesc().Dimension ==
-                D3D12_RESOURCE_DIMENSION_BUFFER &&
-            resource_copy_dst_state_->GetDesc().Width == buffer_size));
+        // The resource must be a buffer.
+        assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER);
     }
 
     D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that)
     {
-        std::swap(this->resource_uav_state_, that.resource_uav_state_);
-        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
-        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
+        std::swap(this->m_resource, that.m_resource);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
-        std::swap(this->first_valid_resource_, that.first_valid_resource_);
     }
 
     D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that)
     {
-        std::swap(this->resource_uav_state_, that.resource_uav_state_);
-        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
-        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
+        std::swap(this->m_resource, that.m_resource);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
-        std::swap(this->first_valid_resource_, that.first_valid_resource_);
         return *this;
     }
 
     ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const
     {
-        return resource_uav_state_;
-    }
-
-    ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const
-    {
-        return resource_copy_src_state_;
-    }
-
-    ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const
-    {
-        return resource_copy_dst_state_;
+        return m_resource;
     }
 
     uint64_t D3D12BufferRegion::Offset() const
     {
-        return first_valid_resource_ ? offset_ : 0;
+        return m_resource ? offset_ : 0;
     }
 
     uint64_t D3D12BufferRegion::SizeInBytes() const
     {
-        return first_valid_resource_ ? size_in_bytes_ : 0;
+        return m_resource ? size_in_bytes_ : 0;
     }
 
     DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const
     {
-        if (!resource_uav_state_)
+        if (!m_resource)
         {
             return DML_BUFFER_BINDING{};
         }
 
-        return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_};
+        return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_};
     }
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
index dee01a29fe55f..6c5cb37297caa 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
@@ -19,61 +19,39 @@ namespace Dml
         D3D12BufferRegion(
             uint64_t offset,
             uint64_t size_in_bytes,
-            ID3D12Resource* resource_uav_state,
-            ID3D12Resource* resource_copy_src_state,
-            ID3D12Resource* resource_copy_dst_state);
+            ID3D12Resource* resource);
 
-        // Move-only
+        // Copyable and movable
         D3D12BufferRegion(const D3D12BufferRegion&) = default;
         D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default;
         D3D12BufferRegion(D3D12BufferRegion&&);
         D3D12BufferRegion& operator=(D3D12BufferRegion&&);
-
         ID3D12Resource* ResourceInUavState() const;
 
-        // NOTE: may be any state that is valid as a copy source (COPY_SRC,
-        // GENERIC_READ, or COMMON).
-        ID3D12Resource* ResourceInCopySrcState() const;
-
-        ID3D12Resource* ResourceInCopyDstState() const;
-
         uint64_t Offset() const;
         uint64_t SizeInBytes() const;
 
         DML_BUFFER_BINDING GetBufferBinding() const;
 
-        explicit operator bool() const { return first_valid_resource_ != nullptr; }
+        explicit operator bool() const { return m_resource != nullptr; }
 
         // Creates a subregion at an offset from the start of this region. If no
         // size is provided the region runs to the end of the current region.
-        inline D3D12BufferRegion Subregion(
-            uint64_t offset,
-            uint64_t size_in_bytes = 0) const
+        inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const
         {
             // start of subregion must be within current region
             ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_);
-            size_in_bytes =
-                size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
+            size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
             // end of subregion must be within current region
             ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset);
 
-            return D3D12BufferRegion(
-                offset_ + offset,
-                size_in_bytes,
-                resource_uav_state_,
-                resource_copy_src_state_,
-                resource_copy_dst_state_);
+            return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource);
         }
 
     private:
-        ID3D12Resource* resource_uav_state_ = nullptr;
-        ID3D12Resource* resource_copy_src_state_ = nullptr;
-        ID3D12Resource* resource_copy_dst_state_ = nullptr;
+        ID3D12Resource* m_resource = nullptr;
         uint64_t offset_ = 0;
         uint64_t size_in_bytes_ = 0;
-
-        // Pointer to the first resource above that isn't null.
-        ID3D12Resource* first_valid_resource_ = nullptr;
     };
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index af625334b7720..862884c22b08c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -94,10 +94,8 @@ void DmlCommandRecorder::InitializeOperator(
     if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) ||
         (temporaryResourceSize > 0))
     {
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
+        auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
+        m_currentCommandList->ResourceBarrier(1, &uav);
     }
 }
 
@@ -156,13 +154,8 @@ void DmlCommandRecorder::ExecuteOperator(
     // Barrier all outputs.
     #pragma warning(push)
     #pragma warning(disable: 6387)
-
-    // Barrier all outputs.
-    D3D12_RESOURCE_BARRIER barriers[] = {
-        CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-        CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-    m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
-
+    auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
+    m_currentCommandList->ResourceBarrier(1, &uav);
     #pragma warning(pop)
 }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
index f786cca837f06..4b9c167dfe671 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
@@ -12,12 +12,6 @@ namespace Dml
 
         // Committed resources use the same resource for all states and use barriers to transition between states
         ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); }
-        ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); }
-        ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); }
-
-        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
-        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
-        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
 
     private:
         ComPtr<ID3D12Resource> m_d3d12Resource;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index cd6b241e70d48..dcf6b8607f319 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -103,7 +103,7 @@ namespace DmlGraphFusionHelper
             // The allocation is not pooled
             auto allocInfo = static_cast<AllocationInfo*>(opaqueData);
             *allocId = 0;
-            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
+            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
         }
 
         auto taggedPointer = TaggedPointer::Unpack(opaqueData);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index c82e0a4f5d722..0dc07384ea905 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -400,9 +400,7 @@ namespace Dml
         return D3D12BufferRegion(
             taggedPointer.offset,
             size_in_bytes,
-            it->second->GetUavResource(),
-            it->second->GetCopySrcResource(),
-            it->second->GetCopyDstResource());
+            it->second->GetUavResource());
     }
 
     AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index 1d7c8704ab7da..8049848c8671e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -60,9 +60,7 @@ namespace Dml
         // the ID3D12Resource is cached, so this call typically has a lower cost
         // than a call to ID3D12Device::CreatePlacedResource or
         // CreateReservedResource.
-        D3D12BufferRegion CreateBufferRegion(
-            const TaggedPointer& taggedPointer,
-            uint64_t size_in_bytes);
+        D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes);
 
         AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer);
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
index 68feab568ca45..22f8cbbdc394b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
@@ -18,12 +18,6 @@ namespace Dml
         }
 
         ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); }
-        ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); }
-        ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); }
-
-        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
-        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; }
-        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; }
 
     private:
         DmlHeapAllocation m_allocation;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
index 03e9f762b7eb4..2b1a8e5c726dc 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
@@ -11,12 +11,8 @@ namespace Dml
     DmlResourceWrapper : public IUnknown
     {
     public:
+        // TODO (pavignol): Rename to GetResource()
         virtual ID3D12Resource* GetUavResource() const = 0;
-        virtual ID3D12Resource* GetCopySrcResource() const = 0;
-        virtual ID3D12Resource* GetCopyDstResource() const = 0;
-        virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0;
-        virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0;
-        virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0;
         virtual ~DmlResourceWrapper(){}
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index d6a46e354c769..b5492a1a86ea3 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -100,8 +100,6 @@ namespace Dml
             barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState));
         }
 
-        // Since this copy may write to GPU memory, we also need to perform an aliasing barrier
-        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
 
         // Keep the intermediate buffer alive until we're done with it
@@ -135,7 +133,4 @@ namespace Dml
 
-        // Since this copy may write to GPU memory, we also need to perform an
-        // aliasing barrier
-        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
 */
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index f72035f5e5fda..b06d23adf5886 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -443,15 +443,8 @@ namespace Dml
             // CPU -> GPU copy (upload)
             //
             auto dstBufferRegion = GetBufferForTensor(dst);
-
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? dstBufferRegion.ResourceInUavState()
-                : dstBufferRegion.ResourceInCopyDstState();
-
-            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_DEST;
-
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
+            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t dstOffset = dstBufferRegion.Offset();
             m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes));
             FlushUploadsIfReady();
@@ -462,47 +455,26 @@ namespace Dml
             // GPU -> CPU copy (readback)
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
-
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
+            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t srcOffset = srcBufferRegion.Offset();
             m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState);
         }
         else if (!src->IsCpuData() && !dst->IsCpuData())
         {
-            printf("*****************DmlCommandRecorder::CopyBufferRegion\n");
-
             //
             // GPU -> GPU copy
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
+            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+            const uint64_t srcOffset = srcBufferRegion.Offset();
 
             auto dstBufferRegion = GetBufferForTensor(dst);
-
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? dstBufferRegion.ResourceInUavState()
-                : dstBufferRegion.ResourceInCopyDstState();
-
-            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_DEST;
-
-            const uint64_t srcOffset = srcBufferRegion.Offset();
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
+            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t dstOffset = dstBufferRegion.Offset();
+
             m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes);
         }
         else
@@ -524,7 +496,6 @@ namespace Dml
 
         // Source and destination for batched GPU -> CPU copies
         std::vector<ID3D12Resource*> srcDatas;
-        std::vector<D3D12_RESOURCE_STATES> srcStates;
         std::vector<uint64_t> srcOffsets;
         std::vector<void*> dstDatas;
         std::vector<uint32_t> dataSizesInBytes;
@@ -557,21 +528,16 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(src[i]);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
 
             srcDatas.push_back(srcData);
-            srcStates.push_back(srcState);
             srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
+        // Every source is read through ResourceInUavState(), so one state covers the whole batch
+        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
 
         return S_OK;
         }
@@ -941,9 +907,6 @@ namespace Dml
         std::vector<ID3D12Resource*> srcDatas;
         srcDatas.reserve(src_dst_pairs.size());
 
-        std::vector<D3D12_RESOURCE_STATES> srcStates;
-        srcStates.reserve(src_dst_pairs.size());
-
         std::vector<uint64_t> srcOffsets;
         srcOffsets.reserve(src_dst_pairs.size());
 
@@ -993,21 +956,16 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(&srcWrapper);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
 
             srcDatas.push_back(srcData);
-            srcStates.push_back(srcState);
             srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
+        // Every source is read through ResourceInUavState(), so one state covers the whole batch
+        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
 
         return onnxruntime::common::Status::OK();
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index 35955b113b2c1..b00b8f8e19f52 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -124,9 +124,7 @@ namespace Dml
                         inputBufferRegions[i] = D3D12BufferRegion(
                             0,
                             m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width,
-                            m_nonOwnedGraphInputsFromInitializers[i].Get(),
-                            nullptr,
-                            nullptr);
+                            m_nonOwnedGraphInputsFromInitializers[i].Get());
                     }
                     else if (!m_isInputsUploadedByDmlEP[i])
                     {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 1e7ebdc234c22..dde290f0bce0f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1617,7 +1617,7 @@ namespace Windows::AI::MachineLearning::Adapter
         if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL)
         {
             auto allocInfo = static_cast<Dml::AllocationInfo*>(m_tensorData);
-            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
+            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
         }
 
         auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData);
@@ -1716,8 +1716,6 @@ namespace Windows::AI::MachineLearning::Adapter
         }
     }
 
-    // TODO (pavignol): Fix once we go back to a single resource
-    /*
     void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp)
     {
         if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
@@ -1769,9 +1767,9 @@ namespace Windows::AI::MachineLearning::Adapter
                 }
             }
 
-            for (auto& tempAlloc : m_temporaryAbiAllocations)
+            for (auto& tempBuffer : m_temporaryBuffers)
             {
-                resourcesToTransition.push_back(tempAlloc.Get());
+                resourcesToTransition.push_back(tempBuffer.ResourceInUavState());
             }
 
             m_winmlProvider->TransitionResourcesForOperator(
@@ -1780,7 +1778,6 @@ namespace Windows::AI::MachineLearning::Adapter
                 resourcesToTransition.data());
         }
     }
-    */
 
     OpKernelContextWrapper::OpKernelContextWrapper(
         onnxruntime::OpKernelContext* context,
@@ -1828,13 +1825,10 @@ namespace Windows::AI::MachineLearning::Adapter
 
     void OpKernelContextWrapper::Close()
     {
-        // TODO (pavignol): Fix once we go back to a single resource
-        /*
         if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
         {
             TransitionResourcesForOperatorIfRequired(false);
         }
-        */
 
         for (auto& tensors : m_inputTensors)
         {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index 647e0a17d26df..85b6b197fe511 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -512,9 +512,7 @@ class OpKernelContextWrapper : public WRL::Base<IMLOperatorKernelContext, IMLOpe
 
  protected:
     void ClearTempAllocations();
-
-    // TODO (pavignol): Fix once we go back to a single resource
-    // void TransitionResourcesForOperatorIfRequired(bool isBeforeOp);
+    void TransitionResourcesForOperatorIfRequired(bool isBeforeOp);
 
     // Lifetime is managed by the caller and guaranteed to outlive this class
     onnxruntime::OpKernelContext* m_impl = nullptr;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index ed1a6ebe49171..1e3035648adcb 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -727,9 +727,28 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& bluesteinZChirpParams = dftParams.BluesteinZChirpParams;
 
         // Get resources
+        auto inputBufferRegion =  bluesteinZChirpParams.AFFTParams.ResourceLoopList.front().BufferRegion;
+        auto outputBufferRegion = bluesteinZChirpParams.AFFTInverseParams.ResourceLoopList[bluesteinZChirpParams.AFFTInverseParams.OutputIndex].BufferRegion;
         auto zChirpBufferRegion = bluesteinZChirpParams.ZChirp.BufferRegion;
         auto bBufferRegion = bluesteinZChirpParams.B.BufferRegion;
 
+        // Transition resources from common to UAV state
+        D3D12_RESOURCE_BARRIER barriers[2];
+
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        commandList->ResourceBarrier(2, barriers);
+
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_bluesteinChirpRootSignature.Get());
         commandList->SetPipelineState(m_bluesteinChirpPipelineState.Get());
@@ -764,6 +783,21 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         chirpLength *= (m_isInverse ? 1 : -1);
         float scale = isInverse ? 1.f / dftParams.DFTLength : 1.f;
         StockhamFFT(fft_params, true,  chirpLength, scale, commandList);
+
+        // Transition resources to common state
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        commandList->ResourceBarrier(2, barriers);
     }
 
     void StockhamFFT(
@@ -779,8 +813,27 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& loopList = stockhamParams.ResourceLoopList;
 
         // Get input and output resources
+        auto inputBufferRegion = loopList[0].BufferRegion;
+        auto outputBufferRegion = loopList[stockhamParams.OutputIndex].BufferRegion;
         auto windowBufferRegion = dftParams.StockhamParams.Window.BufferRegion;
 
+        // Transition resources from common to UAV state
+        D3D12_RESOURCE_BARRIER barriers[2];
+
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        commandList->ResourceBarrier(2, barriers);
+
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_stockhamFFTRootSignature.Get());
         commandList->SetPipelineState(m_stockhamFFTPipelineState.Get());
@@ -822,6 +875,21 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             std::array<Dml::D3D12BufferRegion, 3> uav_resources = { in, out, window };
             Dispatch(uav_resources, constants, commandList);
         }
+
+        // Transition resources to common state
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        commandList->ResourceBarrier(2, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -838,6 +906,14 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
+        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
+
+        std::transform(
+            bufferRegions.begin(), bufferRegions.end(),
+            uav_barriers,
+            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
+        commandList->ResourceBarrier(TSize, uav_barriers);
+
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -886,10 +962,7 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        commandList->ResourceBarrier(TSize, uav_barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
index 8863bd5362d27..0611c4b7bf7f7 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
@@ -683,6 +683,29 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         Dml::GetDescendingPackedStrides(gridDims, gridStrides);
         Dml::GetDescendingPackedStrides(outputDims, outputStrides);
 
+        // Transition resources from common to UAV state
+        D3D12_RESOURCE_BARRIER barriers[3];
+
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            gridBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        commandList->ResourceBarrier(3, barriers);
+
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get());
         commandList->SetPipelineState(m_gridSamplePipelineState.Get());
@@ -704,10 +727,26 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         std::array<Dml::D3D12BufferRegion, 3> uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion };
         Dispatch(uavBufferRegions, constants, commandList);
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        // Transition resources to common state
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            gridBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        commandList->ResourceBarrier(3, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -724,6 +763,14 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
+        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
+
+        std::transform(
+            bufferRegions.begin(), bufferRegions.end(),
+            uav_barriers,
+            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
+        commandList->ResourceBarrier(TSize, uav_barriers);
+
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -772,10 +819,7 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        commandList->ResourceBarrier(TSize, uav_barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
index cd1f78e2a23a6..945b58965cf2f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
@@ -405,9 +405,15 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
         std::array<DML_BINDING_DESC, 2> inputBindings;
         uint32_t inputBindingsCount = 1;
 
+        // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking
+        // barrierCount is outside the valid range.
+        D3D12_RESOURCE_BARRIER barriers[3];
+        uint32_t barrierCount = 0;
+
         Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal);
         inputBuffers[0] = signalBufferRegion.GetBufferBinding();
         inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] };
+        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         Dml::D3D12BufferRegion windowBufferRegion;
         if (m_framingOperator.hasWindowTensor)
@@ -415,6 +421,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window);
             inputBuffers[1] = windowBufferRegion.GetBufferBinding();
             inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] };
+            barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
             inputBindingsCount++;
         }
 
@@ -422,6 +429,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
 
         DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding();
         DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer };
+        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         m_framingOperator.bindingTable->BindOutputs(1, &outputBinding);
 
@@ -443,16 +451,22 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc);
         }
 
+        // Transition resources COMMON -> UAV
+        commandList->ResourceBarrier(barrierCount, barriers);
+
         m_framingOperator.commandRecorder->RecordDispatch(
             commandList,
             m_framingOperator.op.Get(),
             m_framingOperator.bindingTable.Get()
         );
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        // Transition resources UAV -> COMMON
+        for (uint32_t barrierIndex = 0; barrierIndex < barrierCount; barrierIndex++)
+        {
+            std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter);
+        }
+
+        commandList->ResourceBarrier(barrierCount, barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
index a91886c3b5863..5bb04ba4d30b5 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
@@ -106,7 +106,7 @@ namespace Dml
         gsl::span<const uint32_t > dstSizes,
         gsl::span<ID3D12Resource*> src,
         gsl::span<uint64_t> srcOffsets,
-        gsl::span<const D3D12_RESOURCE_STATES> srcStates)
+        D3D12_RESOURCE_STATES srcState)
     {
         assert(dst.size() == src.size());
         assert(dstSizes.size() == src.size());
@@ -134,7 +134,7 @@ namespace Dml
                 D3D12_RESOURCE_STATE_COPY_DEST,
                 src[i],
                 srcOffsets[i],
-                srcStates[i],
+                srcState,
                 dstSizes[i]);
 
             offset += dstSizes[i];
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
index f888f0a55ac48..4a65ce899d791 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
@@ -28,7 +28,7 @@ namespace Dml
             gsl::span<const uint32_t > dstSizes,
             gsl::span<ID3D12Resource*> src,
             gsl::span<uint64_t> srcOffsets,
-            gsl::span<const D3D12_RESOURCE_STATES> srcStates);
+            D3D12_RESOURCE_STATES srcState);
 
     private:
         void EnsureReadbackHeap(size_t size);

From 568e55039044980bffc4b3776ad73446dc322792 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 11 Jul 2023 22:07:30 -0700
Subject: [PATCH 42/76] Revert "Remove aliasing"

This reverts commit e0569c593e11ff577c27f8b1d620ac0b5ab919b3.
---
 .../src/DmlAllocationInfo.h                   | 10 +++
 .../DmlExecutionProvider/src/DmlBuffer.cpp    | 10 +++
 .../dml/DmlExecutionProvider/src/DmlBuffer.h  |  3 +-
 .../src/DmlBufferRegion.cpp                   | 76 ++++++++++++++---
 .../src/DmlBufferRegion.h                     | 34 ++++++--
 .../src/DmlCommandRecorder.cpp                | 15 +++-
 .../src/DmlCommittedResourceWrapper.h         |  6 ++
 .../src/DmlGraphFusionHelper.cpp              |  2 +-
 .../src/DmlReservedResourceSubAllocator.cpp   |  4 +-
 .../src/DmlReservedResourceSubAllocator.h     |  4 +-
 .../src/DmlReservedResourceWrapper.h          |  6 ++
 .../src/DmlResourceWrapper.h                  |  6 +-
 .../src/ExecutionContext.cpp                  |  3 +
 .../src/ExecutionProvider.cpp                 | 78 +++++++++++++-----
 .../src/FusedGraphKernel.cpp                  |  4 +-
 .../src/MLOperatorAuthorImpl.cpp              | 12 ++-
 .../src/MLOperatorAuthorImpl.h                |  4 +-
 .../src/Operators/DmlDFT.h                    | 81 +------------------
 .../src/Operators/DmlGridSample.h             | 60 ++------------
 .../src/Operators/DmlSTFT.h                   | 22 +----
 .../DmlExecutionProvider/src/ReadbackHeap.cpp |  4 +-
 .../DmlExecutionProvider/src/ReadbackHeap.h   |  2 +-
 22 files changed, 245 insertions(+), 201 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
index 7c11358bb106d..546a42342a2a0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
@@ -36,6 +36,16 @@ namespace Dml
             return m_resourceWrapper->GetUavResource();
         }
 
+        ID3D12Resource* GetCopySrcResource() const
+        {
+            return m_resourceWrapper->GetCopySrcResource();
+        }
+
+        ID3D12Resource* GetCopyDstResource() const
+        {
+            return m_resourceWrapper->GetCopyDstResource();
+        }
+
         ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
         {
             return std::move(m_resourceWrapper);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
index 464ce26c16f54..c5fa576d24a0f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
@@ -48,6 +48,16 @@ ID3D12Resource* DmlBuffer::ResourceInUavState() const
     return buffer_region_.ResourceInUavState();
 }
 
+ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const
+{
+    return buffer_region_.ResourceInCopySrcState();
+}
+
+ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const
+{
+    return buffer_region_.ResourceInCopyDstState();
+}
+
 uint64_t DmlBuffer::Offset() const
 {
     return buffer_region_ ? buffer_region_.Offset() : 0;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
index 4b0dd58ce4467..b98ae727e1a65 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
@@ -27,8 +27,9 @@ class DmlBuffer
     DmlBuffer(DmlBuffer&&);
     DmlBuffer& operator=(DmlBuffer&&);
 
-    // TODO (pavignol): Rename to Resource()
     ID3D12Resource* ResourceInUavState() const;
+    ID3D12Resource* ResourceInCopySrcState() const;
+    ID3D12Resource* ResourceInCopyDstState() const;
     uint64_t Offset() const;
     uint64_t SizeInBytes() const;
     const D3D12BufferRegion& Region() const { return buffer_region_; }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
index c33cc5491c7f0..3240042b5b6a6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
@@ -7,64 +7,114 @@
 namespace Dml
 {
 
-    D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource)
-        : m_resource(resource),
+    D3D12BufferRegion::D3D12BufferRegion(
+        uint64_t offset,
+        uint64_t size_in_bytes,
+        ID3D12Resource* resource_uav_state,
+        ID3D12Resource* resource_copy_src_state,
+        ID3D12Resource* resource_copy_dst_state)
+        : resource_uav_state_(resource_uav_state),
+        resource_copy_src_state_(resource_copy_src_state),
+        resource_copy_dst_state_(resource_copy_dst_state),
         offset_(offset),
         size_in_bytes_(size_in_bytes)
     {
-        ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr);
+        // Get a raw pointer to the first non-null resource passed in. At least one
+        // resource must be provided.
+        first_valid_resource_ = resource_uav_state_;
+        if (!first_valid_resource_)
+        {
+            first_valid_resource_ = resource_copy_src_state_;
+        }
+        if (!first_valid_resource_)
+        {
+            first_valid_resource_ = resource_copy_dst_state_;
+        }
+        ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr);
 
         // Regions cannot be empty.
         ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0);
 
         // Regions cannot extend beyond the size of the resource.
-        uint64_t buffer_size = m_resource->GetDesc().Width;
+        uint64_t buffer_size = first_valid_resource_->GetDesc().Width;
         ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size);
         ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset);
 
         // All three resources, if provided, must be identical aside from state.
-        assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER);
-        assert(m_resource->GetDesc().Width == buffer_size);
+        assert(
+            first_valid_resource_->GetDesc().Dimension ==
+            D3D12_RESOURCE_DIMENSION_BUFFER);
+        assert(
+            !resource_uav_state ||
+            (resource_uav_state->GetDesc().Dimension ==
+                D3D12_RESOURCE_DIMENSION_BUFFER &&
+            resource_uav_state->GetDesc().Width == buffer_size));
+        assert(
+            !resource_copy_src_state_ ||
+            (resource_copy_src_state_->GetDesc().Dimension ==
+                D3D12_RESOURCE_DIMENSION_BUFFER &&
+            resource_copy_src_state_->GetDesc().Width == buffer_size));
+        assert(
+            !resource_copy_dst_state_ ||
+            (resource_copy_dst_state_->GetDesc().Dimension ==
+                D3D12_RESOURCE_DIMENSION_BUFFER &&
+            resource_copy_dst_state_->GetDesc().Width == buffer_size));
     }
 
     D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that)
     {
-        std::swap(this->m_resource, that.m_resource);
+        std::swap(this->resource_uav_state_, that.resource_uav_state_);
+        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
+        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
+        std::swap(this->first_valid_resource_, that.first_valid_resource_);
     }
 
     D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that)
     {
-        std::swap(this->m_resource, that.m_resource);
+        std::swap(this->resource_uav_state_, that.resource_uav_state_);
+        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
+        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
+        std::swap(this->first_valid_resource_, that.first_valid_resource_);
         return *this;
     }
 
     ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const
     {
-        return m_resource;
+        return resource_uav_state_;
+    }
+
+    ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const
+    {
+        return resource_copy_src_state_;
+    }
+
+    ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const
+    {
+        return resource_copy_dst_state_;
     }
 
     uint64_t D3D12BufferRegion::Offset() const
     {
-        return m_resource ? offset_ : 0;
+        return first_valid_resource_ ? offset_ : 0;
     }
 
     uint64_t D3D12BufferRegion::SizeInBytes() const
     {
-        return m_resource ? size_in_bytes_ : 0;
+        return first_valid_resource_ ? size_in_bytes_ : 0;
     }
 
     DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const
     {
-        if (!m_resource)
+        if (!resource_uav_state_)
         {
             return DML_BUFFER_BINDING{};
         }
 
-        return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_};
+        return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_};
     }
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
index 6c5cb37297caa..dee01a29fe55f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
@@ -19,39 +19,61 @@ namespace Dml
         D3D12BufferRegion(
             uint64_t offset,
             uint64_t size_in_bytes,
-            ID3D12Resource* resource);
+            ID3D12Resource* resource_uav_state,
+            ID3D12Resource* resource_copy_src_state,
+            ID3D12Resource* resource_copy_dst_state);
 
         // Move-only
         D3D12BufferRegion(const D3D12BufferRegion&) = default;
         D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default;
         D3D12BufferRegion(D3D12BufferRegion&&);
         D3D12BufferRegion& operator=(D3D12BufferRegion&&);
+
         ID3D12Resource* ResourceInUavState() const;
 
+        // NOTE: may be any state that is valid as a copy source (COPY_SRC,
+        // GENERIC_READ, or COMMON).
+        ID3D12Resource* ResourceInCopySrcState() const;
+
+        ID3D12Resource* ResourceInCopyDstState() const;
+
         uint64_t Offset() const;
         uint64_t SizeInBytes() const;
 
         DML_BUFFER_BINDING GetBufferBinding() const;
 
-        explicit operator bool() const { return m_resource != nullptr; }
+        explicit operator bool() const { return first_valid_resource_ != nullptr; }
 
         // Creates a subregion at an offset from the start of this region. If no
         // size is provided the region runs to the end of the current region.
-        inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const
+        inline D3D12BufferRegion Subregion(
+            uint64_t offset,
+            uint64_t size_in_bytes = 0) const
         {
             // start of subregion must be within current region
             ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_);
-            size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
+            size_in_bytes =
+                size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
             // end of subregion must be within current region
             ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset);
 
-            return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource);
+            return D3D12BufferRegion(
+                offset_ + offset,
+                size_in_bytes,
+                resource_uav_state_,
+                resource_copy_src_state_,
+                resource_copy_dst_state_);
         }
 
     private:
-        ID3D12Resource* m_resource = nullptr;
+        ID3D12Resource* resource_uav_state_ = nullptr;
+        ID3D12Resource* resource_copy_src_state_ = nullptr;
+        ID3D12Resource* resource_copy_dst_state_ = nullptr;
         uint64_t offset_ = 0;
         uint64_t size_in_bytes_ = 0;
+
+        // Pointer to the first resource above that isn't null.
+        ID3D12Resource* first_valid_resource_ = nullptr;
     };
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index 862884c22b08c..af625334b7720 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -94,8 +94,10 @@ void DmlCommandRecorder::InitializeOperator(
     if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) ||
         (temporaryResourceSize > 0))
     {
-        auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
-        m_currentCommandList->ResourceBarrier(1, &uav);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
     }
 }
 
@@ -154,8 +156,13 @@ void DmlCommandRecorder::ExecuteOperator(
     // Barrier all outputs.
     #pragma warning(push)
     #pragma warning(disable: 6387)
-    auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
-    m_currentCommandList->ResourceBarrier(1, &uav);
+
+    // Issue a global UAV barrier together with a global aliasing barrier.
+    D3D12_RESOURCE_BARRIER barriers[] = {
+        CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+        CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+    m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
+
     #pragma warning(pop)
 }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
index 4b9c167dfe671..f786cca837f06 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
@@ -12,6 +12,12 @@ namespace Dml
 
         // Committed resources use the same resource for all states and use barriers to transition between states
         ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); }
+        ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); }
+        ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); }
+
+        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
+        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
+        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
 
     private:
         ComPtr<ID3D12Resource> m_d3d12Resource;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index dcf6b8607f319..cd6b241e70d48 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -103,7 +103,7 @@ namespace DmlGraphFusionHelper
             // The allocation is not pooled
             auto allocInfo = static_cast<AllocationInfo*>(opaqueData);
             *allocId = 0;
-            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
+            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
         }
 
         auto taggedPointer = TaggedPointer::Unpack(opaqueData);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index 0dc07384ea905..c82e0a4f5d722 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -400,7 +400,9 @@ namespace Dml
         return D3D12BufferRegion(
             taggedPointer.offset,
             size_in_bytes,
-            it->second->GetUavResource());
+            it->second->GetUavResource(),
+            it->second->GetCopySrcResource(),
+            it->second->GetCopyDstResource());
     }
 
     AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index 8049848c8671e..1d7c8704ab7da 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -60,7 +60,9 @@ namespace Dml
         // the ID3D12Resource is cached, so this call typically has a lower cost
         // than a call to ID3D12Device::CreatePlacedResource or
         // CreateReservedResource.
-        D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes);
+        D3D12BufferRegion CreateBufferRegion(
+            const TaggedPointer& taggedPointer,
+            uint64_t size_in_bytes);
 
         AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer);
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
index 22f8cbbdc394b..68feab568ca45 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
@@ -18,6 +18,12 @@ namespace Dml
         }
 
         ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); }
+        ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); }
+        ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); }
+
+        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
+        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; }
+        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; }
 
     private:
         DmlHeapAllocation m_allocation;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
index 2b1a8e5c726dc..03e9f762b7eb4 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
@@ -11,8 +11,12 @@ namespace Dml
     DmlResourceWrapper : public IUnknown
     {
     public:
-        // TODO (pavignol): Rename to GetResource()
         virtual ID3D12Resource* GetUavResource() const = 0;
+        virtual ID3D12Resource* GetCopySrcResource() const = 0;
+        virtual ID3D12Resource* GetCopyDstResource() const = 0;
+        virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0;
+        virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0;
+        virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0;
         virtual ~DmlResourceWrapper(){}
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index b5492a1a86ea3..d6a46e354c769 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -100,6 +100,8 @@ namespace Dml
             barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState));
         }
 
+        // Since this copy may write to GPU memory, we also need to perform an aliasing barrier
+        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
 
         // Keep the intermediate buffer alive until we're done with it
@@ -133,6 +135,7 @@ namespace Dml
 
         // Since this copy may write to GPU memory, we also need to perform an
         // aliasing barrier
+        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
 */
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index b06d23adf5886..f72035f5e5fda 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -443,8 +443,15 @@ namespace Dml
             // CPU -> GPU copy (upload)
             //
             auto dstBufferRegion = GetBufferForTensor(dst);
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
-            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? dstBufferRegion.ResourceInUavState()
+                : dstBufferRegion.ResourceInCopyDstState();
+
+            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_DEST;
+
             const uint64_t dstOffset = dstBufferRegion.Offset();
             m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes));
             FlushUploadsIfReady();
@@ -455,26 +462,47 @@ namespace Dml
             // GPU -> CPU copy (readback)
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
+
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+
             const uint64_t srcOffset = srcBufferRegion.Offset();
             m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState);
         }
         else if (!src->IsCpuData() && !dst->IsCpuData())
         {
+            printf("*****************DmlCommandRecorder::CopyBufferRegion\n");
+
             //
             // GPU -> GPU copy
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-            const uint64_t srcOffset = srcBufferRegion.Offset();
+
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
+
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
 
             auto dstBufferRegion = GetBufferForTensor(dst);
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
-            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-            const uint64_t dstOffset = dstBufferRegion.Offset();
 
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? dstBufferRegion.ResourceInUavState()
+                : dstBufferRegion.ResourceInCopyDstState();
+
+            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_DEST;
+
+            const uint64_t srcOffset = srcBufferRegion.Offset();
+            const uint64_t dstOffset = dstBufferRegion.Offset();
             m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes);
         }
         else
@@ -496,6 +524,7 @@ namespace Dml
 
         // Source and destination for batched GPU -> CPU copies
         std::vector<ID3D12Resource*> srcDatas;
+        std::vector<D3D12_RESOURCE_STATES> srcStates;
         std::vector<uint64_t> srcOffsets;
         std::vector<void*> dstDatas;
         std::vector<uint32_t> dataSizesInBytes;
@@ -528,16 +557,21 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(src[i]);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
+
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
 
             srcDatas.push_back(srcData);
+            srcStates.push_back(srcState);
             srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
 
         return S_OK;
         }
@@ -907,6 +941,9 @@ namespace Dml
         std::vector<ID3D12Resource*> srcDatas;
         srcDatas.reserve(src_dst_pairs.size());
 
+        std::vector<D3D12_RESOURCE_STATES> srcStates;
+        srcStates.reserve(src_dst_pairs.size());
+
         std::vector<uint64_t> srcOffsets;
         srcOffsets.reserve(src_dst_pairs.size());
 
@@ -956,16 +993,21 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(&srcWrapper);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
+
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
 
             srcDatas.push_back(srcData);
+            srcStates.push_back(srcState);
             srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
 
         return onnxruntime::common::Status::OK();
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index b00b8f8e19f52..35955b113b2c1 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -124,7 +124,9 @@ namespace Dml
                         inputBufferRegions[i] = D3D12BufferRegion(
                             0,
                             m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width,
-                            m_nonOwnedGraphInputsFromInitializers[i].Get());
+                            m_nonOwnedGraphInputsFromInitializers[i].Get(),
+                            nullptr,
+                            nullptr);
                     }
                     else if (!m_isInputsUploadedByDmlEP[i])
                     {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index dde290f0bce0f..1e7ebdc234c22 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1617,7 +1617,7 @@ namespace Windows::AI::MachineLearning::Adapter
         if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL)
         {
             auto allocInfo = static_cast<Dml::AllocationInfo*>(m_tensorData);
-            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
+            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
         }
 
         auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData);
@@ -1716,6 +1716,8 @@ namespace Windows::AI::MachineLearning::Adapter
         }
     }
 
+    // TODO (pavignol): Fix once we go back to a single resource
+    /*
     void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp)
     {
         if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
@@ -1767,9 +1769,9 @@ namespace Windows::AI::MachineLearning::Adapter
                 }
             }
 
-            for (auto& tempBuffer : m_temporaryBuffers)
+            for (auto& tempAlloc : m_temporaryAbiAllocations)
             {
-                resourcesToTransition.push_back(tempBuffer.ResourceInUavState());
+                resourcesToTransition.push_back(tempAlloc.Get());
             }
 
             m_winmlProvider->TransitionResourcesForOperator(
@@ -1778,6 +1780,7 @@ namespace Windows::AI::MachineLearning::Adapter
                 resourcesToTransition.data());
         }
     }
+    */
 
     OpKernelContextWrapper::OpKernelContextWrapper(
         onnxruntime::OpKernelContext* context,
@@ -1825,10 +1828,13 @@ namespace Windows::AI::MachineLearning::Adapter
 
     void OpKernelContextWrapper::Close()
     {
+        // TODO (pavignol): Fix once we go back to a single resource
+        /*
         if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
         {
             TransitionResourcesForOperatorIfRequired(false);
         }
+        */
 
         for (auto& tensors : m_inputTensors)
         {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index 85b6b197fe511..647e0a17d26df 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -512,7 +512,9 @@ class OpKernelContextWrapper : public WRL::Base<IMLOperatorKernelContext, IMLOpe
 
  protected:
     void ClearTempAllocations();
-    void TransitionResourcesForOperatorIfRequired(bool isBeforeOp);
+
+    // TODO (pavignol): Fix once we go back to a single resource
+    // void TransitionResourcesForOperatorIfRequired(bool isBeforeOp);
 
     // Lifetime is managed by the caller and guaranteed to outlive this class
     onnxruntime::OpKernelContext* m_impl = nullptr;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index 1e3035648adcb..ed1a6ebe49171 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -727,28 +727,9 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& bluesteinZChirpParams = dftParams.BluesteinZChirpParams;
 
         // Get resources
-        auto inputBufferRegion =  bluesteinZChirpParams.AFFTParams.ResourceLoopList.front().BufferRegion;
-        auto outputBufferRegion = bluesteinZChirpParams.AFFTInverseParams.ResourceLoopList[bluesteinZChirpParams.AFFTInverseParams.OutputIndex].BufferRegion;
         auto zChirpBufferRegion = bluesteinZChirpParams.ZChirp.BufferRegion;
         auto bBufferRegion = bluesteinZChirpParams.B.BufferRegion;
 
-        // Transition resources from common to UAV state
-        D3D12_RESOURCE_BARRIER barriers[2];
-
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        commandList->ResourceBarrier(2, barriers);
-
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_bluesteinChirpRootSignature.Get());
         commandList->SetPipelineState(m_bluesteinChirpPipelineState.Get());
@@ -783,21 +764,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         chirpLength *= (m_isInverse ? 1 : -1);
         float scale = isInverse ? 1.f / dftParams.DFTLength : 1.f;
         StockhamFFT(fft_params, true,  chirpLength, scale, commandList);
-
-        // Transition resources to common state
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        commandList->ResourceBarrier(2, barriers);
     }
 
     void StockhamFFT(
@@ -813,27 +779,8 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& loopList = stockhamParams.ResourceLoopList;
 
         // Get input and output resources
-        auto inputBufferRegion = loopList[0].BufferRegion;
-        auto outputBufferRegion = loopList[stockhamParams.OutputIndex].BufferRegion;
         auto windowBufferRegion = dftParams.StockhamParams.Window.BufferRegion;
 
-        // Transition resources from common to UAV state
-        D3D12_RESOURCE_BARRIER barriers[2];
-
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        commandList->ResourceBarrier(2, barriers);
-
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_stockhamFFTRootSignature.Get());
         commandList->SetPipelineState(m_stockhamFFTPipelineState.Get());
@@ -875,21 +822,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             std::array<Dml::D3D12BufferRegion, 3> uav_resources = { in, out, window };
             Dispatch(uav_resources, constants, commandList);
         }
-
-        // Transition resources to common state
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        commandList->ResourceBarrier(2, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -906,14 +838,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
-        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
-
-        std::transform(
-            bufferRegions.begin(), bufferRegions.end(),
-            uav_barriers,
-            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
-        commandList->ResourceBarrier(TSize, uav_barriers);
-
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -962,7 +886,10 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        commandList->ResourceBarrier(TSize, uav_barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
index 0611c4b7bf7f7..8863bd5362d27 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
@@ -683,29 +683,6 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         Dml::GetDescendingPackedStrides(gridDims, gridStrides);
         Dml::GetDescendingPackedStrides(outputDims, outputStrides);
 
-        // Transition resources from common to UAV state
-        D3D12_RESOURCE_BARRIER barriers[3];
-
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            gridBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        commandList->ResourceBarrier(3, barriers);
-
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get());
         commandList->SetPipelineState(m_gridSamplePipelineState.Get());
@@ -727,26 +704,10 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         std::array<Dml::D3D12BufferRegion, 3> uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion };
         Dispatch(uavBufferRegions, constants, commandList);
 
-        // Transition resources to common state
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            gridBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        commandList->ResourceBarrier(3, barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -763,14 +724,6 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
-        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
-
-        std::transform(
-            bufferRegions.begin(), bufferRegions.end(),
-            uav_barriers,
-            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
-        commandList->ResourceBarrier(TSize, uav_barriers);
-
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -819,7 +772,10 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        commandList->ResourceBarrier(TSize, uav_barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
index 945b58965cf2f..cd1f78e2a23a6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
@@ -405,15 +405,9 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
         std::array<DML_BINDING_DESC, 2> inputBindings;
         uint32_t inputBindingsCount = 1;
 
-        // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking
-        // barrierCount is outside the valid range.
-        D3D12_RESOURCE_BARRIER barriers[3];
-        uint32_t barrierCount = 0;
-
         Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal);
         inputBuffers[0] = signalBufferRegion.GetBufferBinding();
         inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] };
-        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         Dml::D3D12BufferRegion windowBufferRegion;
         if (m_framingOperator.hasWindowTensor)
@@ -421,7 +415,6 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window);
             inputBuffers[1] = windowBufferRegion.GetBufferBinding();
             inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] };
-            barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
             inputBindingsCount++;
         }
 
@@ -429,7 +422,6 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
 
         DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding();
         DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer };
-        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         m_framingOperator.bindingTable->BindOutputs(1, &outputBinding);
 
@@ -451,22 +443,16 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc);
         }
 
-        // Transition resources COMMON -> UAV
-        commandList->ResourceBarrier(barrierCount, barriers);
-
         m_framingOperator.commandRecorder->RecordDispatch(
             commandList,
             m_framingOperator.op.Get(),
             m_framingOperator.bindingTable.Get()
         );
 
-        // Transition resources UAV -> COMMON
-        for (uint32_t barrierIndex = 0; barrierIndex < barrierCount; barrierIndex++)
-        {
-            std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter);
-        }
-
-        commandList->ResourceBarrier(barrierCount, barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
index 5bb04ba4d30b5..a91886c3b5863 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
@@ -106,7 +106,7 @@ namespace Dml
         gsl::span<const uint32_t > dstSizes,
         gsl::span<ID3D12Resource*> src,
         gsl::span<uint64_t> srcOffsets,
-        D3D12_RESOURCE_STATES srcState)
+        gsl::span<const D3D12_RESOURCE_STATES> srcStates)
     {
         assert(dst.size() == src.size());
         assert(dstSizes.size() == src.size());
@@ -134,7 +134,7 @@ namespace Dml
                 D3D12_RESOURCE_STATE_COPY_DEST,
                 src[i],
                 srcOffsets[i],
-                srcState,
+                srcStates[i],
                 dstSizes[i]);
 
             offset += dstSizes[i];
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
index 4a65ce899d791..f888f0a55ac48 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
@@ -28,7 +28,7 @@ namespace Dml
             gsl::span<const uint32_t > dstSizes,
             gsl::span<ID3D12Resource*> src,
             gsl::span<uint64_t> srcOffsets,
-            D3D12_RESOURCE_STATES srcState);
+            gsl::span<const D3D12_RESOURCE_STATES> srcStates);
 
     private:
         void EnsureReadbackHeap(size_t size);

From 943ac58d7af0ceaa8aeee2c4f85f04d6bfcba1f8 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 12 Jul 2023 07:59:45 -0700
Subject: [PATCH 43/76] Re-add "Remove aliasing"

This reverts commit 568e55039044980bffc4b3776ad73446dc322792.
---
 .../src/DmlAllocationInfo.h                   | 10 ---
 .../DmlExecutionProvider/src/DmlBuffer.cpp    | 10 ---
 .../dml/DmlExecutionProvider/src/DmlBuffer.h  |  3 +-
 .../src/DmlBufferRegion.cpp                   | 76 +++--------------
 .../src/DmlBufferRegion.h                     | 34 ++------
 .../src/DmlCommandRecorder.cpp                | 15 +---
 .../src/DmlCommittedResourceWrapper.h         |  6 --
 .../src/DmlGraphFusionHelper.cpp              |  2 +-
 .../src/DmlReservedResourceSubAllocator.cpp   |  4 +-
 .../src/DmlReservedResourceSubAllocator.h     |  4 +-
 .../src/DmlReservedResourceWrapper.h          |  6 --
 .../src/DmlResourceWrapper.h                  |  6 +-
 .../src/ExecutionContext.cpp                  |  3 -
 .../src/ExecutionProvider.cpp                 | 78 +++++-------------
 .../src/FusedGraphKernel.cpp                  |  4 +-
 .../src/MLOperatorAuthorImpl.cpp              | 12 +--
 .../src/MLOperatorAuthorImpl.h                |  4 +-
 .../src/Operators/DmlDFT.h                    | 81 ++++++++++++++++++-
 .../src/Operators/DmlGridSample.h             | 60 ++++++++++++--
 .../src/Operators/DmlSTFT.h                   | 22 ++++-
 .../DmlExecutionProvider/src/ReadbackHeap.cpp |  4 +-
 .../DmlExecutionProvider/src/ReadbackHeap.h   |  2 +-
 22 files changed, 201 insertions(+), 245 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
index 546a42342a2a0..7c11358bb106d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
@@ -36,16 +36,6 @@ namespace Dml
             return m_resourceWrapper->GetUavResource();
         }
 
-        ID3D12Resource* GetCopySrcResource() const
-        {
-            return m_resourceWrapper->GetCopySrcResource();
-        }
-
-        ID3D12Resource* GetCopyDstResource() const
-        {
-            return m_resourceWrapper->GetCopyDstResource();
-        }
-
         ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
         {
             return std::move(m_resourceWrapper);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
index c5fa576d24a0f..464ce26c16f54 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
@@ -48,16 +48,6 @@ ID3D12Resource* DmlBuffer::ResourceInUavState() const
     return buffer_region_.ResourceInUavState();
 }
 
-ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const
-{
-    return buffer_region_.ResourceInCopySrcState();
-}
-
-ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const
-{
-    return buffer_region_.ResourceInCopyDstState();
-}
-
 uint64_t DmlBuffer::Offset() const
 {
     return buffer_region_ ? buffer_region_.Offset() : 0;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
index b98ae727e1a65..4b0dd58ce4467 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
@@ -27,9 +27,8 @@ class DmlBuffer
     DmlBuffer(DmlBuffer&&);
     DmlBuffer& operator=(DmlBuffer&&);
 
+    // TODO (pavignol): Rename to Resource()
     ID3D12Resource* ResourceInUavState() const;
-    ID3D12Resource* ResourceInCopySrcState() const;
-    ID3D12Resource* ResourceInCopyDstState() const;
     uint64_t Offset() const;
     uint64_t SizeInBytes() const;
     const D3D12BufferRegion& Region() const { return buffer_region_; }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
index 3240042b5b6a6..c33cc5491c7f0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
@@ -7,114 +7,64 @@
 namespace Dml
 {
 
-    D3D12BufferRegion::D3D12BufferRegion(
-        uint64_t offset,
-        uint64_t size_in_bytes,
-        ID3D12Resource* resource_uav_state,
-        ID3D12Resource* resource_copy_src_state,
-        ID3D12Resource* resource_copy_dst_state)
-        : resource_uav_state_(resource_uav_state),
-        resource_copy_src_state_(resource_copy_src_state),
-        resource_copy_dst_state_(resource_copy_dst_state),
+    // Throws E_INVALIDARG if resource is null, the region is empty, or it extends past the end of the resource.
+    D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource)
+        : m_resource(resource),
         offset_(offset),
         size_in_bytes_(size_in_bytes)
     {
-        // Get a raw pointer to the first non-null resource passed in. At least one
-        // resource must be provided.
-        first_valid_resource_ = resource_uav_state_;
-        if (!first_valid_resource_)
-        {
-            first_valid_resource_ = resource_copy_src_state_;
-        }
-        if (!first_valid_resource_)
-        {
-            first_valid_resource_ = resource_copy_dst_state_;
-        }
-        ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr);
+        ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr);
 
         // Regions cannot be empty.
         ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0);
 
         // Regions cannot extend beyond the size of the resource.
-        uint64_t buffer_size = first_valid_resource_->GetDesc().Width;
+        uint64_t buffer_size = m_resource->GetDesc().Width;
         ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size);
         ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset);
 
-        // All three resources, if provided, must be identical aside from state.
-        assert(
-            first_valid_resource_->GetDesc().Dimension ==
-            D3D12_RESOURCE_DIMENSION_BUFFER);
-        assert(
-            !resource_uav_state ||
-            (resource_uav_state->GetDesc().Dimension ==
-                D3D12_RESOURCE_DIMENSION_BUFFER &&
-            resource_uav_state->GetDesc().Width == buffer_size));
-        assert(
-            !resource_copy_src_state_ ||
-            (resource_copy_src_state_->GetDesc().Dimension ==
-                D3D12_RESOURCE_DIMENSION_BUFFER &&
-            resource_copy_src_state_->GetDesc().Width == buffer_size));
-        assert(
-            !resource_copy_dst_state_ ||
-            (resource_copy_dst_state_->GetDesc().Dimension ==
-                D3D12_RESOURCE_DIMENSION_BUFFER &&
-            resource_copy_dst_state_->GetDesc().Width == buffer_size));
+        // The backing resource must be a buffer.
+        assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER);
     }
 
     D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that)
     {
-        std::swap(this->resource_uav_state_, that.resource_uav_state_);
-        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
-        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
+        std::swap(this->m_resource, that.m_resource);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
-        std::swap(this->first_valid_resource_, that.first_valid_resource_);
     }
 
     D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that)
     {
-        std::swap(this->resource_uav_state_, that.resource_uav_state_);
-        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
-        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
+        std::swap(this->m_resource, that.m_resource);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
-        std::swap(this->first_valid_resource_, that.first_valid_resource_);
         return *this;
     }
 
     ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const
     {
-        return resource_uav_state_;
-    }
-
-    ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const
-    {
-        return resource_copy_src_state_;
-    }
-
-    ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const
-    {
-        return resource_copy_dst_state_;
+        return m_resource;
     }
 
     uint64_t D3D12BufferRegion::Offset() const
     {
-        return first_valid_resource_ ? offset_ : 0;
+        return m_resource ? offset_ : 0;
     }
 
     uint64_t D3D12BufferRegion::SizeInBytes() const
     {
-        return first_valid_resource_ ? size_in_bytes_ : 0;
+        return m_resource ? size_in_bytes_ : 0;
     }
 
     DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const
     {
-        if (!resource_uav_state_)
+        if (!m_resource)
         {
             return DML_BUFFER_BINDING{};
         }
 
-        return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_};
+        return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_};
     }
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
index dee01a29fe55f..6c5cb37297caa 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
@@ -19,61 +19,39 @@ namespace Dml
         D3D12BufferRegion(
             uint64_t offset,
             uint64_t size_in_bytes,
-            ID3D12Resource* resource_uav_state,
-            ID3D12Resource* resource_copy_src_state,
-            ID3D12Resource* resource_copy_dst_state);
+            ID3D12Resource* resource);
 
-        // Move-only
+        // Copyable and movable
         D3D12BufferRegion(const D3D12BufferRegion&) = default;
         D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default;
         D3D12BufferRegion(D3D12BufferRegion&&);
         D3D12BufferRegion& operator=(D3D12BufferRegion&&);
-
         ID3D12Resource* ResourceInUavState() const;
 
-        // NOTE: may be any state that is valid as a copy source (COPY_SRC,
-        // GENERIC_READ, or COMMON).
-        ID3D12Resource* ResourceInCopySrcState() const;
-
-        ID3D12Resource* ResourceInCopyDstState() const;
-
         uint64_t Offset() const;
         uint64_t SizeInBytes() const;
 
         DML_BUFFER_BINDING GetBufferBinding() const;
 
-        explicit operator bool() const { return first_valid_resource_ != nullptr; }
+        explicit operator bool() const { return m_resource != nullptr; }
 
         // Creates a subregion at an offset from the start of this region. If no
         // size is provided the region runs to the end of the current region.
-        inline D3D12BufferRegion Subregion(
-            uint64_t offset,
-            uint64_t size_in_bytes = 0) const
+        inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const
         {
             // start of subregion must be within current region
             ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_);
-            size_in_bytes =
-                size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
+            size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
             // end of subregion must be within current region
             ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset);
 
-            return D3D12BufferRegion(
-                offset_ + offset,
-                size_in_bytes,
-                resource_uav_state_,
-                resource_copy_src_state_,
-                resource_copy_dst_state_);
+            return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource);
         }
 
     private:
-        ID3D12Resource* resource_uav_state_ = nullptr;
-        ID3D12Resource* resource_copy_src_state_ = nullptr;
-        ID3D12Resource* resource_copy_dst_state_ = nullptr;
+        ID3D12Resource* m_resource = nullptr;
         uint64_t offset_ = 0;
         uint64_t size_in_bytes_ = 0;
-
-        // Pointer to the first resource above that isn't null.
-        ID3D12Resource* first_valid_resource_ = nullptr;
     };
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index af625334b7720..862884c22b08c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -94,10 +94,8 @@ void DmlCommandRecorder::InitializeOperator(
     if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) ||
         (temporaryResourceSize > 0))
     {
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
+        auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
+        m_currentCommandList->ResourceBarrier(1, &uav);
     }
 }
 
@@ -156,13 +154,8 @@ void DmlCommandRecorder::ExecuteOperator(
     // Barrier all outputs.
     #pragma warning(push)
     #pragma warning(disable: 6387)
-
-    // Barrier all outputs.
-    D3D12_RESOURCE_BARRIER barriers[] = {
-        CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-        CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-    m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
-
+    auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
+    m_currentCommandList->ResourceBarrier(1, &uav);
     #pragma warning(pop)
 }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
index f786cca837f06..4b9c167dfe671 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
@@ -12,12 +12,6 @@ namespace Dml
 
         // Committed resources use the same resource for all states and use barriers to transition between states
         ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); }
-        ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); }
-        ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); }
-
-        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
-        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
-        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
 
     private:
         ComPtr<ID3D12Resource> m_d3d12Resource;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index cd6b241e70d48..dcf6b8607f319 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -103,7 +103,7 @@ namespace DmlGraphFusionHelper
             // The allocation is not pooled
             auto allocInfo = static_cast<AllocationInfo*>(opaqueData);
             *allocId = 0;
-            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
+            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
         }
 
         auto taggedPointer = TaggedPointer::Unpack(opaqueData);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index c82e0a4f5d722..0dc07384ea905 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -400,9 +400,7 @@ namespace Dml
         return D3D12BufferRegion(
             taggedPointer.offset,
             size_in_bytes,
-            it->second->GetUavResource(),
-            it->second->GetCopySrcResource(),
-            it->second->GetCopyDstResource());
+            it->second->GetUavResource());
     }
 
     AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index 1d7c8704ab7da..8049848c8671e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -60,9 +60,7 @@ namespace Dml
         // the ID3D12Resource is cached, so this call typically has a lower cost
         // than a call to ID3D12Device::CreatePlacedResource or
         // CreateReservedResource.
-        D3D12BufferRegion CreateBufferRegion(
-            const TaggedPointer& taggedPointer,
-            uint64_t size_in_bytes);
+        D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes);
 
         AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer);
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
index 68feab568ca45..22f8cbbdc394b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
@@ -18,12 +18,6 @@ namespace Dml
         }
 
         ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); }
-        ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); }
-        ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); }
-
-        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
-        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; }
-        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; }
 
     private:
         DmlHeapAllocation m_allocation;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
index 03e9f762b7eb4..2b1a8e5c726dc 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
@@ -11,12 +11,8 @@ namespace Dml
     DmlResourceWrapper : public IUnknown
     {
     public:
+        // TODO (pavignol): Rename to GetResource()
         virtual ID3D12Resource* GetUavResource() const = 0;
-        virtual ID3D12Resource* GetCopySrcResource() const = 0;
-        virtual ID3D12Resource* GetCopyDstResource() const = 0;
-        virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0;
-        virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0;
-        virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0;
         virtual ~DmlResourceWrapper(){}
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index d6a46e354c769..b5492a1a86ea3 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -100,8 +100,6 @@ namespace Dml
             barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState));
         }
 
-        // Since this copy may write to GPU memory, we also need to perform an aliasing barrier
-        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
 
         // Keep the intermediate buffer alive until we're done with it
@@ -135,7 +133,6 @@ namespace Dml
 
         // Since this copy may write to GPU memory, we also need to perform an
         // aliasing barrier
-        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
 */
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index f72035f5e5fda..b06d23adf5886 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -443,15 +443,8 @@ namespace Dml
             // CPU -> GPU copy (upload)
             //
             auto dstBufferRegion = GetBufferForTensor(dst);
-
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? dstBufferRegion.ResourceInUavState()
-                : dstBufferRegion.ResourceInCopyDstState();
-
-            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_DEST;
-
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
+            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t dstOffset = dstBufferRegion.Offset();
             m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes));
             FlushUploadsIfReady();
@@ -462,47 +455,26 @@ namespace Dml
             // GPU -> CPU copy (readback)
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
-
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
+            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t srcOffset = srcBufferRegion.Offset();
             m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState);
         }
         else if (!src->IsCpuData() && !dst->IsCpuData())
         {
-            printf("*****************DmlCommandRecorder::CopyBufferRegion\n");
-
             //
             // GPU -> GPU copy
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
+            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+            const uint64_t srcOffset = srcBufferRegion.Offset();
 
             auto dstBufferRegion = GetBufferForTensor(dst);
-
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? dstBufferRegion.ResourceInUavState()
-                : dstBufferRegion.ResourceInCopyDstState();
-
-            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_DEST;
-
-            const uint64_t srcOffset = srcBufferRegion.Offset();
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
+            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t dstOffset = dstBufferRegion.Offset();
+
             m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes);
         }
         else
@@ -524,7 +496,6 @@ namespace Dml
 
         // Source and destination for batched GPU -> CPU copies
         std::vector<ID3D12Resource*> srcDatas;
-        std::vector<D3D12_RESOURCE_STATES> srcStates;
         std::vector<uint64_t> srcOffsets;
         std::vector<void*> dstDatas;
         std::vector<uint32_t> dataSizesInBytes;
@@ -557,21 +528,16 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(src[i]);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
 
             srcDatas.push_back(srcData);
-            srcStates.push_back(srcState);
             srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
+        // Every source above comes from ResourceInUavState(), so one shared state covers the whole batch.
+        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
 
         return S_OK;
         }
@@ -941,9 +907,6 @@ namespace Dml
         std::vector<ID3D12Resource*> srcDatas;
         srcDatas.reserve(src_dst_pairs.size());
 
-        std::vector<D3D12_RESOURCE_STATES> srcStates;
-        srcStates.reserve(src_dst_pairs.size());
-
         std::vector<uint64_t> srcOffsets;
         srcOffsets.reserve(src_dst_pairs.size());
 
@@ -993,21 +956,16 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(&srcWrapper);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
 
             srcDatas.push_back(srcData);
-            srcStates.push_back(srcState);
             srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
+        // Every source above comes from ResourceInUavState(), so one shared state covers the whole batch.
+        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
 
         return onnxruntime::common::Status::OK();
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index 35955b113b2c1..b00b8f8e19f52 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -124,9 +124,7 @@ namespace Dml
                         inputBufferRegions[i] = D3D12BufferRegion(
                             0,
                             m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width,
-                            m_nonOwnedGraphInputsFromInitializers[i].Get(),
-                            nullptr,
-                            nullptr);
+                            m_nonOwnedGraphInputsFromInitializers[i].Get());
                     }
                     else if (!m_isInputsUploadedByDmlEP[i])
                     {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 1e7ebdc234c22..dde290f0bce0f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1617,7 +1617,7 @@ namespace Windows::AI::MachineLearning::Adapter
         if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL)
         {
             auto allocInfo = static_cast<Dml::AllocationInfo*>(m_tensorData);
-            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
+            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
         }
 
         auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData);
@@ -1716,8 +1716,6 @@ namespace Windows::AI::MachineLearning::Adapter
         }
     }
 
-    // TODO (pavignol): Fix once we go back to a single resource
-    /*
     void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp)
     {
         if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
@@ -1769,9 +1767,9 @@ namespace Windows::AI::MachineLearning::Adapter
                 }
             }
 
-            for (auto& tempAlloc : m_temporaryAbiAllocations)
+            for (auto& tempBuffer : m_temporaryBuffers)
             {
-                resourcesToTransition.push_back(tempAlloc.Get());
+                resourcesToTransition.push_back(tempBuffer.ResourceInUavState());
             }
 
             m_winmlProvider->TransitionResourcesForOperator(
@@ -1780,7 +1778,6 @@ namespace Windows::AI::MachineLearning::Adapter
                 resourcesToTransition.data());
         }
     }
-    */
 
     OpKernelContextWrapper::OpKernelContextWrapper(
         onnxruntime::OpKernelContext* context,
@@ -1828,13 +1825,10 @@ namespace Windows::AI::MachineLearning::Adapter
 
     void OpKernelContextWrapper::Close()
     {
-        // TODO (pavignol): Fix once we go back to a single resource
-        /*
         if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
         {
             TransitionResourcesForOperatorIfRequired(false);
         }
-        */
 
         for (auto& tensors : m_inputTensors)
         {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index 647e0a17d26df..85b6b197fe511 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -512,9 +512,7 @@ class OpKernelContextWrapper : public WRL::Base<IMLOperatorKernelContext, IMLOpe
 
  protected:
     void ClearTempAllocations();
-
-    // TODO (pavignol): Fix once we go back to a single resource
-    // void TransitionResourcesForOperatorIfRequired(bool isBeforeOp);
+    void TransitionResourcesForOperatorIfRequired(bool isBeforeOp);
 
     // Lifetime is managed by the caller and guaranteed to outlive this class
     onnxruntime::OpKernelContext* m_impl = nullptr;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index ed1a6ebe49171..1e3035648adcb 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -727,9 +727,28 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& bluesteinZChirpParams = dftParams.BluesteinZChirpParams;
 
         // Get resources
+        auto inputBufferRegion =  bluesteinZChirpParams.AFFTParams.ResourceLoopList.front().BufferRegion;
+        auto outputBufferRegion = bluesteinZChirpParams.AFFTInverseParams.ResourceLoopList[bluesteinZChirpParams.AFFTInverseParams.OutputIndex].BufferRegion;
         auto zChirpBufferRegion = bluesteinZChirpParams.ZChirp.BufferRegion;
         auto bBufferRegion = bluesteinZChirpParams.B.BufferRegion;
 
+        // Transition resources from common to UAV state
+        D3D12_RESOURCE_BARRIER barriers[2];
+
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        commandList->ResourceBarrier(2, barriers);
+
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_bluesteinChirpRootSignature.Get());
         commandList->SetPipelineState(m_bluesteinChirpPipelineState.Get());
@@ -764,6 +783,21 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         chirpLength *= (m_isInverse ? 1 : -1);
         float scale = isInverse ? 1.f / dftParams.DFTLength : 1.f;
         StockhamFFT(fft_params, true,  chirpLength, scale, commandList);
+
+        // Transition resources to common state
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        commandList->ResourceBarrier(2, barriers);
     }
 
     void StockhamFFT(
@@ -779,8 +813,27 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& loopList = stockhamParams.ResourceLoopList;
 
         // Get input and output resources
+        auto inputBufferRegion = loopList[0].BufferRegion;
+        auto outputBufferRegion = loopList[stockhamParams.OutputIndex].BufferRegion;
         auto windowBufferRegion = dftParams.StockhamParams.Window.BufferRegion;
 
+        // Transition resources from common to UAV state
+        D3D12_RESOURCE_BARRIER barriers[2];
+
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        commandList->ResourceBarrier(2, barriers);
+
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_stockhamFFTRootSignature.Get());
         commandList->SetPipelineState(m_stockhamFFTPipelineState.Get());
@@ -822,6 +875,21 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             std::array<Dml::D3D12BufferRegion, 3> uav_resources = { in, out, window };
             Dispatch(uav_resources, constants, commandList);
         }
+
+        // Transition resources to common state
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        commandList->ResourceBarrier(2, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -838,6 +906,14 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
+        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
+
+        std::transform(
+            bufferRegions.begin(), bufferRegions.end(),
+            uav_barriers,
+            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
+        commandList->ResourceBarrier(TSize, uav_barriers);
+
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -886,10 +962,7 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        commandList->ResourceBarrier(TSize, uav_barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
index 8863bd5362d27..0611c4b7bf7f7 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
@@ -683,6 +683,29 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         Dml::GetDescendingPackedStrides(gridDims, gridStrides);
         Dml::GetDescendingPackedStrides(outputDims, outputStrides);
 
+        // Transition resources from common to UAV state
+        D3D12_RESOURCE_BARRIER barriers[3];
+
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            gridBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        commandList->ResourceBarrier(3, barriers);
+
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get());
         commandList->SetPipelineState(m_gridSamplePipelineState.Get());
@@ -704,10 +727,26 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         std::array<Dml::D3D12BufferRegion, 3> uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion };
         Dispatch(uavBufferRegions, constants, commandList);
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        // Transition resources to common state
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            gridBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        commandList->ResourceBarrier(3, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -724,6 +763,14 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
+        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
+
+        std::transform(
+            bufferRegions.begin(), bufferRegions.end(),
+            uav_barriers,
+            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
+        commandList->ResourceBarrier(TSize, uav_barriers);
+
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -772,10 +819,7 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        commandList->ResourceBarrier(TSize, uav_barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
index cd1f78e2a23a6..945b58965cf2f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
@@ -405,9 +405,15 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
         std::array<DML_BINDING_DESC, 2> inputBindings;
         uint32_t inputBindingsCount = 1;
 
+        // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking
+        // barrierCount is outside the valid range.
+        D3D12_RESOURCE_BARRIER barriers[3];
+        uint32_t barrierCount = 0;
+
         Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal);
         inputBuffers[0] = signalBufferRegion.GetBufferBinding();
         inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] };
+        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         Dml::D3D12BufferRegion windowBufferRegion;
         if (m_framingOperator.hasWindowTensor)
@@ -415,6 +421,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window);
             inputBuffers[1] = windowBufferRegion.GetBufferBinding();
             inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] };
+            barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
             inputBindingsCount++;
         }
 
@@ -422,6 +429,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
 
         DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding();
         DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer };
+        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         m_framingOperator.bindingTable->BindOutputs(1, &outputBinding);
 
@@ -443,16 +451,22 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc);
         }
 
+        // Transition resources COMMON -> UAV
+        commandList->ResourceBarrier(barrierCount, barriers);
+
         m_framingOperator.commandRecorder->RecordDispatch(
             commandList,
             m_framingOperator.op.Get(),
             m_framingOperator.bindingTable.Get()
         );
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        // Transition resources UAV -> COMMON
+        for (uint32_t barrierIndex = 0; barrierIndex < barrierCount; barrierIndex++)
+        {
+            std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter);
+        }
+
+        commandList->ResourceBarrier(barrierCount, barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
index a91886c3b5863..5bb04ba4d30b5 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
@@ -106,7 +106,7 @@ namespace Dml
         gsl::span<const uint32_t > dstSizes,
         gsl::span<ID3D12Resource*> src,
         gsl::span<uint64_t> srcOffsets,
-        gsl::span<const D3D12_RESOURCE_STATES> srcStates)
+        D3D12_RESOURCE_STATES srcState)
     {
         assert(dst.size() == src.size());
         assert(dstSizes.size() == src.size());
@@ -134,7 +134,7 @@ namespace Dml
                 D3D12_RESOURCE_STATE_COPY_DEST,
                 src[i],
                 srcOffsets[i],
-                srcStates[i],
+                srcState,
                 dstSizes[i]);
 
             offset += dstSizes[i];
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
index f888f0a55ac48..4a65ce899d791 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
@@ -28,7 +28,7 @@ namespace Dml
             gsl::span<const uint32_t > dstSizes,
             gsl::span<ID3D12Resource*> src,
             gsl::span<uint64_t> srcOffsets,
-            gsl::span<const D3D12_RESOURCE_STATES> srcStates);
+            D3D12_RESOURCE_STATES srcState);
 
     private:
         void EnsureReadbackHeap(size_t size);

From 587489d84711cdc47da2b2cda5952fe9bd2cb3c7 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 12 Jul 2023 08:18:16 -0700
Subject: [PATCH 44/76] Revert "Re-add "Remove aliasing""

This reverts commit 943ac58d7af0ceaa8aeee2c4f85f04d6bfcba1f8.
---
 .../src/DmlAllocationInfo.h                   | 10 +++
 .../DmlExecutionProvider/src/DmlBuffer.cpp    | 10 +++
 .../dml/DmlExecutionProvider/src/DmlBuffer.h  |  3 +-
 .../src/DmlBufferRegion.cpp                   | 76 ++++++++++++++---
 .../src/DmlBufferRegion.h                     | 34 ++++++--
 .../src/DmlCommandRecorder.cpp                | 15 +++-
 .../src/DmlCommittedResourceWrapper.h         |  6 ++
 .../src/DmlGraphFusionHelper.cpp              |  2 +-
 .../src/DmlReservedResourceSubAllocator.cpp   |  4 +-
 .../src/DmlReservedResourceSubAllocator.h     |  4 +-
 .../src/DmlReservedResourceWrapper.h          |  6 ++
 .../src/DmlResourceWrapper.h                  |  6 +-
 .../src/ExecutionContext.cpp                  |  3 +
 .../src/ExecutionProvider.cpp                 | 78 +++++++++++++-----
 .../src/FusedGraphKernel.cpp                  |  4 +-
 .../src/MLOperatorAuthorImpl.cpp              | 12 ++-
 .../src/MLOperatorAuthorImpl.h                |  4 +-
 .../src/Operators/DmlDFT.h                    | 81 +------------------
 .../src/Operators/DmlGridSample.h             | 60 ++------------
 .../src/Operators/DmlSTFT.h                   | 22 +----
 .../DmlExecutionProvider/src/ReadbackHeap.cpp |  4 +-
 .../DmlExecutionProvider/src/ReadbackHeap.h   |  2 +-
 22 files changed, 245 insertions(+), 201 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
index 7c11358bb106d..546a42342a2a0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
@@ -36,6 +36,16 @@ namespace Dml
             return m_resourceWrapper->GetUavResource();
         }
 
+        ID3D12Resource* GetCopySrcResource() const
+        {
+            return m_resourceWrapper->GetCopySrcResource();
+        }
+
+        ID3D12Resource* GetCopyDstResource() const
+        {
+            return m_resourceWrapper->GetCopyDstResource();
+        }
+
         ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
         {
             return std::move(m_resourceWrapper);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
index 464ce26c16f54..c5fa576d24a0f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
@@ -48,6 +48,16 @@ ID3D12Resource* DmlBuffer::ResourceInUavState() const
     return buffer_region_.ResourceInUavState();
 }
 
+ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const
+{
+    return buffer_region_.ResourceInCopySrcState();
+}
+
+ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const
+{
+    return buffer_region_.ResourceInCopyDstState();
+}
+
 uint64_t DmlBuffer::Offset() const
 {
     return buffer_region_ ? buffer_region_.Offset() : 0;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
index 4b0dd58ce4467..b98ae727e1a65 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
@@ -27,8 +27,9 @@ class DmlBuffer
     DmlBuffer(DmlBuffer&&);
     DmlBuffer& operator=(DmlBuffer&&);
 
-    // TODO (pavignol): Rename to Resource()
     ID3D12Resource* ResourceInUavState() const;
+    ID3D12Resource* ResourceInCopySrcState() const;
+    ID3D12Resource* ResourceInCopyDstState() const;
     uint64_t Offset() const;
     uint64_t SizeInBytes() const;
     const D3D12BufferRegion& Region() const { return buffer_region_; }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
index c33cc5491c7f0..3240042b5b6a6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
@@ -7,64 +7,114 @@
 namespace Dml
 {
 
-    D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource)
-        : m_resource(resource),
+    D3D12BufferRegion::D3D12BufferRegion(
+        uint64_t offset,
+        uint64_t size_in_bytes,
+        ID3D12Resource* resource_uav_state,
+        ID3D12Resource* resource_copy_src_state,
+        ID3D12Resource* resource_copy_dst_state)
+        : resource_uav_state_(resource_uav_state),
+        resource_copy_src_state_(resource_copy_src_state),
+        resource_copy_dst_state_(resource_copy_dst_state),
         offset_(offset),
         size_in_bytes_(size_in_bytes)
     {
-        ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr);
+        // Get a raw pointer to the first non-null resource passed in. At least one
+        // resource must be provided.
+        first_valid_resource_ = resource_uav_state_;
+        if (!first_valid_resource_)
+        {
+            first_valid_resource_ = resource_copy_src_state_;
+        }
+        if (!first_valid_resource_)
+        {
+            first_valid_resource_ = resource_copy_dst_state_;
+        }
+        ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr);
 
         // Regions cannot be empty.
         ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0);
 
         // Regions cannot extend beyond the size of the resource.
-        uint64_t buffer_size = m_resource->GetDesc().Width;
+        uint64_t buffer_size = first_valid_resource_->GetDesc().Width;
         ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size);
         ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset);
 
         // All three resources, if provided, must be identical aside from state.
-        assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER);
-        assert(m_resource->GetDesc().Width == buffer_size);
+        assert(
+            first_valid_resource_->GetDesc().Dimension ==
+            D3D12_RESOURCE_DIMENSION_BUFFER);
+        assert(
+            !resource_uav_state ||
+            (resource_uav_state->GetDesc().Dimension ==
+                D3D12_RESOURCE_DIMENSION_BUFFER &&
+            resource_uav_state->GetDesc().Width == buffer_size));
+        assert(
+            !resource_copy_src_state_ ||
+            (resource_copy_src_state_->GetDesc().Dimension ==
+                D3D12_RESOURCE_DIMENSION_BUFFER &&
+            resource_copy_src_state_->GetDesc().Width == buffer_size));
+        assert(
+            !resource_copy_dst_state_ ||
+            (resource_copy_dst_state_->GetDesc().Dimension ==
+                D3D12_RESOURCE_DIMENSION_BUFFER &&
+            resource_copy_dst_state_->GetDesc().Width == buffer_size));
     }
 
     D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that)
     {
-        std::swap(this->m_resource, that.m_resource);
+        std::swap(this->resource_uav_state_, that.resource_uav_state_);
+        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
+        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
+        std::swap(this->first_valid_resource_, that.first_valid_resource_);
     }
 
     D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that)
     {
-        std::swap(this->m_resource, that.m_resource);
+        std::swap(this->resource_uav_state_, that.resource_uav_state_);
+        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
+        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
+        std::swap(this->first_valid_resource_, that.first_valid_resource_);
         return *this;
     }
 
     ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const
     {
-        return m_resource;
+        return resource_uav_state_;
+    }
+
+    ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const
+    {
+        return resource_copy_src_state_;
+    }
+
+    ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const
+    {
+        return resource_copy_dst_state_;
     }
 
     uint64_t D3D12BufferRegion::Offset() const
     {
-        return m_resource ? offset_ : 0;
+        return first_valid_resource_ ? offset_ : 0;
     }
 
     uint64_t D3D12BufferRegion::SizeInBytes() const
     {
-        return m_resource ? size_in_bytes_ : 0;
+        return first_valid_resource_ ? size_in_bytes_ : 0;
     }
 
     DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const
     {
-        if (!m_resource)
+        if (!resource_uav_state_)
         {
             return DML_BUFFER_BINDING{};
         }
 
-        return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_};
+        return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_};
     }
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
index 6c5cb37297caa..dee01a29fe55f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
@@ -19,39 +19,61 @@ namespace Dml
         D3D12BufferRegion(
             uint64_t offset,
             uint64_t size_in_bytes,
-            ID3D12Resource* resource);
+            ID3D12Resource* resource_uav_state,
+            ID3D12Resource* resource_copy_src_state,
+            ID3D12Resource* resource_copy_dst_state);
 
         // Move-only
         D3D12BufferRegion(const D3D12BufferRegion&) = default;
         D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default;
         D3D12BufferRegion(D3D12BufferRegion&&);
         D3D12BufferRegion& operator=(D3D12BufferRegion&&);
+
         ID3D12Resource* ResourceInUavState() const;
 
+        // NOTE: may be any state that is valid as a copy source (COPY_SRC,
+        // GENERIC_READ, or COMMON).
+        ID3D12Resource* ResourceInCopySrcState() const;
+
+        ID3D12Resource* ResourceInCopyDstState() const;
+
         uint64_t Offset() const;
         uint64_t SizeInBytes() const;
 
         DML_BUFFER_BINDING GetBufferBinding() const;
 
-        explicit operator bool() const { return m_resource != nullptr; }
+        explicit operator bool() const { return first_valid_resource_ != nullptr; }
 
         // Creates a subregion at an offset from the start of this region. If no
         // size is provided the region runs to the end of the current region.
-        inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const
+        inline D3D12BufferRegion Subregion(
+            uint64_t offset,
+            uint64_t size_in_bytes = 0) const
         {
             // start of subregion must be within current region
             ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_);
-            size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
+            size_in_bytes =
+                size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
             // end of subregion must be within current region
             ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset);
 
-            return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource);
+            return D3D12BufferRegion(
+                offset_ + offset,
+                size_in_bytes,
+                resource_uav_state_,
+                resource_copy_src_state_,
+                resource_copy_dst_state_);
         }
 
     private:
-        ID3D12Resource* m_resource = nullptr;
+        ID3D12Resource* resource_uav_state_ = nullptr;
+        ID3D12Resource* resource_copy_src_state_ = nullptr;
+        ID3D12Resource* resource_copy_dst_state_ = nullptr;
         uint64_t offset_ = 0;
         uint64_t size_in_bytes_ = 0;
+
+        // Pointer to the first resource above that isn't null.
+        ID3D12Resource* first_valid_resource_ = nullptr;
     };
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index 862884c22b08c..af625334b7720 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -94,8 +94,10 @@ void DmlCommandRecorder::InitializeOperator(
     if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) ||
         (temporaryResourceSize > 0))
     {
-        auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
-        m_currentCommandList->ResourceBarrier(1, &uav);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
     }
 }
 
@@ -154,8 +156,13 @@ void DmlCommandRecorder::ExecuteOperator(
     // Barrier all outputs.
     #pragma warning(push)
     #pragma warning(disable: 6387)
-    auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
-    m_currentCommandList->ResourceBarrier(1, &uav);
+
+    // Barrier all outputs.
+    D3D12_RESOURCE_BARRIER barriers[] = {
+        CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+        CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+    m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
+
     #pragma warning(pop)
 }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
index 4b9c167dfe671..f786cca837f06 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
@@ -12,6 +12,12 @@ namespace Dml
 
         // Committed resources use the same resource for all states and use barriers to transition between states
         ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); }
+        ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); }
+        ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); }
+
+        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
+        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
+        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
 
     private:
         ComPtr<ID3D12Resource> m_d3d12Resource;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index dcf6b8607f319..cd6b241e70d48 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -103,7 +103,7 @@ namespace DmlGraphFusionHelper
             // The allocation is not pooled
             auto allocInfo = static_cast<AllocationInfo*>(opaqueData);
             *allocId = 0;
-            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
+            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
         }
 
         auto taggedPointer = TaggedPointer::Unpack(opaqueData);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index 0dc07384ea905..c82e0a4f5d722 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -400,7 +400,9 @@ namespace Dml
         return D3D12BufferRegion(
             taggedPointer.offset,
             size_in_bytes,
-            it->second->GetUavResource());
+            it->second->GetUavResource(),
+            it->second->GetCopySrcResource(),
+            it->second->GetCopyDstResource());
     }
 
     AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index 8049848c8671e..1d7c8704ab7da 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -60,7 +60,9 @@ namespace Dml
         // the ID3D12Resource is cached, so this call typically has a lower cost
         // than a call to ID3D12Device::CreatePlacedResource or
         // CreateReservedResource.
-        D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes);
+        D3D12BufferRegion CreateBufferRegion(
+            const TaggedPointer& taggedPointer,
+            uint64_t size_in_bytes);
 
         AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer);
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
index 22f8cbbdc394b..68feab568ca45 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
@@ -18,6 +18,12 @@ namespace Dml
         }
 
         ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); }
+        ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); }
+        ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); }
+
+        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
+        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; }
+        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; }
 
     private:
         DmlHeapAllocation m_allocation;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
index 2b1a8e5c726dc..03e9f762b7eb4 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
@@ -11,8 +11,12 @@ namespace Dml
     DmlResourceWrapper : public IUnknown
     {
     public:
-        // TODO (pavignol): Rename to GetResource()
         virtual ID3D12Resource* GetUavResource() const = 0;
+        virtual ID3D12Resource* GetCopySrcResource() const = 0;
+        virtual ID3D12Resource* GetCopyDstResource() const = 0;
+        virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0;
+        virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0;
+        virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0;
         virtual ~DmlResourceWrapper(){}
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index b5492a1a86ea3..d6a46e354c769 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -100,6 +100,8 @@ namespace Dml
             barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState));
         }
 
+        // Since this copy may write to GPU memory, we also need to perform an aliasing barrier
+        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
 
         // Keep the intermediate buffer alive until we're done with it
@@ -133,6 +135,7 @@ namespace Dml
 
         // Since this copy may write to GPU memory, we also need to perform an
         // aliasing barrier
+        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
 */
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index b06d23adf5886..f72035f5e5fda 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -443,8 +443,15 @@ namespace Dml
             // CPU -> GPU copy (upload)
             //
             auto dstBufferRegion = GetBufferForTensor(dst);
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
-            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? dstBufferRegion.ResourceInUavState()
+                : dstBufferRegion.ResourceInCopyDstState();
+
+            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_DEST;
+
             const uint64_t dstOffset = dstBufferRegion.Offset();
             m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes));
             FlushUploadsIfReady();
@@ -455,26 +462,47 @@ namespace Dml
             // GPU -> CPU copy (readback)
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
+
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+
             const uint64_t srcOffset = srcBufferRegion.Offset();
             m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState);
         }
         else if (!src->IsCpuData() && !dst->IsCpuData())
         {
+            printf("*****************DmlCommandRecorder::CopyBufferRegion\n");
+
             //
             // GPU -> GPU copy
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-            const uint64_t srcOffset = srcBufferRegion.Offset();
+
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
+
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
 
             auto dstBufferRegion = GetBufferForTensor(dst);
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
-            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-            const uint64_t dstOffset = dstBufferRegion.Offset();
 
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? dstBufferRegion.ResourceInUavState()
+                : dstBufferRegion.ResourceInCopyDstState();
+
+            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_DEST;
+
+            const uint64_t srcOffset = srcBufferRegion.Offset();
+            const uint64_t dstOffset = dstBufferRegion.Offset();
             m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes);
         }
         else
@@ -496,6 +524,7 @@ namespace Dml
 
         // Source and destination for batched GPU -> CPU copies
         std::vector<ID3D12Resource*> srcDatas;
+        std::vector<D3D12_RESOURCE_STATES> srcStates;
         std::vector<uint64_t> srcOffsets;
         std::vector<void*> dstDatas;
         std::vector<uint32_t> dataSizesInBytes;
@@ -528,16 +557,21 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(src[i]);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
+
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
 
             srcDatas.push_back(srcData);
+            srcStates.push_back(srcState);
             srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
 
         return S_OK;
         }
@@ -907,6 +941,9 @@ namespace Dml
         std::vector<ID3D12Resource*> srcDatas;
         srcDatas.reserve(src_dst_pairs.size());
 
+        std::vector<D3D12_RESOURCE_STATES> srcStates;
+        srcStates.reserve(src_dst_pairs.size());
+
         std::vector<uint64_t> srcOffsets;
         srcOffsets.reserve(src_dst_pairs.size());
 
@@ -956,16 +993,21 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(&srcWrapper);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? srcBufferRegion.ResourceInUavState()
+                : srcBufferRegion.ResourceInCopySrcState();
+
+            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
+                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+                : D3D12_RESOURCE_STATE_COPY_SOURCE;
 
             srcDatas.push_back(srcData);
+            srcStates.push_back(srcState);
             srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
 
         return onnxruntime::common::Status::OK();
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index b00b8f8e19f52..35955b113b2c1 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -124,7 +124,9 @@ namespace Dml
                         inputBufferRegions[i] = D3D12BufferRegion(
                             0,
                             m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width,
-                            m_nonOwnedGraphInputsFromInitializers[i].Get());
+                            m_nonOwnedGraphInputsFromInitializers[i].Get(),
+                            nullptr,
+                            nullptr);
                     }
                     else if (!m_isInputsUploadedByDmlEP[i])
                     {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index dde290f0bce0f..1e7ebdc234c22 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1617,7 +1617,7 @@ namespace Windows::AI::MachineLearning::Adapter
         if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL)
         {
             auto allocInfo = static_cast<Dml::AllocationInfo*>(m_tensorData);
-            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
+            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
         }
 
         auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData);
@@ -1716,6 +1716,8 @@ namespace Windows::AI::MachineLearning::Adapter
         }
     }
 
+    // TODO (pavignol): Fix once we go back to a single resource
+    /*
     void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp)
     {
         if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
@@ -1767,9 +1769,9 @@ namespace Windows::AI::MachineLearning::Adapter
                 }
             }
 
-            for (auto& tempBuffer : m_temporaryBuffers)
+            for (auto& tempAlloc : m_temporaryAbiAllocations)
             {
-                resourcesToTransition.push_back(tempBuffer.ResourceInUavState());
+                resourcesToTransition.push_back(tempAlloc.Get());
             }
 
             m_winmlProvider->TransitionResourcesForOperator(
@@ -1778,6 +1780,7 @@ namespace Windows::AI::MachineLearning::Adapter
                 resourcesToTransition.data());
         }
     }
+    */
 
     OpKernelContextWrapper::OpKernelContextWrapper(
         onnxruntime::OpKernelContext* context,
@@ -1825,10 +1828,13 @@ namespace Windows::AI::MachineLearning::Adapter
 
     void OpKernelContextWrapper::Close()
     {
+        // TODO (pavignol): Fix once we go back to a single resource
+        /*
         if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
         {
             TransitionResourcesForOperatorIfRequired(false);
         }
+        */
 
         for (auto& tensors : m_inputTensors)
         {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index 85b6b197fe511..647e0a17d26df 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -512,7 +512,9 @@ class OpKernelContextWrapper : public WRL::Base<IMLOperatorKernelContext, IMLOpe
 
  protected:
     void ClearTempAllocations();
-    void TransitionResourcesForOperatorIfRequired(bool isBeforeOp);
+
+    // TODO (pavignol): Fix once we go back to a single resource
+    // void TransitionResourcesForOperatorIfRequired(bool isBeforeOp);
 
     // Lifetime is managed by the caller and guaranteed to outlive this class
     onnxruntime::OpKernelContext* m_impl = nullptr;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index 1e3035648adcb..ed1a6ebe49171 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -727,28 +727,9 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& bluesteinZChirpParams = dftParams.BluesteinZChirpParams;
 
         // Get resources
-        auto inputBufferRegion =  bluesteinZChirpParams.AFFTParams.ResourceLoopList.front().BufferRegion;
-        auto outputBufferRegion = bluesteinZChirpParams.AFFTInverseParams.ResourceLoopList[bluesteinZChirpParams.AFFTInverseParams.OutputIndex].BufferRegion;
         auto zChirpBufferRegion = bluesteinZChirpParams.ZChirp.BufferRegion;
         auto bBufferRegion = bluesteinZChirpParams.B.BufferRegion;
 
-        // Transition resources from common to UAV state
-        D3D12_RESOURCE_BARRIER barriers[2];
-
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        commandList->ResourceBarrier(2, barriers);
-
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_bluesteinChirpRootSignature.Get());
         commandList->SetPipelineState(m_bluesteinChirpPipelineState.Get());
@@ -783,21 +764,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         chirpLength *= (m_isInverse ? 1 : -1);
         float scale = isInverse ? 1.f / dftParams.DFTLength : 1.f;
         StockhamFFT(fft_params, true,  chirpLength, scale, commandList);
-
-        // Transition resources to common state
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        commandList->ResourceBarrier(2, barriers);
     }
 
     void StockhamFFT(
@@ -813,27 +779,8 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& loopList = stockhamParams.ResourceLoopList;
 
         // Get input and output resources
-        auto inputBufferRegion = loopList[0].BufferRegion;
-        auto outputBufferRegion = loopList[stockhamParams.OutputIndex].BufferRegion;
         auto windowBufferRegion = dftParams.StockhamParams.Window.BufferRegion;
 
-        // Transition resources from common to UAV state
-        D3D12_RESOURCE_BARRIER barriers[2];
-
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        commandList->ResourceBarrier(2, barriers);
-
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_stockhamFFTRootSignature.Get());
         commandList->SetPipelineState(m_stockhamFFTPipelineState.Get());
@@ -875,21 +822,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             std::array<Dml::D3D12BufferRegion, 3> uav_resources = { in, out, window };
             Dispatch(uav_resources, constants, commandList);
         }
-
-        // Transition resources to common state
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        commandList->ResourceBarrier(2, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -906,14 +838,6 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
-        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
-
-        std::transform(
-            bufferRegions.begin(), bufferRegions.end(),
-            uav_barriers,
-            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
-        commandList->ResourceBarrier(TSize, uav_barriers);
-
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -962,7 +886,10 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        commandList->ResourceBarrier(TSize, uav_barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
index 0611c4b7bf7f7..8863bd5362d27 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
@@ -683,29 +683,6 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         Dml::GetDescendingPackedStrides(gridDims, gridStrides);
         Dml::GetDescendingPackedStrides(outputDims, outputStrides);
 
-        // Transition resources from common to UAV state
-        D3D12_RESOURCE_BARRIER barriers[3];
-
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            gridBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_COMMON,
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-        );
-
-        commandList->ResourceBarrier(3, barriers);
-
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get());
         commandList->SetPipelineState(m_gridSamplePipelineState.Get());
@@ -727,26 +704,10 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         std::array<Dml::D3D12BufferRegion, 3> uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion };
         Dispatch(uavBufferRegions, constants, commandList);
 
-        // Transition resources to common state
-        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            gridBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
-            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-            D3D12_RESOURCE_STATE_COMMON
-        );
-
-        commandList->ResourceBarrier(3, barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -763,14 +724,6 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
-        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
-
-        std::transform(
-            bufferRegions.begin(), bufferRegions.end(),
-            uav_barriers,
-            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
-        commandList->ResourceBarrier(TSize, uav_barriers);
-
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -819,7 +772,10 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        commandList->ResourceBarrier(TSize, uav_barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
index 945b58965cf2f..cd1f78e2a23a6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
@@ -405,15 +405,9 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
         std::array<DML_BINDING_DESC, 2> inputBindings;
         uint32_t inputBindingsCount = 1;
 
-        // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking
-        // barrierCount is outside the valid range.
-        D3D12_RESOURCE_BARRIER barriers[3];
-        uint32_t barrierCount = 0;
-
         Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal);
         inputBuffers[0] = signalBufferRegion.GetBufferBinding();
         inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] };
-        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         Dml::D3D12BufferRegion windowBufferRegion;
         if (m_framingOperator.hasWindowTensor)
@@ -421,7 +415,6 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window);
             inputBuffers[1] = windowBufferRegion.GetBufferBinding();
             inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] };
-            barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
             inputBindingsCount++;
         }
 
@@ -429,7 +422,6 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
 
         DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding();
         DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer };
-        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         m_framingOperator.bindingTable->BindOutputs(1, &outputBinding);
 
@@ -451,22 +443,16 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc);
         }
 
-        // Transition resources COMMON -> UAV
-        commandList->ResourceBarrier(barrierCount, barriers);
-
         m_framingOperator.commandRecorder->RecordDispatch(
             commandList,
             m_framingOperator.op.Get(),
             m_framingOperator.bindingTable.Get()
         );
 
-        // Transition resources UAV -> COMMON
-        for (uint32_t barrierIndex = 0; barrierIndex < barrierCount; barrierIndex++)
-        {
-            std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter);
-        }
-
-        commandList->ResourceBarrier(barrierCount, barriers);
+        D3D12_RESOURCE_BARRIER barriers[] = {
+            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
+            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
+        commandList->ResourceBarrier(2, barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
index 5bb04ba4d30b5..a91886c3b5863 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
@@ -106,7 +106,7 @@ namespace Dml
         gsl::span<const uint32_t > dstSizes,
         gsl::span<ID3D12Resource*> src,
         gsl::span<uint64_t> srcOffsets,
-        D3D12_RESOURCE_STATES srcState)
+        gsl::span<const D3D12_RESOURCE_STATES> srcStates)
     {
         assert(dst.size() == src.size());
         assert(dstSizes.size() == src.size());
@@ -134,7 +134,7 @@ namespace Dml
                 D3D12_RESOURCE_STATE_COPY_DEST,
                 src[i],
                 srcOffsets[i],
-                srcState,
+                srcStates[i],
                 dstSizes[i]);
 
             offset += dstSizes[i];
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
index 4a65ce899d791..f888f0a55ac48 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
@@ -28,7 +28,7 @@ namespace Dml
             gsl::span<const uint32_t > dstSizes,
             gsl::span<ID3D12Resource*> src,
             gsl::span<uint64_t> srcOffsets,
-            D3D12_RESOURCE_STATES srcState);
+            gsl::span<const D3D12_RESOURCE_STATES> srcStates);
 
     private:
         void EnsureReadbackHeap(size_t size);

From 57d2f46c7e55ad6cf067934a3f208abe955db67c Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 12 Jul 2023 12:59:47 -0700
Subject: [PATCH 45/76] Remove aliasing

This reverts commit 587489d84711cdc47da2b2cda5952fe9bd2cb3c7.
---
 .../src/DmlAllocationInfo.h                   | 10 ---
 .../DmlExecutionProvider/src/DmlBuffer.cpp    | 10 ---
 .../dml/DmlExecutionProvider/src/DmlBuffer.h  |  3 +-
 .../src/DmlBufferRegion.cpp                   | 76 +++--------------
 .../src/DmlBufferRegion.h                     | 34 ++------
 .../src/DmlCommandRecorder.cpp                | 15 +---
 .../src/DmlCommittedResourceWrapper.h         |  6 --
 .../src/DmlGraphFusionHelper.cpp              |  2 +-
 .../src/DmlReservedResourceSubAllocator.cpp   |  4 +-
 .../src/DmlReservedResourceSubAllocator.h     |  4 +-
 .../src/DmlReservedResourceWrapper.h          |  6 --
 .../src/DmlResourceWrapper.h                  |  6 +-
 .../src/ExecutionContext.cpp                  |  3 -
 .../src/ExecutionProvider.cpp                 | 78 +++++-------------
 .../src/FusedGraphKernel.cpp                  |  4 +-
 .../src/MLOperatorAuthorImpl.cpp              | 12 +--
 .../src/MLOperatorAuthorImpl.h                |  4 +-
 .../src/Operators/DmlDFT.h                    | 81 ++++++++++++++++++-
 .../src/Operators/DmlGridSample.h             | 60 ++++++++++++--
 .../src/Operators/DmlSTFT.h                   | 22 ++++-
 .../DmlExecutionProvider/src/ReadbackHeap.cpp |  4 +-
 .../DmlExecutionProvider/src/ReadbackHeap.h   |  2 +-
 22 files changed, 201 insertions(+), 245 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
index 546a42342a2a0..7c11358bb106d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
@@ -36,16 +36,6 @@ namespace Dml
             return m_resourceWrapper->GetUavResource();
         }
 
-        ID3D12Resource* GetCopySrcResource() const
-        {
-            return m_resourceWrapper->GetCopySrcResource();
-        }
-
-        ID3D12Resource* GetCopyDstResource() const
-        {
-            return m_resourceWrapper->GetCopyDstResource();
-        }
-
         ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
         {
             return std::move(m_resourceWrapper);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
index c5fa576d24a0f..464ce26c16f54 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
@@ -48,16 +48,6 @@ ID3D12Resource* DmlBuffer::ResourceInUavState() const
     return buffer_region_.ResourceInUavState();
 }
 
-ID3D12Resource* DmlBuffer::ResourceInCopySrcState() const
-{
-    return buffer_region_.ResourceInCopySrcState();
-}
-
-ID3D12Resource* DmlBuffer::ResourceInCopyDstState() const
-{
-    return buffer_region_.ResourceInCopyDstState();
-}
-
 uint64_t DmlBuffer::Offset() const
 {
     return buffer_region_ ? buffer_region_.Offset() : 0;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
index b98ae727e1a65..4b0dd58ce4467 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
@@ -27,9 +27,8 @@ class DmlBuffer
     DmlBuffer(DmlBuffer&&);
     DmlBuffer& operator=(DmlBuffer&&);
 
+    // TODO (pavignol): Rename to Resource()
     ID3D12Resource* ResourceInUavState() const;
-    ID3D12Resource* ResourceInCopySrcState() const;
-    ID3D12Resource* ResourceInCopyDstState() const;
     uint64_t Offset() const;
     uint64_t SizeInBytes() const;
     const D3D12BufferRegion& Region() const { return buffer_region_; }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
index 3240042b5b6a6..c33cc5491c7f0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
@@ -7,114 +7,64 @@
 namespace Dml
 {
 
-    D3D12BufferRegion::D3D12BufferRegion(
-        uint64_t offset,
-        uint64_t size_in_bytes,
-        ID3D12Resource* resource_uav_state,
-        ID3D12Resource* resource_copy_src_state,
-        ID3D12Resource* resource_copy_dst_state)
-        : resource_uav_state_(resource_uav_state),
-        resource_copy_src_state_(resource_copy_src_state),
-        resource_copy_dst_state_(resource_copy_dst_state),
+    D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource)
+        : m_resource(resource),
         offset_(offset),
         size_in_bytes_(size_in_bytes)
     {
-        // Get a raw pointer to the first non-null resource passed in. At least one
-        // resource must be provided.
-        first_valid_resource_ = resource_uav_state_;
-        if (!first_valid_resource_)
-        {
-            first_valid_resource_ = resource_copy_src_state_;
-        }
-        if (!first_valid_resource_)
-        {
-            first_valid_resource_ = resource_copy_dst_state_;
-        }
-        ORT_THROW_HR_IF(E_INVALIDARG, first_valid_resource_ == nullptr);
+        ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr);
 
         // Regions cannot be empty.
         ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0);
 
         // Regions cannot extend beyond the size of the resource.
-        uint64_t buffer_size = first_valid_resource_->GetDesc().Width;
+        uint64_t buffer_size = m_resource->GetDesc().Width;
         ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size);
         ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset);
 
-        // All three resources, if provided, must be identical aside from state.
+        // The region must be backed by a buffer resource.
-        assert(
-            first_valid_resource_->GetDesc().Dimension ==
-            D3D12_RESOURCE_DIMENSION_BUFFER);
-        assert(
-            !resource_uav_state ||
-            (resource_uav_state->GetDesc().Dimension ==
-                D3D12_RESOURCE_DIMENSION_BUFFER &&
-            resource_uav_state->GetDesc().Width == buffer_size));
-        assert(
-            !resource_copy_src_state_ ||
-            (resource_copy_src_state_->GetDesc().Dimension ==
-                D3D12_RESOURCE_DIMENSION_BUFFER &&
-            resource_copy_src_state_->GetDesc().Width == buffer_size));
-        assert(
-            !resource_copy_dst_state_ ||
-            (resource_copy_dst_state_->GetDesc().Dimension ==
-                D3D12_RESOURCE_DIMENSION_BUFFER &&
-            resource_copy_dst_state_->GetDesc().Width == buffer_size));
+        assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER);
+        assert(m_resource->GetDesc().Width == buffer_size);
     }
 
     D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that)
     {
-        std::swap(this->resource_uav_state_, that.resource_uav_state_);
-        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
-        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
+        std::swap(this->m_resource, that.m_resource);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
-        std::swap(this->first_valid_resource_, that.first_valid_resource_);
     }
 
     D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that)
     {
-        std::swap(this->resource_uav_state_, that.resource_uav_state_);
-        std::swap(this->resource_copy_src_state_, that.resource_copy_src_state_);
-        std::swap(this->resource_copy_dst_state_, that.resource_copy_dst_state_);
+        std::swap(this->m_resource, that.m_resource);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
-        std::swap(this->first_valid_resource_, that.first_valid_resource_);
         return *this;
     }
 
     ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const
     {
-        return resource_uav_state_;
-    }
-
-    ID3D12Resource* D3D12BufferRegion::ResourceInCopySrcState() const
-    {
-        return resource_copy_src_state_;
-    }
-
-    ID3D12Resource* D3D12BufferRegion::ResourceInCopyDstState() const
-    {
-        return resource_copy_dst_state_;
+        return m_resource;
     }
 
     uint64_t D3D12BufferRegion::Offset() const
     {
-        return first_valid_resource_ ? offset_ : 0;
+        return m_resource ? offset_ : 0;
     }
 
     uint64_t D3D12BufferRegion::SizeInBytes() const
     {
-        return first_valid_resource_ ? size_in_bytes_ : 0;
+        return m_resource ? size_in_bytes_ : 0;
     }
 
     DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const
     {
-        if (!resource_uav_state_)
+        if (!m_resource)
         {
             return DML_BUFFER_BINDING{};
         }
 
-        return DML_BUFFER_BINDING{resource_uav_state_, offset_, size_in_bytes_};
+        return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_};
     }
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
index dee01a29fe55f..6c5cb37297caa 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
@@ -19,61 +19,39 @@ namespace Dml
         D3D12BufferRegion(
             uint64_t offset,
             uint64_t size_in_bytes,
-            ID3D12Resource* resource_uav_state,
-            ID3D12Resource* resource_copy_src_state,
-            ID3D12Resource* resource_copy_dst_state);
+            ID3D12Resource* resource);
 
-        // Move-only
+        // Copyable and movable
         D3D12BufferRegion(const D3D12BufferRegion&) = default;
         D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default;
         D3D12BufferRegion(D3D12BufferRegion&&);
         D3D12BufferRegion& operator=(D3D12BufferRegion&&);
-
         ID3D12Resource* ResourceInUavState() const;
 
-        // NOTE: may be any state that is valid as a copy source (COPY_SRC,
-        // GENERIC_READ, or COMMON).
-        ID3D12Resource* ResourceInCopySrcState() const;
-
-        ID3D12Resource* ResourceInCopyDstState() const;
-
         uint64_t Offset() const;
         uint64_t SizeInBytes() const;
 
         DML_BUFFER_BINDING GetBufferBinding() const;
 
-        explicit operator bool() const { return first_valid_resource_ != nullptr; }
+        explicit operator bool() const { return m_resource != nullptr; }
 
         // Creates a subregion at an offset from the start of this region. If no
         // size is provided the region runs to the end of the current region.
-        inline D3D12BufferRegion Subregion(
-            uint64_t offset,
-            uint64_t size_in_bytes = 0) const
+        inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const
         {
             // start of subregion must be within current region
             ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_);
-            size_in_bytes =
-                size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
+            size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
             // end of subregion must be within current region
             ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset);
 
-            return D3D12BufferRegion(
-                offset_ + offset,
-                size_in_bytes,
-                resource_uav_state_,
-                resource_copy_src_state_,
-                resource_copy_dst_state_);
+            return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource);
         }
 
     private:
-        ID3D12Resource* resource_uav_state_ = nullptr;
-        ID3D12Resource* resource_copy_src_state_ = nullptr;
-        ID3D12Resource* resource_copy_dst_state_ = nullptr;
+        ID3D12Resource* m_resource = nullptr;
         uint64_t offset_ = 0;
         uint64_t size_in_bytes_ = 0;
-
-        // Pointer to the first resource above that isn't null.
-        ID3D12Resource* first_valid_resource_ = nullptr;
     };
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index af625334b7720..862884c22b08c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -94,10 +94,8 @@ void DmlCommandRecorder::InitializeOperator(
     if ((persistentResourceBinding.Type != DML_BINDING_TYPE_NONE) ||
         (temporaryResourceSize > 0))
     {
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
+        auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
+        m_currentCommandList->ResourceBarrier(1, &uav);
     }
 }
 
@@ -156,13 +154,8 @@ void DmlCommandRecorder::ExecuteOperator(
     // Barrier all outputs.
     #pragma warning(push)
     #pragma warning(disable: 6387)
-
-    // Barrier all outputs.
-    D3D12_RESOURCE_BARRIER barriers[] = {
-        CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-        CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-    m_currentCommandList->ResourceBarrier(ABSL_ARRAYSIZE(barriers), barriers);
-
+    auto uav = CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
+    m_currentCommandList->ResourceBarrier(1, &uav);
     #pragma warning(pop)
 }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
index f786cca837f06..4b9c167dfe671 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
@@ -12,12 +12,6 @@ namespace Dml
 
         // Committed resources use the same resource for all states and use barriers to transition between states
         ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); }
-        ID3D12Resource* GetCopySrcResource() const final { return m_d3d12Resource.Get(); }
-        ID3D12Resource* GetCopyDstResource() const final { return m_d3d12Resource.Get(); }
-
-        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
-        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
-        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
 
     private:
         ComPtr<ID3D12Resource> m_d3d12Resource;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index cd6b241e70d48..dcf6b8607f319 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -103,7 +103,7 @@ namespace DmlGraphFusionHelper
             // The allocation is not pooled
             auto allocInfo = static_cast<AllocationInfo*>(opaqueData);
             *allocId = 0;
-            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
+            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
         }
 
         auto taggedPointer = TaggedPointer::Unpack(opaqueData);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index c82e0a4f5d722..0dc07384ea905 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -400,9 +400,7 @@ namespace Dml
         return D3D12BufferRegion(
             taggedPointer.offset,
             size_in_bytes,
-            it->second->GetUavResource(),
-            it->second->GetCopySrcResource(),
-            it->second->GetCopyDstResource());
+            it->second->GetUavResource());
     }
 
     AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index 1d7c8704ab7da..8049848c8671e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -60,9 +60,7 @@ namespace Dml
         // the ID3D12Resource is cached, so this call typically has a lower cost
         // than a call to ID3D12Device::CreatePlacedResource or
         // CreateReservedResource.
-        D3D12BufferRegion CreateBufferRegion(
-            const TaggedPointer& taggedPointer,
-            uint64_t size_in_bytes);
+        D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes);
 
         AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer);
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
index 68feab568ca45..22f8cbbdc394b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
@@ -18,12 +18,6 @@ namespace Dml
         }
 
         ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); }
-        ID3D12Resource* GetCopySrcResource() const final { return m_allocation.resource_copy_src_state.Get(); }
-        ID3D12Resource* GetCopyDstResource() const final { return m_allocation.resource_copy_dst_state.Get(); }
-
-        D3D12_RESOURCE_STATES GetDefaultUavState() const final { return D3D12_RESOURCE_STATE_UNORDERED_ACCESS; }
-        D3D12_RESOURCE_STATES GetDefaultCopySrcState() const final { return D3D12_RESOURCE_STATE_COPY_SOURCE; }
-        D3D12_RESOURCE_STATES GetDefaultCopyDstState() const final { return D3D12_RESOURCE_STATE_COPY_DEST; }
 
     private:
         DmlHeapAllocation m_allocation;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
index 03e9f762b7eb4..2b1a8e5c726dc 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
@@ -11,12 +11,8 @@ namespace Dml
     DmlResourceWrapper : public IUnknown
     {
     public:
+        // TODO (pavignol): Rename to GetResource()
         virtual ID3D12Resource* GetUavResource() const = 0;
-        virtual ID3D12Resource* GetCopySrcResource() const = 0;
-        virtual ID3D12Resource* GetCopyDstResource() const = 0;
-        virtual D3D12_RESOURCE_STATES GetDefaultUavState() const = 0;
-        virtual D3D12_RESOURCE_STATES GetDefaultCopySrcState() const = 0;
-        virtual D3D12_RESOURCE_STATES GetDefaultCopyDstState() const = 0;
         virtual ~DmlResourceWrapper(){}
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index d6a46e354c769..b5492a1a86ea3 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -100,8 +100,6 @@ namespace Dml
             barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState));
         }
 
-        // Since this copy may write to GPU memory, we also need to perform an aliasing barrier
-        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
 
         // Keep the intermediate buffer alive until we're done with it
@@ -135,7 +133,6 @@ namespace Dml
 
         // Since this copy may write to GPU memory, we also need to perform an
         // aliasing barrier
-        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
         m_dmlRecorder.ResourceBarrier(barriers);
 */
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index f72035f5e5fda..b06d23adf5886 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -443,15 +443,8 @@ namespace Dml
             // CPU -> GPU copy (upload)
             //
             auto dstBufferRegion = GetBufferForTensor(dst);
-
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? dstBufferRegion.ResourceInUavState()
-                : dstBufferRegion.ResourceInCopyDstState();
-
-            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_DEST;
-
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
+            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t dstOffset = dstBufferRegion.Offset();
             m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes));
             FlushUploadsIfReady();
@@ -462,47 +455,26 @@ namespace Dml
             // GPU -> CPU copy (readback)
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
-
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
+            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t srcOffset = srcBufferRegion.Offset();
             m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState);
         }
         else if (!src->IsCpuData() && !dst->IsCpuData())
         {
-            printf("*****************DmlCommandRecorder::CopyBufferRegion\n");
-
             //
             // GPU -> GPU copy
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
+            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+            const uint64_t srcOffset = srcBufferRegion.Offset();
 
             auto dstBufferRegion = GetBufferForTensor(dst);
-
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? dstBufferRegion.ResourceInUavState()
-                : dstBufferRegion.ResourceInCopyDstState();
-
-            const auto dstState = dstBufferRegion.ResourceInCopyDstState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_DEST;
-
-            const uint64_t srcOffset = srcBufferRegion.Offset();
+            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
+            const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t dstOffset = dstBufferRegion.Offset();
+
             m_context->CopyBufferRegion(dstData, dstOffset, dstState, srcData, srcOffset, srcState, dataSizeInBytes);
         }
         else
@@ -524,7 +496,6 @@ namespace Dml
 
         // Source and destination for batched GPU -> CPU copies
         std::vector<ID3D12Resource*> srcDatas;
-        std::vector<D3D12_RESOURCE_STATES> srcStates;
         std::vector<uint64_t> srcOffsets;
         std::vector<void*> dstDatas;
         std::vector<uint32_t> dataSizesInBytes;
@@ -557,21 +528,16 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(src[i]);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
 
             srcDatas.push_back(srcData);
-            srcStates.push_back(srcState);
             srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
+        // All sources are passed with the same (UAV) resource state, so one value replaces the per-source list.
+        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
 
         return S_OK;
         }
@@ -941,9 +907,6 @@ namespace Dml
         std::vector<ID3D12Resource*> srcDatas;
         srcDatas.reserve(src_dst_pairs.size());
 
-        std::vector<D3D12_RESOURCE_STATES> srcStates;
-        srcStates.reserve(src_dst_pairs.size());
-
         std::vector<uint64_t> srcOffsets;
         srcOffsets.reserve(src_dst_pairs.size());
 
@@ -993,21 +956,16 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(&srcWrapper);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? srcBufferRegion.ResourceInUavState()
-                : srcBufferRegion.ResourceInCopySrcState();
-
-            const auto srcState = srcBufferRegion.ResourceInCopySrcState() == nullptr
-                ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                : D3D12_RESOURCE_STATE_COPY_SOURCE;
+            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
 
             srcDatas.push_back(srcData);
-            srcStates.push_back(srcState);
             srcOffsets.push_back(srcBufferRegion.Offset());
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcStates);
+        // All sources are passed with the same (UAV) resource state, so one value replaces the per-source list.
+        const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
 
         return onnxruntime::common::Status::OK();
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index 35955b113b2c1..b00b8f8e19f52 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -124,9 +124,7 @@ namespace Dml
                         inputBufferRegions[i] = D3D12BufferRegion(
                             0,
                             m_nonOwnedGraphInputsFromInitializers[i]->GetDesc().Width,
-                            m_nonOwnedGraphInputsFromInitializers[i].Get(),
-                            nullptr,
-                            nullptr);
+                            m_nonOwnedGraphInputsFromInitializers[i].Get());
                     }
                     else if (!m_isInputsUploadedByDmlEP[i])
                     {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 1e7ebdc234c22..dde290f0bce0f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1617,7 +1617,7 @@ namespace Windows::AI::MachineLearning::Adapter
         if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL)
         {
             auto allocInfo = static_cast<Dml::AllocationInfo*>(m_tensorData);
-            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource(), nullptr, nullptr);
+            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
         }
 
         auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData);
@@ -1716,8 +1716,6 @@ namespace Windows::AI::MachineLearning::Adapter
         }
     }
 
-    // TODO (pavignol): Fix once we go back to a single resource
-    /*
     void OpKernelContextWrapper::TransitionResourcesForOperatorIfRequired(bool isBeforeOp)
     {
         if (m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
@@ -1769,9 +1767,9 @@ namespace Windows::AI::MachineLearning::Adapter
                 }
             }
 
-            for (auto& tempAlloc : m_temporaryAbiAllocations)
+            for (auto& tempBuffer : m_temporaryBuffers)
             {
-                resourcesToTransition.push_back(tempAlloc.Get());
+                resourcesToTransition.push_back(tempBuffer.ResourceInUavState());
             }
 
             m_winmlProvider->TransitionResourcesForOperator(
@@ -1780,7 +1778,6 @@ namespace Windows::AI::MachineLearning::Adapter
                 resourcesToTransition.data());
         }
     }
-    */
 
     OpKernelContextWrapper::OpKernelContextWrapper(
         onnxruntime::OpKernelContext* context,
@@ -1828,13 +1825,10 @@ namespace Windows::AI::MachineLearning::Adapter
 
     void OpKernelContextWrapper::Close()
     {
-        // TODO (pavignol): Fix once we go back to a single resource
-        /*
         if (m_winmlProvider && m_winmlProvider->TransitionsRequiredForOperator(m_internalOperator))
         {
             TransitionResourcesForOperatorIfRequired(false);
         }
-        */
 
         for (auto& tensors : m_inputTensors)
         {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index 647e0a17d26df..85b6b197fe511 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -512,9 +512,7 @@ class OpKernelContextWrapper : public WRL::Base<IMLOperatorKernelContext, IMLOpe
 
  protected:
     void ClearTempAllocations();
-
-    // TODO (pavignol): Fix once we go back to a single resource
-    // void TransitionResourcesForOperatorIfRequired(bool isBeforeOp);
+    void TransitionResourcesForOperatorIfRequired(bool isBeforeOp);
 
     // Lifetime is managed by the caller and guaranteed to outlive this class
     onnxruntime::OpKernelContext* m_impl = nullptr;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index ed1a6ebe49171..1e3035648adcb 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -727,9 +727,28 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& bluesteinZChirpParams = dftParams.BluesteinZChirpParams;
 
         // Get resources
+        auto inputBufferRegion =  bluesteinZChirpParams.AFFTParams.ResourceLoopList.front().BufferRegion;
+        auto outputBufferRegion = bluesteinZChirpParams.AFFTInverseParams.ResourceLoopList[bluesteinZChirpParams.AFFTInverseParams.OutputIndex].BufferRegion;
         auto zChirpBufferRegion = bluesteinZChirpParams.ZChirp.BufferRegion;
         auto bBufferRegion = bluesteinZChirpParams.B.BufferRegion;
 
+        // Transition resources from common to UAV state
+        D3D12_RESOURCE_BARRIER barriers[2];
+
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        commandList->ResourceBarrier(2, barriers);
+
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_bluesteinChirpRootSignature.Get());
         commandList->SetPipelineState(m_bluesteinChirpPipelineState.Get());
@@ -764,6 +783,21 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         chirpLength *= (m_isInverse ? 1 : -1);
         float scale = isInverse ? 1.f / dftParams.DFTLength : 1.f;
         StockhamFFT(fft_params, true,  chirpLength, scale, commandList);
+
+        // Transition resources to common state
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        commandList->ResourceBarrier(2, barriers);
     }
 
     void StockhamFFT(
@@ -779,8 +813,27 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         const auto& loopList = stockhamParams.ResourceLoopList;
 
         // Get input and output resources
+        auto inputBufferRegion = loopList[0].BufferRegion;
+        auto outputBufferRegion = loopList[stockhamParams.OutputIndex].BufferRegion;
         auto windowBufferRegion = dftParams.StockhamParams.Window.BufferRegion;
 
+        // Transition resources from common to UAV state
+        D3D12_RESOURCE_BARRIER barriers[2];
+
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        commandList->ResourceBarrier(2, barriers);
+
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_stockhamFFTRootSignature.Get());
         commandList->SetPipelineState(m_stockhamFFTPipelineState.Get());
@@ -822,6 +875,21 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             std::array<Dml::D3D12BufferRegion, 3> uav_resources = { in, out, window };
             Dispatch(uav_resources, constants, commandList);
         }
+
+        // Transition resources to common state
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        commandList->ResourceBarrier(2, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -838,6 +906,14 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
+        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
+
+        std::transform(
+            bufferRegions.begin(), bufferRegions.end(),
+            uav_barriers,
+            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
+        commandList->ResourceBarrier(TSize, uav_barriers);
+
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -886,10 +962,7 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        commandList->ResourceBarrier(TSize, uav_barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
index 8863bd5362d27..0611c4b7bf7f7 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
@@ -683,6 +683,29 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         Dml::GetDescendingPackedStrides(gridDims, gridStrides);
         Dml::GetDescendingPackedStrides(outputDims, outputStrides);
 
+        // Transition resources from common to UAV state
+        D3D12_RESOURCE_BARRIER barriers[3];
+
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            gridBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_COMMON,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS
+        );
+
+        commandList->ResourceBarrier(3, barriers);
+
         // Set the root signature and pipeline state
         commandList->SetComputeRootSignature(m_gridSampleRootSignature.Get());
         commandList->SetPipelineState(m_gridSamplePipelineState.Get());
@@ -704,10 +727,26 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         std::array<Dml::D3D12BufferRegion, 3> uavBufferRegions = { inputBufferRegion, gridBufferRegion, outputBufferRegion };
         Dispatch(uavBufferRegions, constants, commandList);
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        // Transition resources to common state
+        barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
+            inputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
+            gridBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
+            outputBufferRegion.ResourceInUavState(),
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            D3D12_RESOURCE_STATE_COMMON
+        );
+
+        commandList->ResourceBarrier(3, barriers);
     }
 
     std::vector<uint32_t> GetTensorDimensions(IMLOperatorTensor* tensor)
@@ -724,6 +763,14 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         TConstants& constants,
         ID3D12GraphicsCommandList* commandList)
     {
+        D3D12_RESOURCE_BARRIER uav_barriers[TSize];
+
+        std::transform(
+            bufferRegions.begin(), bufferRegions.end(),
+            uav_barriers,
+            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
+        commandList->ResourceBarrier(TSize, uav_barriers);
+
         for (uint32_t i = 0; i < TSize; i++)
         {
             // Set resource views
@@ -772,10 +819,7 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
             commandList->Dispatch(dispatchSizeX, 1, 1);
         }
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        commandList->ResourceBarrier(TSize, uav_barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
index cd1f78e2a23a6..945b58965cf2f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
@@ -405,9 +405,15 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
         std::array<DML_BINDING_DESC, 2> inputBindings;
         uint32_t inputBindingsCount = 1;
 
+        // NOTE: avoiding std::array for barriers to avoid buggy code analysis thinking
+        // barrierCount is outside the valid range.
+        D3D12_RESOURCE_BARRIER barriers[3];
+        uint32_t barrierCount = 0;
+
         Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal);
         inputBuffers[0] = signalBufferRegion.GetBufferBinding();
         inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] };
+        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         Dml::D3D12BufferRegion windowBufferRegion;
         if (m_framingOperator.hasWindowTensor)
@@ -415,6 +421,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window);
             inputBuffers[1] = windowBufferRegion.GetBufferBinding();
             inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] };
+            barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
             inputBindingsCount++;
         }
 
@@ -422,6 +429,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
 
         DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding();
         DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer };
+        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         m_framingOperator.bindingTable->BindOutputs(1, &outputBinding);
 
@@ -443,16 +451,22 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             m_framingOperator.bindingTable->BindPersistentResource(&bindingDesc);
         }
 
+        // Transition resources COMMON -> UAV
+        commandList->ResourceBarrier(barrierCount, barriers);
+
         m_framingOperator.commandRecorder->RecordDispatch(
             commandList,
             m_framingOperator.op.Get(),
             m_framingOperator.bindingTable.Get()
         );
 
-        D3D12_RESOURCE_BARRIER barriers[] = {
-            CD3DX12_RESOURCE_BARRIER::UAV(nullptr),
-            CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr)};
-        commandList->ResourceBarrier(2, barriers);
+        // Transition resources UAV -> COMMON
+        for (uint32_t barrierIndex = 0; barrierIndex < barrierCount; barrierIndex++)
+        {
+            std::swap(barriers[barrierIndex].Transition.StateBefore, barriers[barrierIndex].Transition.StateAfter);
+        }
+
+        commandList->ResourceBarrier(barrierCount, barriers);
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
index a91886c3b5863..5bb04ba4d30b5 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
@@ -106,7 +106,7 @@ namespace Dml
         gsl::span<const uint32_t > dstSizes,
         gsl::span<ID3D12Resource*> src,
         gsl::span<uint64_t> srcOffsets,
-        gsl::span<const D3D12_RESOURCE_STATES> srcStates)
+        D3D12_RESOURCE_STATES srcState)
     {
         assert(dst.size() == src.size());
         assert(dstSizes.size() == src.size());
@@ -134,7 +134,7 @@ namespace Dml
                 D3D12_RESOURCE_STATE_COPY_DEST,
                 src[i],
                 srcOffsets[i],
-                srcStates[i],
+                srcState,
                 dstSizes[i]);
 
             offset += dstSizes[i];
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
index f888f0a55ac48..4a65ce899d791 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
@@ -28,7 +28,7 @@ namespace Dml
             gsl::span<const uint32_t > dstSizes,
             gsl::span<ID3D12Resource*> src,
             gsl::span<uint64_t> srcOffsets,
-            gsl::span<const D3D12_RESOURCE_STATES> srcStates);
+            D3D12_RESOURCE_STATES srcState);
 
     private:
         void EnsureReadbackHeap(size_t size);

From 7440e74dd34f5841b0860f42f4a0f02c0ca01e98 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 12 Jul 2023 13:23:44 -0700
Subject: [PATCH 46/76] Fix mish test failure

---
 .../src/DmlHeapAllocation.h                   |  11 --
 .../src/DmlReservedResourceSubAllocator.cpp   | 107 ++++-------
 .../src/ExecutionContext.cpp                  | 169 +++++++++---------
 3 files changed, 120 insertions(+), 167 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h
index 6de78a47b6d8b..ab75b7d322120 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h
@@ -13,17 +13,6 @@ namespace Dml
         // an allocation may comprise multiple heaps. If tiling is not supported
         // an allocation will only have a single heap.
         std::vector<Microsoft::WRL::ComPtr<ID3D12Heap>> heaps;
-
-        // Resources created over this allocation's heaps. All three resources
-        // are identical aside from being fixed in a single resource state: UAV,
-        // COPY_SRC, and COPY_DST respectively. The purpose of duplicate
-        // resources is to enable overlapping resources in different states for
-        // copying data. Most callers will not (and should not) interact
-        // directly with these resources; all three are wrapped by the buffer
-        // regions returned from this allocator, and the appropriate resource
-        // will be used automatically when performing buffer copies.
         Microsoft::WRL::ComPtr<ID3D12Resource> resource_uav_state;
-        Microsoft::WRL::ComPtr<ID3D12Resource> resource_copy_src_state;
-        Microsoft::WRL::ComPtr<ID3D12Resource> resource_copy_dst_state;
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index 0dc07384ea905..bc9a0b15e86fe 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -95,41 +95,24 @@ namespace Dml
 
         // The allocation may be larger than the requested size to ensure a whole
         // number of tiles.
-        const uint64_t resource_size_in_tiles =
-            1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-        const uint64_t resource_size_in_bytes =
-            resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-        auto resource_desc =
-            CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_);
+        const uint64_t resource_size_in_tiles = 1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+        const uint64_t resource_size_in_bytes = resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+        auto resource_desc = CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_);
 
-        ID3D12Resource** resources[] = {
-            &allocation.resource_uav_state,
-            &allocation.resource_copy_src_state,
-            &allocation.resource_copy_dst_state};
-
-        D3D12_RESOURCE_STATES states[] = {
+        HRESULT create_resource_hr = m_device->CreateReservedResource(
+            &resource_desc,
             initial_state_,
-            D3D12_RESOURCE_STATE_COPY_SOURCE,
-            D3D12_RESOURCE_STATE_COPY_DEST};
+            nullptr,
+            IID_PPV_ARGS(&allocation.resource_uav_state));
 
-        for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++)
+        if (create_resource_hr == E_OUTOFMEMORY)
         {
-            HRESULT create_resource_hr = m_device->CreateReservedResource(
-                &resource_desc,
-                states[i],
-                nullptr,
-                IID_PPV_ARGS(resources[i]));
-
-            if (create_resource_hr == E_OUTOFMEMORY)
-            {
-                return absl::nullopt;
-            }
-            ORT_THROW_IF_FAILED(create_resource_hr);
+            return absl::nullopt;
         }
+        ORT_THROW_IF_FAILED(create_resource_hr);
 
         // Reserve enough heaps to store all tiles in the resource.
-        const uint64_t heap_count =
-            1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_;
+        const uint64_t heap_count = 1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_;
         allocation.heaps.resize(heap_count);
 
         // Create heaps and map them to the primary reserved resource.
@@ -175,28 +158,17 @@ namespace Dml
             // guaranteed to be set (on the GPU timeline) by the time any code can
             // reference the returned resource. We only execute operations on a
             // single hardware queue so there is no need to wait or signal.
-            //
-            // All resources have identical tile mappings. The repeated call to
-            // UpdateTileMappings on all resources instead of using CopyTileMappings
-            // is intentional: the latter API is not supported by all versions of
-            // PIX.
-            for (auto resource :
-                {allocation.resource_uav_state.Get(),
-                allocation.resource_copy_src_state.Get(),
-                allocation.resource_copy_dst_state.Get()})
-            {
-                queue_->UpdateTileMappings(
-                    resource,
-                    numResourceRegions,
-                    &resource_region_start_coordinates,
-                    &resource_region_size,
-                    allocation.heaps[i].Get(),
-                    numHeapRanges,
-                    &tile_range_flags,
-                    &heap_range_start_offset,
-                    &heap_range_tile_count,
-                    D3D12_TILE_MAPPING_FLAG_NONE);
-            }
+            queue_->UpdateTileMappings(
+                allocation.resource_uav_state.Get(),
+                numResourceRegions,
+                &resource_region_start_coordinates,
+                &resource_region_size,
+                allocation.heaps[i].Get(),
+                numHeapRanges,
+                &tile_range_flags,
+                &heap_range_start_offset,
+                &heap_range_tile_count,
+                D3D12_TILE_MAPPING_FLAG_NONE);
 
             resource_region_start_coordinates.X += static_cast<uint32_t>(heap_size_in_tiles);
             unmapped_resource_tiles -= heap_size_in_tiles;
@@ -225,33 +197,20 @@ namespace Dml
         }
 
         // Create large placed resource that spans the heap.
-        D3D12_RESOURCE_DESC resource_desc =
-            CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_);
-
-        ID3D12Resource** resources[] = {
-            &allocation.resource_uav_state,
-            &allocation.resource_copy_src_state,
-            &allocation.resource_copy_dst_state};
-        D3D12_RESOURCE_STATES states[] = {
-            initial_state_,
-            D3D12_RESOURCE_STATE_COPY_SOURCE,
-            D3D12_RESOURCE_STATE_COPY_DEST};
+        D3D12_RESOURCE_DESC resource_desc = CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_);
 
-        for (int i = 0; i < ABSL_ARRAYSIZE(resources); i++)
+        HRESULT create_resource_hr = m_device->CreatePlacedResource(
+            allocation.heaps.front().Get(),
+            0,
+            &resource_desc,
+            initial_state_,
+            nullptr,
+            IID_PPV_ARGS(&allocation.resource_uav_state));
+        if (create_resource_hr == E_OUTOFMEMORY)
         {
-            HRESULT create_resource_hr = m_device->CreatePlacedResource(
-                allocation.heaps.front().Get(),
-                0,
-                &resource_desc,
-                states[i],
-                nullptr,
-                IID_PPV_ARGS(resources[i]));
-            if (create_resource_hr == E_OUTOFMEMORY)
-            {
-                return absl::nullopt;
-            }
-            ORT_THROW_IF_FAILED(create_resource_hr);
+            return absl::nullopt;
         }
+        ORT_THROW_IF_FAILED(create_resource_hr);
 
         return allocation;
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index b5492a1a86ea3..9a180f64e49db 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -37,104 +37,109 @@ namespace Dml
 
         SetCommandRecorder(&m_dmlRecorder);
 
-        // This type of copy is not common and is only used in rare circumstances. Because a resource
-        // cannot be both in a source and destination state at the same time (without aliasing), we copy
-        // the source resource to an intermediate one, and then copy the intermediate resource to the
-        // destination resource.
-        // TODO (pavignol): Only do the intermediate copy when both resources at the same
-
-        D3D12_HEAP_PROPERTIES heapProperties = {
-            D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0};
-
-        D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER,
-                                            0,
-                                            byteCount,
-                                            1,
-                                            1,
-                                            1,
-                                            DXGI_FORMAT_UNKNOWN,
-                                            {1, 0},
-                                            D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
-                                            D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS};
-
-        ComPtr<ID3D12Resource> intermediateBuffer;
-        ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommittedResource(
-            &heapProperties,
-            D3D12_HEAP_FLAG_NONE,
-            &resourceDesc,
-            D3D12_RESOURCE_STATE_COPY_DEST,
-            nullptr,
-            IID_GRAPHICS_PPV_ARGS(intermediateBuffer.GetAddressOf())));
-
-        std::vector<D3D12_RESOURCE_BARRIER> barriers;
-
-        if (!(srcState & D3D12_RESOURCE_STATE_COPY_SOURCE))
+        if (dstBuffer == srcBuffer)
         {
-            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(srcBuffer, srcState, D3D12_RESOURCE_STATE_COPY_SOURCE));
-            m_dmlRecorder.ResourceBarrier(barriers);
-        }
-
-        m_dmlRecorder.CopyBufferRegion(intermediateBuffer.Get(), 0, srcBuffer, srcOffset, byteCount);
-
-        // Reset src barrier state
-        for (auto& barrier : barriers)
-        {
-            std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter);
-        }
-
-        barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(intermediateBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE));
+            // This type of copy is not common and is only used in rare circumstances. Because a resource
+            // cannot be both in a source and destination state at the same time (without aliasing), we copy
+            // the source resource to an intermediate one, and then copy the intermediate resource to the
+            // destination resource.
+            // TODO (pavignol): Only do the intermediate copy when both resources at the same
+
+            D3D12_HEAP_PROPERTIES heapProperties = {
+                D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0};
+
+            D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER,
+                                                0,
+                                                byteCount,
+                                                1,
+                                                1,
+                                                1,
+                                                DXGI_FORMAT_UNKNOWN,
+                                                {1, 0},
+                                                D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
+                                                D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS};
+
+            ComPtr<ID3D12Resource> intermediateBuffer;
+            ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommittedResource(
+                &heapProperties,
+                D3D12_HEAP_FLAG_NONE,
+                &resourceDesc,
+                D3D12_RESOURCE_STATE_COPY_DEST,
+                nullptr,
+                IID_GRAPHICS_PPV_ARGS(intermediateBuffer.GetAddressOf())));
+
+            std::vector<D3D12_RESOURCE_BARRIER> barriers;
+
+            if (!(srcState & D3D12_RESOURCE_STATE_COPY_SOURCE))
+            {
+                barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(srcBuffer, srcState, D3D12_RESOURCE_STATE_COPY_SOURCE));
+                m_dmlRecorder.ResourceBarrier(barriers);
+            }
 
-        if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST))
-        {
-            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, dstState, D3D12_RESOURCE_STATE_COPY_DEST));
-        }
+            m_dmlRecorder.CopyBufferRegion(intermediateBuffer.Get(), 0, srcBuffer, srcOffset, byteCount);
 
-        m_dmlRecorder.ResourceBarrier(barriers);
-        m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, intermediateBuffer.Get(), 0, byteCount);
+            // Reset src barrier state
+            for (auto& barrier : barriers)
+            {
+                std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter);
+            }
 
-        barriers.clear();
+            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(intermediateBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE));
 
-        // Reset dst barrier state
-        if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST))
-        {
-            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState));
-        }
+            if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST))
+            {
+                barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, dstState, D3D12_RESOURCE_STATE_COPY_DEST));
+            }
 
-        m_dmlRecorder.ResourceBarrier(barriers);
+            m_dmlRecorder.ResourceBarrier(barriers);
+            m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, intermediateBuffer.Get(), 0, byteCount);
 
-        // Keep the intermediate buffer alive until we're done with it
-        QueueReference(intermediateBuffer.Get());
+            barriers.clear();
 
+            // Reset dst barrier state
+            if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST))
+            {
+                barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState));
+            }
 
-/*
-        std::vector<D3D12_RESOURCE_BARRIER> barriers;
+            // Since this copy may write to GPU memory, we also need to perform an aliasing barrier
+            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
+            m_dmlRecorder.ResourceBarrier(barriers);
 
-        if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST))
-        {
-            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, dstState, D3D12_RESOURCE_STATE_COPY_DEST));
+            // Keep the intermediate buffer alive until we're done with it
+            QueueReference(intermediateBuffer.Get());
         }
-        if (!(srcState & D3D12_RESOURCE_STATE_COPY_SOURCE))
+        else
         {
-            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(srcBuffer, srcState, D3D12_RESOURCE_STATE_COPY_SOURCE));
-        }
+            std::vector<D3D12_RESOURCE_BARRIER> barriers;
 
-        if (!barriers.empty())
-        {
-            m_dmlRecorder.ResourceBarrier(barriers);
-        }
+            if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST))
+            {
+                barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, dstState, D3D12_RESOURCE_STATE_COPY_DEST));
+            }
+            if (!(srcState & D3D12_RESOURCE_STATE_COPY_SOURCE))
+            {
+                barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(srcBuffer, srcState, D3D12_RESOURCE_STATE_COPY_SOURCE));
+            }
 
-        m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount);
+            if (!barriers.empty())
+            {
+                m_dmlRecorder.ResourceBarrier(barriers);
+            }
 
-        // Reset barrier state
-        for (auto& barrier : barriers)
-        {
-            std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter);
-        }
+            m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, srcBuffer, srcOffset, byteCount);
 
-        // Since this copy may write to GPU memory, we also need to perform an
-        // aliasing barrier
-        m_dmlRecorder.ResourceBarrier(barriers);
-*/
+            // Reset barrier state
+            for (auto& barrier : barriers)
+            {
+                std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter);
+            }
+
+            // Since this copy may write to GPU memory, we also need to perform an
+            // aliasing barrier
+            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
+            m_dmlRecorder.ResourceBarrier(barriers);
+        }
     }
 
     void ExecutionContext::FillBufferWithPattern(

From b2e65fc3479883a8a5371ac9b13d14b4a9343fec Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 12 Jul 2023 15:31:33 -0700
Subject: [PATCH 47/76] Remove rest of Aliasing

---
 .../src/ExecutionContext.cpp                     | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index 9a180f64e49db..4ff464e0eef42 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -94,18 +94,14 @@ namespace Dml
             m_dmlRecorder.ResourceBarrier(barriers);
             m_dmlRecorder.CopyBufferRegion(dstBuffer, dstOffset, intermediateBuffer.Get(), 0, byteCount);
 
-            barriers.clear();
-
             // Reset dst barrier state
             if (!(dstState & D3D12_RESOURCE_STATE_COPY_DEST))
             {
+                barriers.clear();
                 barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(dstBuffer, D3D12_RESOURCE_STATE_COPY_DEST, dstState));
+                m_dmlRecorder.ResourceBarrier(barriers);
             }
 
-            // Since this copy may write to GPU memory, we also need to perform an aliasing barrier
-            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
-            m_dmlRecorder.ResourceBarrier(barriers);
-
             // Keep the intermediate buffer alive until we're done with it
             QueueReference(intermediateBuffer.Get());
         }
@@ -135,10 +131,10 @@ namespace Dml
                 std::swap(barrier.Transition.StateBefore, barrier.Transition.StateAfter);
             }
 
-            // Since this copy may write to GPU memory, we also need to perform an
-            // aliasing barrier
-            barriers.push_back(CD3DX12_RESOURCE_BARRIER::Aliasing(nullptr, nullptr));
-            m_dmlRecorder.ResourceBarrier(barriers);
+            if (!barriers.empty())
+            {
+                m_dmlRecorder.ResourceBarrier(barriers);
+            }
         }
     }
 

From d5be4f14154583497d0f33b9c97b61f90ab96219 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 12 Jul 2023 23:46:59 -0700
Subject: [PATCH 48/76] Add BFC allocator

---
 .../inc/DmlExecutionProvider.h                |   3 +-
 .../inc/IWinmlExecutionProvider.h             |   6 +-
 .../src/AbiCustomRegistry.cpp                 |   6 +
 .../src/AbiCustomRegistry.h                   |  12 +-
 .../src/BucketizedBufferAllocator.cpp         | 250 ++++++++++++++++++
 .../src/BucketizedBufferAllocator.h           |  90 +++++++
 .../src/DmlAllocationInfo.cpp                 |   3 +-
 .../src/DmlAllocationInfo.h                   |  20 +-
 .../DmlExecutionProvider/src/DmlBuffer.cpp    |   3 +-
 .../src/DmlCommittedResourceWrapper.h         |   2 +-
 .../src/DmlGpuAllocator.cpp                   |  83 +++++-
 .../src/DmlGpuAllocator.h                     |  26 +-
 .../src/DmlGraphFusionHelper.cpp              |   7 +-
 .../src/DmlReservedResourceSubAllocator.cpp   |  23 +-
 .../src/DmlReservedResourceSubAllocator.h     |  11 +-
 .../src/DmlReservedResourceWrapper.h          |   2 +-
 .../src/DmlResourceWrapper.h                  |   2 +-
 .../src/DmlSubAllocator.h                     |  15 ++
 .../src/ExecutionProvider.cpp                 |  41 ++-
 .../src/ExecutionProvider.h                   |  14 +-
 .../src/MLOperatorAuthorImpl.cpp              |   5 +-
 .../MLOperatorAuthorPrivate.h                 |   2 +
 .../providers/dml/dml_provider_factory.cc     |   6 +-
 23 files changed, 563 insertions(+), 69 deletions(-)
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
index 5f9c4394e740f..ef012855770f3 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
@@ -31,10 +31,11 @@ namespace Dml
         ID3D12CommandQueue* commandQueue,
         bool enableMetacommands = true);
 
-    D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer, uint64_t size_in_bytes);
+    D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t size_in_bytes);
     void FlushContext(onnxruntime::IExecutionProvider* provider);
     void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode);
     void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);
+    void DisableBfcAllocator(onnxruntime::IExecutionProvider* provider);
 
     onnxruntime::common::Status CopyTensor(
         onnxruntime::IExecutionProvider* provider,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
index 0daee39da337b..9f4ec88083b7c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h
@@ -40,11 +40,9 @@ namespace Windows::AI::MachineLearning::Adapter
         // the provider's underlying queues.
         virtual void QueueReference(IUnknown *object) = 0;
 
-        virtual Dml::D3D12BufferRegion GetBufferRegion(const Dml::TaggedPointer& taggedPointer, uint64_t size) const = 0;
+        virtual Dml::D3D12BufferRegion GetBufferRegion(void* opaquePointer, uint64_t size) const = 0;
 
-        virtual uint64_t TryGetPooledAllocationId(
-            const Dml::TaggedPointer& taggedPointer,
-            bool isInternalOperator) = 0;
+        virtual uint64_t GetUniqueId(void* opaquePointer) = 0;
 
         virtual void GetABIExecutionInterfaceAndInvalidateState(
             bool isInternalOperator,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp
index ede3e7f2c2257..d3de2fbfe31d3 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp
@@ -561,6 +561,7 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel(
         //
         // For backward compatibility, this does not propagate errors for external operators
         static_cast<void>(m_kernelRegistry->RegisterCustomKernel(create_info));  // ignore result
+        m_hasExternalOperators = true;
     }
 
     return S_OK;
@@ -568,4 +569,9 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel(
     ORT_CATCH_RETURN
 }
 
+bool STDMETHODCALLTYPE AbiCustomRegistry::HasExternalOperators() const noexcept
+{
+    return m_hasExternalOperators;
+}
+
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.h
index d6b1448b559b1..926eb02b44918 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.h
@@ -15,7 +15,7 @@ namespace WRL
 }
 
 namespace Windows::AI::MachineLearning::Adapter
-{ 
+{
 
 using namespace Microsoft::WRL;
 
@@ -49,6 +49,8 @@ class AbiCustomRegistry : public WRL::Base<IMLOperatorRegistry, IMLOperatorRegis
         IMLOperatorKernelFactory* operatorKernelFactory,
         _In_opt_ IMLOperatorShapeInferrer* shapeInferrer) const noexcept override;
 
+    bool STDMETHODCALLTYPE HasExternalOperators() const noexcept override;
+
     std::list<std::shared_ptr<onnxruntime::CustomRegistry>> GetRegistries()
     {
         std::list<std::shared_ptr<onnxruntime::CustomRegistry>> registries;
@@ -56,7 +58,7 @@ class AbiCustomRegistry : public WRL::Base<IMLOperatorRegistry, IMLOperatorRegis
         {
             registries.push_back(registry.second);
         }
-        
+
         registries.push_back(m_kernelRegistry);
 
         return registries;
@@ -86,7 +88,7 @@ class AbiCustomRegistry : public WRL::Base<IMLOperatorRegistry, IMLOperatorRegis
 
  private:
     static onnx::OpSchema ConvertOpSchema(
-        _In_z_ const char* domain, 
+        _In_z_ const char* domain,
         const MLOperatorSchemaDescription& abiSchema,
         IMLOperatorTypeInferrer* typeInferrer,
         IMLOperatorShapeInferrer* shapeInferrer);
@@ -94,7 +96,7 @@ class AbiCustomRegistry : public WRL::Base<IMLOperatorRegistry, IMLOperatorRegis
     static std::string ConvertFormalParameterType(const MLOperatorSchemaEdgeDescription& formalParameter);
     static onnx::OpSchema::FormalParameterOption ConvertFormalParameterOption(MLOperatorParameterOptions options);
     static void SetAttributesAndDefaults(onnx::OpSchema& schema, const MLOperatorSchemaDescription& abiSchema);
-    
+
     static AttributeMap GetDefaultAttributes(const MLOperatorKernelDescription* opKernel);
 
     std::shared_ptr<onnxruntime::CustomRegistry> m_kernelRegistry;
@@ -107,6 +109,8 @@ class AbiCustomRegistry : public WRL::Base<IMLOperatorRegistry, IMLOperatorRegis
     // Map between Lotus KernelDefs and extended data used during partitioning
     mutable std::shared_ptr<InternalRegistrationInfoMap> m_internalRegInfoMap;
 
+    mutable bool m_hasExternalOperators = false;
+
 };
 
 }    // namespace Windows::AI::MachineLearning::Adapter
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
new file mode 100644
index 0000000000000..c071e4bf0b8d3
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -0,0 +1,250 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "precomp.h"
+
+#include "core/session/onnxruntime_c_api.h"
+
+#include "BucketizedBufferAllocator.h"
+#include "DmlAllocationInfo.h"
+#include "DmlCommittedResourceWrapper.h"
+// #define PRINT_OUTSTANDING_ALLOCATIONS
+
+namespace Dml
+{
+    BucketizedBufferAllocator::~BucketizedBufferAllocator()
+    {
+#ifdef PRINT_OUTSTANDING_ALLOCATIONS
+        if (!m_outstandingAllocationsById.empty())
+        {
+            printf("BucketizedBufferAllocator outstanding allocation indices:\n");
+            for (const auto& entry : m_outstandingAllocationsById)
+            {
+                printf("%zu\n", entry.first); // Keys are size_t allocation ids; %zu prints them without truncation
+            }
+            printf("\n");
+        }
+#endif
+    }
+
+    BucketizedBufferAllocator::BucketizedBufferAllocator(
+        ID3D12Device* device,
+        std::shared_ptr<ExecutionContext> context,
+        const D3D12_HEAP_PROPERTIES& heapProps,
+        D3D12_HEAP_FLAGS heapFlags,
+        D3D12_RESOURCE_FLAGS resourceFlags,
+        D3D12_RESOURCE_STATES initialState)
+        : onnxruntime::IAllocator(
+            OrtMemoryInfo(
+                "DML",
+                OrtAllocatorType::OrtDeviceAllocator,
+                OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)
+            )
+        ),
+        m_device(device),
+        m_heapProperties(heapProps),
+        m_heapFlags(heapFlags),
+        m_resourceFlags(resourceFlags),
+        m_initialState(initialState),
+        m_context(context)
+    {
+    }
+
+    /*static*/ gsl::index BucketizedBufferAllocator::GetBucketIndexFromSize(uint64_t size)
+    {
+        assert(size != 0);
+
+        // Each bucket is twice as large as the previous one, in ascending order
+        gsl::index index = static_cast<gsl::index>(ceil(log2(size)));
+        assert((1ull << index) >= size); // This must be true unless there were some strange rounding issues
+
+        // The smallest bucket is 2^n bytes large, where n = c_minResourceSizeExponent
+        index = std::max<gsl::index>(index, c_minResourceSizeExponent);
+        index -= c_minResourceSizeExponent;
+
+        return index;
+    }
+
+    /*static*/ uint64_t BucketizedBufferAllocator::GetBucketSizeFromIndex(gsl::index index)
+    {
+        return (1ull << (index + c_minResourceSizeExponent));
+    }
+
+    ComPtr<DmlResourceWrapper> BucketizedBufferAllocator::AllocCommittedResource(size_t size)
+    {
+        ComPtr<ID3D12Resource> resource;
+        auto buffer = CD3DX12_RESOURCE_DESC::Buffer(size, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
+        ORT_THROW_IF_FAILED(m_device->CreateCommittedResource(
+            &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
+            D3D12_HEAP_FLAG_NONE,
+            &buffer,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            nullptr,
+            IID_GRAPHICS_PPV_ARGS(resource.GetAddressOf())
+        ));
+
+        ComPtr<DmlResourceWrapper> resourceWrapper;
+        wil::MakeOrThrow<DmlCommittedResourceWrapper>(std::move(resource)).As(&resourceWrapper);
+        return resourceWrapper;
+    }
+
+    AllocationInfo* BucketizedBufferAllocator::GetAllocationInfo(void* opaquePointer)
+    {
+        return static_cast<AllocationInfo*>(opaquePointer);
+    }
+
+    D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) const
+    {
+        auto allocationInfo = static_cast<AllocationInfo*>(opaquePointer);
+        return D3D12BufferRegion(0, size_in_bytes, allocationInfo->GetD3D12Resource());
+    }
+
+    void* BucketizedBufferAllocator::Alloc(size_t size)
+    {
+        // For some reason lotus likes requesting 0 bytes of memory
+        size = std::max<size_t>(1, size);
+
+        ComPtr<DmlResourceWrapper> resourceWrapper;
+        uint64_t resourceId = 0;
+        uint64_t bucketSize = 0;
+
+        // Use a pooled resource if the size (post rounding, if requested) matches a bucket size
+        if (m_defaultRoundingMode == AllocatorRoundingMode::Enabled || size == GetBucketSizeFromIndex(GetBucketIndexFromSize(size)))
+        {
+            Bucket* bucket = nullptr;
+
+            // Find the bucket for this allocation size
+            gsl::index bucketIndex = GetBucketIndexFromSize(size);
+
+            if (gsl::narrow_cast<gsl::index>(m_pool.size()) <= bucketIndex)
+            {
+                // Ensure there are sufficient buckets
+                m_pool.resize(bucketIndex + 1);
+            }
+
+            bucket = &m_pool[bucketIndex];
+            bucketSize = GetBucketSizeFromIndex(bucketIndex);
+
+            if (bucket->resources.empty())
+            {
+                // No more resources in this bucket - allocate a new one
+                resourceWrapper = AllocCommittedResource(onnxruntime::narrow<size_t>(bucketSize));
+                resourceId = ++m_currentResourceId;
+            }
+            else
+            {
+                // Retrieve a resource from the bucket
+                resourceWrapper = std::move(bucket->resources.back().resource);
+                resourceId = bucket->resources.back().resourceId;
+                bucket->resources.pop_back();
+            }
+        }
+        else
+        {
+            // The allocation will not be pooled.  Construct a new one
+            bucketSize = (size + 3) & ~3;
+            resourceWrapper = AllocCommittedResource(onnxruntime::narrow<size_t>(bucketSize));
+            resourceId = ++m_currentResourceId;
+        }
+
+        assert(resourceWrapper != nullptr); // Check for null before the width check below dereferences the wrapper
+        assert(resourceWrapper->GetD3D12Resource()->GetDesc().Width == bucketSize);
+
+        ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
+            this,
+            ++m_currentAllocationId,
+            resourceId,
+            resourceWrapper.Get(),
+            size
+        );
+
+    #if _DEBUG
+        m_outstandingAllocationsById[allocInfo->GetId()] = allocInfo.Get();
+    #endif
+
+        return allocInfo.Detach(); // The caller now holds this reference; Free() re-attaches and releases it
+    }
+
+    void BucketizedBufferAllocator::Free(void* p)
+    {
+        // Release Lotus's reference on the allocation.  The allocation
+        // also inherits IUnknown, and once its final reference reaches zero
+        // it will call FreeResource
+        ComPtr<AllocationInfo> allocInfo;
+        allocInfo.Attach(static_cast<AllocationInfo*>(p));
+    }
+
+    uint64_t BucketizedBufferAllocator::GetUniqueId(void* opaquePointer)
+    {
+        const auto* allocInfo = static_cast<const AllocationInfo*>(opaquePointer);
+        return allocInfo->GetPooledResourceId();
+    }
+
+    void BucketizedBufferAllocator::FreeResource(AllocationInfo* allocInfo, uint64_t pooledResourceId)
+    {
+        assert(allocInfo != nullptr); // Can't free nullptr
+
+        if (allocInfo->GetOwner() != this)
+        {
+            // This allocation doesn't belong to this allocator!
+            ORT_THROW_HR(E_INVALIDARG);
+        }
+
+        // Free the resource to the pool if its size matches a bucket size
+        gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize());
+        if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetD3D12Resource()->GetDesc().Width)
+        {
+            assert(gsl::narrow_cast<gsl::index>(m_pool.size()) > bucketIndex);
+
+            // Return the resource to the bucket
+            Bucket* bucket = &m_pool[bucketIndex];
+
+            Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId};
+            bucket->resources.push_back(resource);
+        }
+        else
+        {
+            // Free the underlying allocation once queued work has completed.
+#ifdef _GAMING_XBOX
+            m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetD3D12Resource()).Get());
+#else
+            m_context->QueueReference(allocInfo->GetD3D12Resource());
+#endif
+            allocInfo->DetachResourceWrapper();
+        }
+
+    #if _DEBUG
+        assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo);
+        m_outstandingAllocationsById.erase(allocInfo->GetId());
+    #endif
+
+        // The allocation info is already destructing at this point
+    }
+
+
+    const AllocationInfo* BucketizedBufferAllocator::DecodeDataHandle(const void* opaqueHandle)
+    {
+        if (opaqueHandle == nullptr)
+        {
+            // There is no memory allocated which needs to be decoded.
+            ORT_THROW_HR(E_INVALIDARG);
+        }
+        const auto* allocInfo = static_cast<const AllocationInfo*>(opaqueHandle);
+
+        auto owner = allocInfo->GetOwner();
+        //The owner can be null if the resource was wrapped via CreateGPUAllocationFromD3DResource
+        if (owner != nullptr && owner != this)
+        {
+            // This allocation doesn't belong to this allocator!
+            ORT_THROW_HR(E_INVALIDARG);
+        }
+
+        return allocInfo;
+    }
+
+    void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
+    {
+        m_defaultRoundingMode = roundingMode;
+    }
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
new file mode 100644
index 0000000000000..f0fc570d4e1c4
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -0,0 +1,90 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/allocator.h"
+#include "ExecutionContext.h"
+#include "DmlResourceWrapper.h"
+#include "DmlSubAllocator.h"
+
+namespace Dml
+{
+    class BucketizedBufferAllocator;
+
+    // Implements a Lotus allocator for D3D12 heap buffers, using a bucket allocation strategy. The allocator
+    // maintains a set of fixed-size buckets, with each bucket containing one or more D3D12 buffers of that fixed size.
+    // All requested allocation sizes are rounded up to the nearest bucket size, which ensures minimal fragmentation
+    // while providing an upper bound on the amount of memory "wasted" with each allocation.
+    class BucketizedBufferAllocator : public onnxruntime::IAllocator, public DmlSubAllocator
+    {
+    public:
+        ~BucketizedBufferAllocator();
+
+        // Constructs a BucketizedBufferAllocator which allocates D3D12 committed resources with the specified heap properties,
+        // resource flags, and initial resource state.
+        BucketizedBufferAllocator(
+            ID3D12Device* device,
+            std::shared_ptr<ExecutionContext> context,
+            const D3D12_HEAP_PROPERTIES& heapProps,
+            D3D12_HEAP_FLAGS heapFlags,
+            D3D12_RESOURCE_FLAGS resourceFlags,
+            D3D12_RESOURCE_STATES initialState);
+
+        ComPtr<DmlResourceWrapper> AllocCommittedResource(size_t size);
+
+        // Returns the information associated with an opaque allocation handle returned by IAllocator::Alloc.
+        const AllocationInfo* DecodeDataHandle(const void* opaqueHandle);
+
+        void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);
+
+        AllocationInfo* GetAllocationInfo(void* opaquePointer);
+        D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) const;
+        uint64_t GetUniqueId(void* opaquePointer);
+
+    public: // onnxruntime::IAllocator
+        void* Alloc(size_t size) final;
+        void Free(void* p) final;
+
+    private:
+        static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB
+
+        // The pool consists of a number of buckets, and each bucket contains a number of resources of the same size.
+        // The resources in each bucket are always sized as a power of two, and each bucket contains resources twice
+        // as large as the previous bucket.
+        struct Resource
+        {
+            ComPtr<DmlResourceWrapper> resource;
+            uint64_t resourceId;
+        };
+
+        struct Bucket
+        {
+            std::vector<Resource> resources;
+        };
+
+        static gsl::index GetBucketIndexFromSize(uint64_t size);
+        static uint64_t GetBucketSizeFromIndex(gsl::index index);
+
+        friend class AllocationInfo;
+        void FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) final;
+
+        ComPtr<ID3D12Device> m_device;
+        D3D12_HEAP_PROPERTIES m_heapProperties;
+        D3D12_HEAP_FLAGS m_heapFlags;
+        D3D12_RESOURCE_FLAGS m_resourceFlags;
+        D3D12_RESOURCE_STATES m_initialState;
+
+        std::vector<Bucket> m_pool;
+        size_t m_currentAllocationId = 0;
+        uint64_t m_currentResourceId = 0;
+        AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled;
+        std::shared_ptr<ExecutionContext> m_context;
+
+    #if _DEBUG
+        // Useful for debugging; keeps track of all allocations that haven't been freed yet
+        std::map<size_t, AllocationInfo*> m_outstandingAllocationsById;
+    #endif
+    };
+
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
index 52944d2c8b96a..5db1289778819 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.cpp
@@ -4,6 +4,7 @@
 #include "precomp.h"
 #include "DmlAllocationInfo.h"
 #include "DmlReservedResourceSubAllocator.h"
+#include "DmlSubAllocator.h"
 
 namespace Dml
 {
@@ -12,7 +13,7 @@ namespace Dml
     {
         if (m_owner)
         {
-            m_owner->FreeResource(this);
+            m_owner->FreeResource(this, m_pooledResourceId);
         }
     }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
index 7c11358bb106d..ee203ba47056e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
@@ -7,33 +7,35 @@
 
 namespace Dml
 {
-    class DmlReservedResourceSubAllocator;
+    class DmlSubAllocator;
 
     class AllocationInfo : public Microsoft::WRL::RuntimeClass<
         Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, IUnknown>
     {
     public:
         AllocationInfo(
-            DmlReservedResourceSubAllocator* owner,
+            DmlSubAllocator* owner,
             size_t id,
+            uint64_t pooledResourceId,
             DmlResourceWrapper* resourceWrapper,
             size_t requestedSize)
             : m_owner(owner)
             , m_allocationId(id)
+            , m_pooledResourceId(pooledResourceId)
             , m_resourceWrapper(resourceWrapper)
             , m_requestedSize(requestedSize)
         {}
 
         ~AllocationInfo();
 
-        DmlReservedResourceSubAllocator* GetOwner() const
+        DmlSubAllocator* GetOwner() const
         {
             return m_owner;
         }
 
-        ID3D12Resource* GetUavResource() const
+        ID3D12Resource* GetD3D12Resource() const
         {
-            return m_resourceWrapper->GetUavResource();
+            return m_resourceWrapper->GetD3D12Resource();
         }
 
         ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
@@ -51,9 +53,15 @@ namespace Dml
             return m_allocationId;
         }
 
+        uint64_t GetPooledResourceId() const
+        {
+            return m_pooledResourceId;
+        }
+
     private:
-        DmlReservedResourceSubAllocator* m_owner;
+        DmlSubAllocator* m_owner;
         size_t m_allocationId; // For debugging purposes
+        uint64_t m_pooledResourceId;
         Microsoft::WRL::ComPtr<DmlResourceWrapper> m_resourceWrapper;
 
         // The size requested during Alloc(), which may be smaller than the physical resource size
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
index 464ce26c16f54..0b670a22f9cbd 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
@@ -4,7 +4,6 @@
 #include "precomp.h"
 #include "DmlBuffer.h"
 #include "DmlGpuAllocator.h"
-#include "DmlTaggedPointer.h"
 
 namespace Dml
 {
@@ -15,7 +14,7 @@ namespace Dml
     m_opaqueData = allocator_->Alloc(size_in_bytes);
     ORT_THROW_HR_IF(E_OUTOFMEMORY, m_opaqueData == nullptr);
 
-    buffer_region_ = allocator_->CreateBufferRegion(TaggedPointer::Unpack(m_opaqueData), size_in_bytes);
+    buffer_region_ = allocator_->CreateBufferRegion(m_opaqueData, size_in_bytes);
 }
 
 DmlBuffer::~DmlBuffer()
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
index 4b9c167dfe671..73454d5d0dee0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceWrapper.h
@@ -11,7 +11,7 @@ namespace Dml
         DmlCommittedResourceWrapper(ComPtr<ID3D12Resource>&& d3d12Resource) : m_d3d12Resource(std::move(d3d12Resource)) {}
 
         // Committed resources use the same resource for all states and use barriers to transition between states
-        ID3D12Resource* GetUavResource() const final { return m_d3d12Resource.Get(); }
+        ID3D12Resource* GetD3D12Resource() const final { return m_d3d12Resource.Get(); }
 
     private:
         ComPtr<ID3D12Resource> m_d3d12Resource;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
index f2b62f2d41e64..e2606433ec5b2 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
@@ -8,10 +8,15 @@
 #include "core/framework/allocator.h"
 #include "DmlReservedResourceSubAllocator.h"
 #include "DmlTaggedPointer.h"
+#include "DmlAllocationInfo.h"
+#include "BucketizedBufferAllocator.h"
 
 namespace Dml
 {
-    DmlGpuAllocator::DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<DmlReservedResourceSubAllocator> subAllocator)
+    DmlGpuAllocator::DmlGpuAllocator(
+        onnxruntime::IAllocator* bfcAllocator,
+        BucketizedBufferAllocator* bucketizedBufferAllocator,
+        std::shared_ptr<DmlReservedResourceSubAllocator> bfcSubAllocator)
     : onnxruntime::IAllocator(
         OrtMemoryInfo(
             onnxruntime::DML,
@@ -21,31 +26,73 @@ namespace Dml
         )
     ),
     m_bfcAllocator(bfcAllocator),
-    m_subAllocator(subAllocator) {}
+    m_bucketizedBufferAllocator(bucketizedBufferAllocator),
+    m_bfcSubAllocator(bfcSubAllocator),
+    m_activeAllocator(ActiveAllocator::BfcAllocator) {}
 
     void* DmlGpuAllocator::Alloc(size_t size_in_bytes)
     {
-        return m_bfcAllocator->Alloc(size_in_bytes);
+        switch(m_activeAllocator)
+        {
+        case ActiveAllocator::BfcAllocator:
+            return m_bfcAllocator->Alloc(size_in_bytes);
+        case ActiveAllocator::BucketizedBufferAllocator:
+            return m_bucketizedBufferAllocator->Alloc(size_in_bytes);
+        default:
+            ORT_THROW_HR(E_UNEXPECTED);
+        }
     }
 
     void DmlGpuAllocator::Free(void* ptr)
     {
-        m_bfcAllocator->Free(ptr);
+        switch(m_activeAllocator)
+        {
+        case ActiveAllocator::BfcAllocator:
+            return m_bfcAllocator->Free(ptr);
+        case ActiveAllocator::BucketizedBufferAllocator:
+            return m_bucketizedBufferAllocator->Free(ptr);
+        default:
+            ORT_THROW_HR(E_UNEXPECTED);
+        }
     }
 
-    D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes)
+    D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes)
     {
-        return m_subAllocator->CreateBufferRegion(taggedPointer, size_in_bytes);
+        switch(m_activeAllocator)
+        {
+        case ActiveAllocator::BfcAllocator:
+            return m_bfcSubAllocator->CreateBufferRegion(opaquePointer, size_in_bytes);
+        case ActiveAllocator::BucketizedBufferAllocator:
+            return m_bucketizedBufferAllocator->CreateBufferRegion(opaquePointer, size_in_bytes);
+        default:
+            ORT_THROW_HR(E_UNEXPECTED);
+        }
     }
 
-    AllocationInfo* DmlGpuAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
+    AllocationInfo* DmlGpuAllocator::GetAllocationInfo(void* opaquePointer)
     {
-        return m_subAllocator->GetAllocationInfo(taggedPointer);
+        switch(m_activeAllocator)
+        {
+        case ActiveAllocator::BfcAllocator:
+            return m_bfcSubAllocator->GetAllocationInfo(opaquePointer);
+        case ActiveAllocator::BucketizedBufferAllocator:
+            return m_bucketizedBufferAllocator->GetAllocationInfo(opaquePointer);
+        default:
+            ORT_THROW_HR(E_UNEXPECTED);
+        }
     }
 
     void DmlGpuAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
     {
-        m_subAllocator->SetDefaultRoundingMode(roundingMode);
+        switch(m_activeAllocator) // Forward the rounding mode to whichever allocator is currently active
+        {
+        case ActiveAllocator::BfcAllocator:
+            return m_bfcSubAllocator->SetDefaultRoundingMode(roundingMode); // return (as in Free) so the case cannot fall through to the throw
+        case ActiveAllocator::BucketizedBufferAllocator:
+            return m_bucketizedBufferAllocator->SetDefaultRoundingMode(roundingMode);
+        default:
+            ORT_THROW_HR(E_UNEXPECTED);
+        }
     }
 
     DmlBuffer DmlGpuAllocator::AllocateDefaultBuffer(uint64_t num_bytes)
@@ -53,4 +100,22 @@ namespace Dml
         return DmlBuffer(this, num_bytes);
     }
 
+    uint64_t DmlGpuAllocator::GetUniqueId(void* opaquePointer)
+    {
+        switch(m_activeAllocator)
+        {
+        case ActiveAllocator::BfcAllocator:
+            return m_bfcSubAllocator->GetUniqueId(opaquePointer);
+        case ActiveAllocator::BucketizedBufferAllocator:
+            return m_bucketizedBufferAllocator->GetUniqueId(opaquePointer);
+        default:
+            ORT_THROW_HR(E_UNEXPECTED);
+        }
+    }
+
+    void DmlGpuAllocator::SetActiveAllocator(ActiveAllocator activeAllocator)
+    {
+        m_activeAllocator = activeAllocator;
+    }
+
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
index 39311055503d2..955c9ca10c7d2 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
@@ -10,28 +10,46 @@
 namespace Dml
 {
     class DmlReservedResourceSubAllocator;
+    class BucketizedBufferAllocator;
     class AllocationInfo;
     struct TaggedPointer;
 
+    enum class ActiveAllocator
+    {
+        BfcAllocator,
+        BucketizedBufferAllocator,
+    };
+
     class DmlGpuAllocator : public onnxruntime::IAllocator
     {
     public:
-        DmlGpuAllocator(onnxruntime::IAllocator* bfcAllocator, std::shared_ptr<DmlReservedResourceSubAllocator> subAllocator);
+        DmlGpuAllocator(
+            onnxruntime::IAllocator* bfcAllocator,
+            BucketizedBufferAllocator* bucketizedBufferAllocator,
+            std::shared_ptr<DmlReservedResourceSubAllocator> bfcSubAllocator);
 
         void* Alloc(size_t size_in_bytes) final;
         void Free(void* ptr) final;
-        D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes);
-        AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer);
+        D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes);
+        AllocationInfo* GetAllocationInfo(void* opaquePointer);
         void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);
         DmlBuffer AllocateDefaultBuffer(uint64_t num_bytes);
+        void SetActiveAllocator(ActiveAllocator activeAllocator);
+        uint64_t GetUniqueId(void* opaquePointer);
 
     private:
         // This allocator is managed by ORT and should be used to allocate/free memory in order
         // to utilize the BFC acapabilities
         onnxruntime::IAllocator* m_bfcAllocator;
 
+        // This allocator is the old bucketized allocator that is kept for backward compatibility purposes
+        // and is only used when external custom ops are registered.
+        BucketizedBufferAllocator* m_bucketizedBufferAllocator;
+
         // This allocator is specific to DML and is used to decode the opaque data returned by the BFC
         // allocator into objects that DML understands
-        std::shared_ptr<DmlReservedResourceSubAllocator> m_subAllocator;
+        std::shared_ptr<DmlReservedResourceSubAllocator> m_bfcSubAllocator;
+
+        ActiveAllocator m_activeAllocator;
     };
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index dcf6b8607f319..d69bea864b518 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -103,12 +103,11 @@ namespace DmlGraphFusionHelper
             // The allocation is not pooled
             auto allocInfo = static_cast<AllocationInfo*>(opaqueData);
             *allocId = 0;
-            return D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
+            return D3D12BufferRegion(0, allocInfo->GetD3D12Resource()->GetDesc().Width, allocInfo->GetD3D12Resource());
         }
 
-        auto taggedPointer = TaggedPointer::Unpack(opaqueData);
-        *allocId = winmlProvider->TryGetPooledAllocationId(taggedPointer, 0);
-        return winmlProvider->GetBufferRegion(taggedPointer, tensor->SizeInBytes());
+        *allocId = winmlProvider->GetUniqueId(opaqueData);
+        return winmlProvider->GetBufferRegion(opaqueData, tensor->SizeInBytes());
     }
 
     void ProcessInputData(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index bc9a0b15e86fe..e58303a0bfbfc 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -47,7 +47,7 @@ namespace Dml
 
     void DmlReservedResourceSubAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
     {
-        m_defaultRoundingMode = roundingMode;
+        // Nothing to do here; kept for compatibility with the bucketized allocator
     }
 
     static bool GetTilingEnabled(ID3D12Device* device)
@@ -248,6 +248,7 @@ namespace Dml
         ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
             this,
             ++m_currentAllocationId,
+            0,
             resourceWrapper.Get(),
             size_in_bytes
         );
@@ -285,7 +286,13 @@ namespace Dml
         allocations_by_id_.erase(it);
     }
 
-    void DmlReservedResourceSubAllocator::FreeResource(AllocationInfo* allocInfo)
+    uint64_t DmlReservedResourceSubAllocator::GetUniqueId(void* opaquePointer)
+    {
+        // Decode the opaque pointer into its tagged form and report the id it carries.
+        return TaggedPointer::Unpack(opaquePointer).GetUniqueId();
+    }
+
+    void DmlReservedResourceSubAllocator::FreeResource(AllocationInfo* allocInfo, uint64_t resourceId)
     {
         // Since this allocator is warapped by ORT's BFC allocator, it's possible that the context is already
         // close at this point if the application is winding down.
@@ -338,9 +345,11 @@ namespace Dml
     }
 
     D3D12BufferRegion DmlReservedResourceSubAllocator::CreateBufferRegion(
-        const TaggedPointer& taggedPointer,
+        void* opaquePointer,
         uint64_t size_in_bytes)
     {
+        auto taggedPointer = TaggedPointer::Unpack(opaquePointer);
+
         // We need to access (mutable) state after this point, so we need to lock
         std::unique_lock<std::mutex> lock(mutex_);
 
@@ -354,16 +363,18 @@ namespace Dml
             (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT;
 
         // Make sure the region we're trying to create fits entirely in the resource
-        assert(it->second->GetUavResource()->GetDesc().Width >= taggedPointer.offset + size_in_bytes);
+        assert(it->second->GetD3D12Resource()->GetDesc().Width >= taggedPointer.offset + size_in_bytes);
 
         return D3D12BufferRegion(
             taggedPointer.offset,
             size_in_bytes,
-            it->second->GetUavResource());
+            it->second->GetD3D12Resource());
     }
 
-    AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(const TaggedPointer& taggedPointer)
+    AllocationInfo* DmlReservedResourceSubAllocator::GetAllocationInfo(void* opaquePointer)
     {
+        auto taggedPointer = TaggedPointer::Unpack(opaquePointer);
+
         // We need to access (mutable) state after this point, so we need to lock
         std::unique_lock<std::mutex> lock(mutex_);
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index 8049848c8671e..3f2f1c9210c64 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -6,6 +6,7 @@
 #include "ExecutionContext.h"
 #include "DmlAllocationInfo.h"
 #include "DmlBufferRegion.h"
+#include "DmlSubAllocator.h"
 
 namespace Dml
 {
@@ -36,7 +37,7 @@ namespace Dml
     // this case it is better make more but smaller allocations (resulting in
     // smaller heaps); this fallback path is only retained as a last resort for
     // older hardware.
-    class DmlReservedResourceSubAllocator
+    class DmlReservedResourceSubAllocator : public DmlSubAllocator
     {
     public:
         // Maximum size of a heap (in tiles) when allocations are tiled. Each tile
@@ -60,13 +61,14 @@ namespace Dml
         // the ID3D12Resource is cached, so this call typically has a lower cost
         // than a call to ID3D12Device::CreatePlacedResource or
         // CreateReservedResource.
-        D3D12BufferRegion CreateBufferRegion(const TaggedPointer& taggedPointer, uint64_t size_in_bytes);
+        D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes);
 
-        AllocationInfo* GetAllocationInfo(const TaggedPointer& taggedPointer);
+        AllocationInfo* GetAllocationInfo(void* opaquePointer);
 
-        void FreeResource(AllocationInfo* allocInfo);
+        void FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) final;
         uint64_t ComputeRequiredSize(size_t size);
         bool TilingEnabled() const { return tiling_enabled_; };
+        uint64_t GetUniqueId(void* opaquePointer);
 
         ~DmlReservedResourceSubAllocator();
 
@@ -106,7 +108,6 @@ namespace Dml
         std::vector<Bucket> m_pool;
         size_t m_currentAllocationId = 0;
         uint64_t m_currentResourceId = 0;
-        AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Enabled;
         std::unique_ptr<DmlReservedResourceSubAllocator> m_subAllocator;
 
     #if _DEBUG
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
index 22f8cbbdc394b..de42157645bba 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
@@ -17,7 +17,7 @@ namespace Dml
         {
         }
 
-        ID3D12Resource* GetUavResource() const final { return m_allocation.resource_uav_state.Get(); }
+        ID3D12Resource* GetD3D12Resource() const final { return m_allocation.resource_uav_state.Get(); }
 
     private:
         DmlHeapAllocation m_allocation;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
index 2b1a8e5c726dc..6ad57b055023c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
@@ -12,7 +12,7 @@ namespace Dml
     {
     public:
         // TODO (pavignol): Rename to GetResource()
-        virtual ID3D12Resource* GetUavResource() const = 0;
+        virtual ID3D12Resource* GetD3D12Resource() const = 0;
         virtual ~DmlResourceWrapper(){}
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h
new file mode 100644
index 0000000000000..580830ea1a90f
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstdint>
+
+namespace Dml
+{
+    // Forward declarations so that this header does not depend on the order
+    // in which the including file pulls in its other headers.
+    class AllocationInfo;
+    struct DmlResourceWrapper;
+
+    // Interface implemented by sub-allocators such as
+    // DmlReservedResourceSubAllocator.
+    class DmlSubAllocator
+    {
+    public:
+        // Virtual so that a concrete sub-allocator is destroyed correctly even
+        // when it is only reachable through this interface.
+        virtual ~DmlSubAllocator() = default;
+
+        // Releases the resource tracked by allocInfo.
+        // NOTE(review): the exact meaning of resourceId is defined by the
+        // implementing allocator - confirm against its callers.
+        virtual void FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) = 0;
+    };
+}
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index b06d23adf5886..cb04939683f8c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -9,6 +9,7 @@
 #include "ReadbackHeap.h"
 #include "ExecutionContext.h"
 #include "DmlReservedResourceSubAllocator.h"
+#include "BucketizedBufferAllocator.h"
 #include "DmlCpuAllocator.h"
 #include "MLOperatorAuthorImpl.h"
 #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h"
@@ -207,8 +208,16 @@ namespace Dml
 
             m_bfcAllocator = onnxruntime::CreateAllocator(memoryInfo);
 
+            m_bucketizedAllocator = std::make_shared<BucketizedBufferAllocator>(
+                m_d3d12Device.Get(),
+                m_context, // TODO(leca): REVIEW: Will it cause memory issue when m_context is released in EP while alloc is released in sessionState?
+                CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
+                D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS,
+                D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
+                D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+
             // Wrap the BFC allocator into our own allocator
-            m_gpuAllocator = std::make_shared<DmlGpuAllocator>(m_bfcAllocator.get(), subAllocator);
+            m_gpuAllocator = std::make_shared<DmlGpuAllocator>(m_bfcAllocator.get(), m_bucketizedAllocator.get(), subAllocator);
             m_context->SetAllocator(m_gpuAllocator);
             // CPU Allocator used to create buffers for the MemcpyFromHost, Shape and Size operators.
             m_cpuInputAllocator = std::make_shared<DmlCpuAllocator>(OrtMemType::OrtMemTypeCPUInput);
@@ -992,15 +1001,21 @@ namespace Dml
         m_context->QueueReference(object);
     }
 
-    D3D12BufferRegion ExecutionProviderImpl::GetBufferRegion(const TaggedPointer& taggedPointer, uint64_t size) const
+    D3D12BufferRegion ExecutionProviderImpl::GetBufferRegion(void* opaquePointer, uint64_t size) const
     {
-        return m_gpuAllocator->CreateBufferRegion(taggedPointer, size);
+        return m_gpuAllocator->CreateBufferRegion(opaquePointer, size);
     }
 
-    uint64_t ExecutionProviderImpl::TryGetPooledAllocationId(const TaggedPointer& taggedPointer, bool isInternalOperator)
+    uint64_t ExecutionProviderImpl::GetUniqueId(void* opaquePointer)
     {
-        assert(!isInternalOperator);
-        return taggedPointer.GetUniqueId();
+        return m_gpuAllocator->GetUniqueId(opaquePointer);
+    }
+
+    void ExecutionProviderImpl::DisableBfcAllocator()
+    {
+        // TODO (pavignol): Remove
+        printf("*************Disabling BFC allocator!!!\n");
+        m_gpuAllocator->SetActiveAllocator(ActiveAllocator::BucketizedBufferAllocator);
     }
 
     void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState(
@@ -1117,10 +1132,10 @@ namespace Dml
         return std::make_unique<Dml::ExecutionProvider>(dmlDevice, commandQueue, enableMetacommands);
     }
 
-    D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, const TaggedPointer& taggedPointer, uint64_t sizeInBytes)
+    D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t sizeInBytes)
     {
-        Dml::DmlGpuAllocator* pAllocationInfo = static_cast<Dml::DmlGpuAllocator*>(allocator);
-        return pAllocationInfo->CreateBufferRegion(taggedPointer, sizeInBytes);
+        Dml::DmlGpuAllocator* gpuAllocator = static_cast<Dml::DmlGpuAllocator*>(allocator);
+        return gpuAllocator->CreateBufferRegion(opaquePointer, sizeInBytes);
     }
 
     void FlushContext(onnxruntime::IExecutionProvider* provider)
@@ -1141,6 +1156,12 @@ namespace Dml
         dmlexecutionprovider->ReleaseCompletedReferences();
     }
 
+    void DisableBfcAllocator(onnxruntime::IExecutionProvider * provider)
+    {
+        ExecutionProvider* dmlexecutionprovider = static_cast<Dml::ExecutionProvider*>(provider);
+        dmlexecutionprovider->DisableBfcAllocator();
+    }
+
     onnxruntime::common::Status CopyTensor(
         onnxruntime::IExecutionProvider* provider,
         const onnxruntime::Tensor& src,
@@ -1156,7 +1177,7 @@ namespace Dml
         ComPtr<DmlResourceWrapper> resourceWrapper;
         wil::MakeOrThrow<DmlCommittedResourceWrapper>(pResource).As(&resourceWrapper);
 
-        ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(nullptr, 0, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width);
+        ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(nullptr, 0, 0, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width);
         return allocInfo.Detach();
     }
     void FreeGPUAllocation(void* ptr)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index ad208ea830ae5..dafc2ab7147f7 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -25,6 +25,7 @@ namespace Dml
     class ReadbackHeap;
     class ExecutionContext;
     class DmlReservedResourceSubAllocator;
+    class BucketizedBufferAllocator;
     class DmlCpuAllocator;
     class ExecutionProvider;
     class DmlGpuAllocator;
@@ -42,6 +43,8 @@ namespace Dml
             bool enableMetacommands = true);
 
         void ReleaseCompletedReferences();
+        void DisableBfcAllocator();
+        uint64_t GetUniqueId(void* opaquePointer);
 
     public: // implements Dml::IExecutionProvider
         STDMETHOD(GetD3DDevice)(_COM_Outptr_ ID3D12Device** d3dDevice) const noexcept final;
@@ -100,9 +103,7 @@ namespace Dml
         // IWinmlExecutionProvider methods
         void QueueReference(IUnknown* object) override;
 
-        D3D12BufferRegion GetBufferRegion(const TaggedPointer& taggedPointer, uint64_t size) const override;
-
-        uint64_t TryGetPooledAllocationId(const TaggedPointer& taggedPointer, bool isInternalOperator) override;
+        D3D12BufferRegion GetBufferRegion(void* opaquePointer, uint64_t size) const override;
 
         void GetABIExecutionInterfaceAndInvalidateState(
             bool isInternalOperator,
@@ -181,8 +182,8 @@ namespace Dml
         std::shared_ptr<ExecutionContext> m_context;
         std::unique_ptr<PooledUploadHeap> m_uploadHeap;
         std::unique_ptr<ReadbackHeap> m_readbackHeap;
-        std::shared_ptr<DmlReservedResourceSubAllocator> m_subAllocator;
         std::shared_ptr<onnxruntime::IAllocator> m_bfcAllocator;
+        std::shared_ptr<BucketizedBufferAllocator> m_bucketizedAllocator;
         std::shared_ptr<DmlGpuAllocator> m_gpuAllocator;
         std::shared_ptr<DmlExternalGpuAllocator> m_externalGpuAllocator;
         std::shared_ptr<DmlCpuAllocator> m_cpuInputAllocator;
@@ -292,6 +293,11 @@ namespace Dml
             return m_impl->ReleaseCompletedReferences();
         }
 
+        void DisableBfcAllocator()
+        {
+            return m_impl->DisableBfcAllocator();
+        }
+
         ExecutionProviderImpl* GetImpl()
         {
             return m_impl.Get();
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index dde290f0bce0f..1547bd99b6e20 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1617,11 +1617,10 @@ namespace Windows::AI::MachineLearning::Adapter
         if (m_impl->Location().device.MemType() == OrtDevice::MemType::DML_EXTERNAL)
         {
             auto allocInfo = static_cast<Dml::AllocationInfo*>(m_tensorData);
-            return Dml::D3D12BufferRegion(0, allocInfo->GetUavResource()->GetDesc().Width, allocInfo->GetUavResource());
+            return Dml::D3D12BufferRegion(0, allocInfo->GetD3D12Resource()->GetDesc().Width, allocInfo->GetD3D12Resource());
         }
 
-        auto taggedPointer = Dml::TaggedPointer::Unpack(m_tensorData);
-        return m_winmlExecutionProvider->GetBufferRegion(taggedPointer, m_impl->SizeInBytes());
+        return m_winmlExecutionProvider->GetBufferRegion(m_tensorData, m_impl->SizeInBytes());
     }
 
     uint32_t STDMETHODCALLTYPE TensorWrapper::GetDimensionCount() const noexcept
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
index d1a705e151ddf..9909be1f8337f 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
@@ -177,6 +177,8 @@ IMLOperatorRegistryPrivate : public IUnknown
         _In_reads_(constantCpuInputCount) const uint32_t* constantCpuInputs = nullptr,
         uint32_t constantCpuInputCount = 0
         ) const noexcept PURE;
+
+    STDMETHOD_(bool, HasExternalOperators)() const noexcept PURE;
 };
 
 //! \interface IMLOperatorTensorShapeDescription1
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index 91279be185ba9..c226b83e3ad1b 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -225,7 +225,7 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "The resource has been allocated with ");
   }
 
-  *d3d_resource = static_cast<Dml::AllocationInfo*>(allocation)->GetUavResource();
+  *d3d_resource = static_cast<Dml::AllocationInfo*>(allocation)->GetD3D12Resource();
   (*d3d_resource)->AddRef();
 
 #else
@@ -250,11 +250,11 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceRegionFromAllocation,
   }
 
   if (wrapping_allocator->Info()->device.MemType() == OrtDevice::MemType::DML_EXTERNAL) {
-    *d3d_resource = static_cast<Dml::AllocationInfo*>(allocation)->GetUavResource();
+    *d3d_resource = static_cast<Dml::AllocationInfo*>(allocation)->GetD3D12Resource();
     *offset = 0;
   } else {
     ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT);
-    auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), Dml::TaggedPointer::Unpack(allocation), size_in_bytes);
+    auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), allocation, size_in_bytes);
     *offset = bufferRegion.Offset();
     *d3d_resource = bufferRegion.ResourceInUavState();
   }

From 57723394fb1615075fb19113a18dd031c2df8339 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 13 Jul 2023 01:08:08 -0700
Subject: [PATCH 49/76] Add BFC allocator API

---
 .../inc/DmlExecutionProvider.h                |  4 +-
 .../src/DmlGpuAllocator.cpp                   | 10 ++---
 .../src/DmlGpuAllocator.h                     |  4 +-
 .../src/ExecutionProvider.cpp                 | 42 ++++++++++---------
 .../src/ExecutionProvider.h                   | 14 +++----
 .../providers/dml/dml_provider_factory.cc     | 14 ++++++-
 winml/adapter/winml_adapter_apis.h            |  2 +-
 winml/adapter/winml_adapter_c_api.cpp         |  2 +-
 winml/adapter/winml_adapter_c_api.h           |  2 +-
 winml/adapter/winml_adapter_dml.cpp           |  4 +-
 .../Api.Ort/OnnxruntimeDmlSessionBuilder.cpp  | 12 ++++--
 .../Api.Ort/OnnxruntimeDmlSessionBuilder.h    | 10 ++++-
 .../lib/Api.Ort/OnnxruntimeEngineBuilder.cpp  | 11 +++--
 winml/lib/Api.Ort/OnnxruntimeEngineBuilder.h  |  6 ++-
 winml/lib/Api/LearningModelSession.cpp        | 11 +++--
 winml/lib/Common/inc/iengine.h                |  5 ++-
 winml/test/adapter/AdapterDmlEpTest.cpp       |  4 +-
 winml/test/adapter/AdapterSessionTest.cpp     |  6 +--
 18 files changed, 100 insertions(+), 63 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
index ef012855770f3..755bf60195e2e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
@@ -29,13 +29,13 @@ namespace Dml
     std::unique_ptr<onnxruntime::IExecutionProvider> CreateExecutionProvider(
         IDMLDevice* dmlDevice,
         ID3D12CommandQueue* commandQueue,
-        bool enableMetacommands = true);
+        bool enableMetacommands,
+        bool enableBfcAllocator);
 
     D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t size_in_bytes);
     void FlushContext(onnxruntime::IExecutionProvider* provider);
     void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode);
     void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);
-    void DisableBfcAllocator(onnxruntime::IExecutionProvider* provider);
 
     onnxruntime::common::Status CopyTensor(
         onnxruntime::IExecutionProvider* provider,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
index e2606433ec5b2..5ac6485a041ec 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
@@ -16,7 +16,8 @@ namespace Dml
     DmlGpuAllocator::DmlGpuAllocator(
         onnxruntime::IAllocator* bfcAllocator,
         BucketizedBufferAllocator* bucketizedBufferAllocator,
-        std::shared_ptr<DmlReservedResourceSubAllocator> bfcSubAllocator)
+        std::shared_ptr<DmlReservedResourceSubAllocator> bfcSubAllocator,
+        ActiveAllocator activeAllocator)
     : onnxruntime::IAllocator(
         OrtMemoryInfo(
             onnxruntime::DML,
@@ -28,7 +29,7 @@ namespace Dml
     m_bfcAllocator(bfcAllocator),
     m_bucketizedBufferAllocator(bucketizedBufferAllocator),
     m_bfcSubAllocator(bfcSubAllocator),
-    m_activeAllocator(ActiveAllocator::BfcAllocator) {}
+    m_activeAllocator(activeAllocator) {}
 
     void* DmlGpuAllocator::Alloc(size_t size_in_bytes)
     {
@@ -113,9 +114,4 @@ namespace Dml
         }
     }
 
-    void DmlGpuAllocator::SetActiveAllocator(ActiveAllocator activeAllocator)
-    {
-        m_activeAllocator = activeAllocator;
-    }
-
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
index 955c9ca10c7d2..e8b020a85767b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
@@ -26,7 +26,8 @@ namespace Dml
         DmlGpuAllocator(
             onnxruntime::IAllocator* bfcAllocator,
             BucketizedBufferAllocator* bucketizedBufferAllocator,
-            std::shared_ptr<DmlReservedResourceSubAllocator> bfcSubAllocator);
+            std::shared_ptr<DmlReservedResourceSubAllocator> bfcSubAllocator,
+            ActiveAllocator activeAllocator);
 
         void* Alloc(size_t size_in_bytes) final;
         void Free(void* ptr) final;
@@ -34,7 +35,6 @@ namespace Dml
         AllocationInfo* GetAllocationInfo(void* opaquePointer);
         void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);
         DmlBuffer AllocateDefaultBuffer(uint64_t num_bytes);
-        void SetActiveAllocator(ActiveAllocator activeAllocator);
         uint64_t GetUniqueId(void* opaquePointer);
 
     private:
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index cb04939683f8c..9ac474474f8b8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -74,7 +74,8 @@ namespace Dml
     ExecutionProvider::ExecutionProvider(
         IDMLDevice* dmlDevice,
         ID3D12CommandQueue* commandQueue,
-        bool enableMetacommands) :
+        bool enableMetacommands,
+        bool enableBfcAllocator) :
             IExecutionProvider(onnxruntime::kDmlExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0))
     {
         D3D12_COMMAND_LIST_TYPE queueType = commandQueue->GetDesc().Type;
@@ -87,7 +88,7 @@ namespace Dml
         ComPtr<ID3D12Device> device;
         GRAPHICS_THROW_IF_FAILED(commandQueue->GetDevice(IID_GRAPHICS_PPV_ARGS(device.GetAddressOf())));
 
-        m_impl = wil::MakeOrThrow<ExecutionProviderImpl>(dmlDevice, device.Get(), commandQueue, enableMetacommands);
+        m_impl = wil::MakeOrThrow<ExecutionProviderImpl>(dmlDevice, device.Get(), commandQueue, enableMetacommands, enableBfcAllocator);
     }
 
     std::vector<std::unique_ptr<onnxruntime::ComputeCapability>>
@@ -142,10 +143,16 @@ namespace Dml
 // Task 24384515: Update ORT AIInfra release agent pool to install 19H1 SDK on VM bootstrap
 #define D3D_FEATURE_LEVEL_1_0_CORE_PRIVATE ((D3D_FEATURE_LEVEL)0x1000)
 
-    ExecutionProviderImpl::ExecutionProviderImpl(IDMLDevice* dmlDevice, ID3D12Device* d3d12Device, ID3D12CommandQueue* queue, bool enableMetacommands)
+    ExecutionProviderImpl::ExecutionProviderImpl(
+        IDMLDevice* dmlDevice,
+        ID3D12Device* d3d12Device,
+        ID3D12CommandQueue* queue,
+        bool enableMetacommands,
+        bool enableBfcAllocator)
         : m_d3d12Device(d3d12Device),
           m_dmlDevice(dmlDevice),
           m_areMetacommandsEnabled(enableMetacommands),
+          m_bfcAllocatorEnabled(enableBfcAllocator),
           m_queue(queue)
     {
 
@@ -216,8 +223,17 @@ namespace Dml
                 D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
                 D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
+            if (!m_bfcAllocatorEnabled)
+            {
+                printf("*************BFC ALLOCATOR DISABLED!\n");
+            }
+
             // Wrap the BFC allocator into our own allocator
-            m_gpuAllocator = std::make_shared<DmlGpuAllocator>(m_bfcAllocator.get(), m_bucketizedAllocator.get(), subAllocator);
+            m_gpuAllocator = std::make_shared<DmlGpuAllocator>(
+                m_bfcAllocator.get(),
+                m_bucketizedAllocator.get(),
+                subAllocator,
+                m_bfcAllocatorEnabled ? ActiveAllocator::BfcAllocator : ActiveAllocator::BucketizedBufferAllocator);
             m_context->SetAllocator(m_gpuAllocator);
             // CPU Allocator used to create buffers for the MemcpyFromHost, Shape and Size operators.
             m_cpuInputAllocator = std::make_shared<DmlCpuAllocator>(OrtMemType::OrtMemTypeCPUInput);
@@ -1011,13 +1027,6 @@ namespace Dml
         return m_gpuAllocator->GetUniqueId(opaquePointer);
     }
 
-    void ExecutionProviderImpl::DisableBfcAllocator()
-    {
-        // TODO (pavignol): Remove
-        printf("*************Disabling BFC allocator!!!\n");
-        m_gpuAllocator->SetActiveAllocator(ActiveAllocator::BucketizedBufferAllocator);
-    }
-
     void ExecutionProviderImpl::GetABIExecutionInterfaceAndInvalidateState(
         bool isInternalOperator,
         IUnknown** abiExecutionObject) const
@@ -1127,9 +1136,10 @@ namespace Dml
     std::unique_ptr<onnxruntime::IExecutionProvider> CreateExecutionProvider(
         IDMLDevice* dmlDevice,
         ID3D12CommandQueue* commandQueue,
-        bool enableMetacommands)
+        bool enableMetacommands,
+        bool enableBfcAllocator)
     {
-        return std::make_unique<Dml::ExecutionProvider>(dmlDevice, commandQueue, enableMetacommands);
+        return std::make_unique<Dml::ExecutionProvider>(dmlDevice, commandQueue, enableMetacommands, enableBfcAllocator);
     }
 
     D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t sizeInBytes)
@@ -1156,12 +1166,6 @@ namespace Dml
         dmlexecutionprovider->ReleaseCompletedReferences();
     }
 
-    void DisableBfcAllocator(onnxruntime::IExecutionProvider * provider)
-    {
-        ExecutionProvider* dmlexecutionprovider = static_cast<Dml::ExecutionProvider*>(provider);
-        dmlexecutionprovider->DisableBfcAllocator();
-    }
-
     onnxruntime::common::Status CopyTensor(
         onnxruntime::IExecutionProvider* provider,
         const onnxruntime::Tensor& src,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index dafc2ab7147f7..74f56acb345ed 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -40,10 +40,10 @@ namespace Dml
             IDMLDevice* dmlDevice,
             ID3D12Device* d3d12Device,
             ID3D12CommandQueue* queue,
-            bool enableMetacommands = true);
+            bool enableMetacommands,
+            bool enableBfcAllocator);
 
         void ReleaseCompletedReferences();
-        void DisableBfcAllocator();
         uint64_t GetUniqueId(void* opaquePointer);
 
     public: // implements Dml::IExecutionProvider
@@ -178,6 +178,7 @@ namespace Dml
         ComPtr<IDMLDevice> m_dmlDevice;
         bool m_isMcdmDevice = false;
         bool m_areMetacommandsEnabled = true;
+        bool m_bfcAllocatorEnabled = true;
         bool m_native16BitShaderOpsSupported = false;
         std::shared_ptr<ExecutionContext> m_context;
         std::unique_ptr<PooledUploadHeap> m_uploadHeap;
@@ -235,8 +236,8 @@ namespace Dml
         explicit ExecutionProvider(
             IDMLDevice* dmlDevice,
             ID3D12CommandQueue* commandQueue,
-            bool enableMetacommands = true
-        );
+            bool enableMetacommands,
+            bool enableBfcAllocator);
 
         std::unique_ptr<onnxruntime::IDataTransfer> GetDataTransfer() const final override
         {
@@ -293,11 +294,6 @@ namespace Dml
             return m_impl->ReleaseCompletedReferences();
         }
 
-        void DisableBfcAllocator()
-        {
-            return m_impl->DisableBfcAllocator();
-        }
-
         ExecutionProviderImpl* GetImpl()
         {
             return m_impl.Get();
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index c226b83e3ad1b..b2d02715bb91d 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -35,18 +35,19 @@ struct DMLProviderFactory : IExecutionProviderFactory {
 
   std::unique_ptr<IExecutionProvider> CreateProvider() override;
   void SetDefaultRoundingMode(AllocatorRoundingMode rounding_mode);
-
   void SetMetacommandsEnabled(bool metacommands_enabled);
+  void SetBfcAllocatorEnabled(bool bfc_allocator_enabled);
 
  private:
   ComPtr<IDMLDevice> dml_device_{};
   ComPtr<ID3D12CommandQueue> cmd_queue_{};
   AllocatorRoundingMode rounding_mode_ = AllocatorRoundingMode::Enabled;
   bool metacommands_enabled_ = true;
+  bool bfc_allocator_enabled_ = true;
 };
 
 std::unique_ptr<IExecutionProvider> DMLProviderFactory::CreateProvider() {
-  auto provider = Dml::CreateExecutionProvider(dml_device_.Get(), cmd_queue_.Get(), metacommands_enabled_);
+  auto provider = Dml::CreateExecutionProvider(dml_device_.Get(), cmd_queue_.Get(), metacommands_enabled_, bfc_allocator_enabled_);
   Dml::SetDefaultRoundingMode(provider.get(), rounding_mode_);
   return provider;
 }
@@ -59,6 +60,10 @@ void DMLProviderFactory::SetMetacommandsEnabled(bool metacommands_enabled) {
   metacommands_enabled_ = metacommands_enabled;
 }
 
+void DMLProviderFactory::SetBfcAllocatorEnabled(bool bfc_allocator_enabled) {
+  bfc_allocator_enabled_ = bfc_allocator_enabled;
+}
+
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_DML(IDMLDevice* dml_device,
                                                                               ID3D12CommandQueue* cmd_queue) {
 #ifndef _GAMING_XBOX
@@ -92,6 +97,11 @@ void DmlConfigureProviderFactoryMetacommandsEnabled(IExecutionProviderFactory* f
   dml_provider_factory->SetMetacommandsEnabled(metacommandsEnabled);
 }
 
+void DmlConfigureProviderFactoryBfcAllocatorEnabled(IExecutionProviderFactory* factory, bool bfc_allocator_enabled) {
+  auto dml_provider_factory = static_cast<DMLProviderFactory*>(factory);
+  dml_provider_factory->SetBfcAllocatorEnabled(bfc_allocator_enabled);
+}
+
 
 bool IsSoftwareAdapter(IDXGIAdapter1* adapter) {
     DXGI_ADAPTER_DESC1 desc;
diff --git a/winml/adapter/winml_adapter_apis.h b/winml/adapter/winml_adapter_apis.h
index 7d1f7941f9865..6a31676c048a5 100644
--- a/winml/adapter/winml_adapter_apis.h
+++ b/winml/adapter/winml_adapter_apis.h
@@ -42,7 +42,7 @@ ORT_API_STATUS(ModelGetMetadata, _In_ const OrtModel* model, _In_ size_t count,
 ORT_API_STATUS(ModelEnsureNoFloat16, _In_ const OrtModel* model);
 ORT_API_STATUS(SaveModel, _In_ const OrtModel* in, _In_ const wchar_t* const file_name, _In_ size_t len);
 
-ORT_API_STATUS(OrtSessionOptionsAppendExecutionProviderEx_DML, _In_ OrtSessionOptions* options, _In_ ID3D12Device* d3d_device, _In_ ID3D12CommandQueue* cmd_queue, bool metacommands_enabled);
+ORT_API_STATUS(OrtSessionOptionsAppendExecutionProviderEx_DML, _In_ OrtSessionOptions* options, _In_ ID3D12Device* d3d_device, _In_ ID3D12CommandQueue* cmd_queue, bool metacommands_enabled, bool bfc_allocator_enabled);
 
 // OrtSession methods
 ORT_API_STATUS(CreateSessionWithoutModel, _In_ OrtEnv* env, _In_ const OrtSessionOptions* options, _In_ OrtThreadPool* inter_op_thread_pool, _In_ OrtThreadPool* intra_op_thread_pool, _Outptr_ OrtSession** session);
diff --git a/winml/adapter/winml_adapter_c_api.cpp b/winml/adapter/winml_adapter_c_api.cpp
index a3e597fb88800..86d23a362c16d 100644
--- a/winml/adapter/winml_adapter_c_api.cpp
+++ b/winml/adapter/winml_adapter_c_api.cpp
@@ -105,4 +105,4 @@ const WinmlAdapterApi* ORT_API_CALL OrtGetWinMLAdapter(_In_ uint32_t ort_version
   }
 
   return nullptr;
-}
\ No newline at end of file
+}
diff --git a/winml/adapter/winml_adapter_c_api.h b/winml/adapter/winml_adapter_c_api.h
index 2817467818404..1cbd969e559b4 100644
--- a/winml/adapter/winml_adapter_c_api.h
+++ b/winml/adapter/winml_adapter_c_api.h
@@ -255,7 +255,7 @@ struct WinmlAdapterApi {
     * OrtSessionOptionsAppendExecutionProvider_DML
 	 * This api is used to add the DML EP to OrtSessionOptions.
     */
-  OrtStatus*(ORT_API_CALL* OrtSessionOptionsAppendExecutionProvider_DML)(_In_ OrtSessionOptions* options, ID3D12Device* device, ID3D12CommandQueue* queue, bool metacommands_enabled)NO_EXCEPTION;
+  OrtStatus*(ORT_API_CALL* OrtSessionOptionsAppendExecutionProvider_DML)(_In_ OrtSessionOptions* options, ID3D12Device* device, ID3D12CommandQueue* queue, bool metacommands_enabled, bool bfc_allocator_enabled)NO_EXCEPTION;
 
   // OrtSession methods
 
diff --git a/winml/adapter/winml_adapter_dml.cpp b/winml/adapter/winml_adapter_dml.cpp
index f3ffda496530f..3dc1b1bdcd55d 100644
--- a/winml/adapter/winml_adapter_dml.cpp
+++ b/winml/adapter/winml_adapter_dml.cpp
@@ -70,12 +70,13 @@ Microsoft::WRL::ComPtr<IDMLDevice> CreateDmlDevice(ID3D12Device* d3d12Device) {
 namespace onnxruntime {
 void DmlConfigureProviderFactoryDefaultRoundingMode(onnxruntime::IExecutionProviderFactory* factory, AllocatorRoundingMode rounding_mode);
 void DmlConfigureProviderFactoryMetacommandsEnabled(IExecutionProviderFactory* factory, bool metacommandsEnabled);
+void DmlConfigureProviderFactoryBfcAllocatorEnabled(IExecutionProviderFactory* factory, bool bfc_allocator_enabled);
 }
 
 #endif  // USE_DML
 
 ORT_API_STATUS_IMPL(winmla::OrtSessionOptionsAppendExecutionProviderEx_DML, _In_ OrtSessionOptions* options,
-                    _In_ ID3D12Device* d3d_device, _In_ ID3D12CommandQueue* queue, bool metacommands_enabled) {
+                    _In_ ID3D12Device* d3d_device, _In_ ID3D12CommandQueue* queue, bool metacommands_enabled, bool bfc_allocator_enabled) {
   API_IMPL_BEGIN
 #ifdef USE_DML
   auto dml_device = CreateDmlDevice(d3d_device);
@@ -91,6 +92,7 @@ ORT_API_STATUS_IMPL(winmla::OrtSessionOptionsAppendExecutionProviderEx_DML, _In_
   onnxruntime::DmlConfigureProviderFactoryDefaultRoundingMode(factory, AllocatorRoundingMode::Disabled);
 
   onnxruntime::DmlConfigureProviderFactoryMetacommandsEnabled(factory, metacommands_enabled);
+  onnxruntime::DmlConfigureProviderFactoryBfcAllocatorEnabled(factory, bfc_allocator_enabled);
 #endif  // USE_DML
   return nullptr;
   API_IMPL_END
diff --git a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp
index a02c6a0431ba6..fea3b4ebbea91 100644
--- a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp
+++ b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp
@@ -12,11 +12,17 @@
 
 using namespace _winml;
 
-HRESULT OnnxruntimeDmlSessionBuilder::RuntimeClassInitialize(OnnxruntimeEngineFactory* engine_factory, ID3D12Device* device, ID3D12CommandQueue* queue, bool metacommands_enabled) {
+HRESULT OnnxruntimeDmlSessionBuilder::RuntimeClassInitialize(
+  OnnxruntimeEngineFactory* engine_factory,
+  ID3D12Device* device,
+  ID3D12CommandQueue* queue,
+  bool metacommands_enabled,
+  bool bfc_allocator_enabled) {
   engine_factory_ = engine_factory;
   device_.copy_from(device);
   queue_.copy_from(queue);
   metacommands_enabled_ = metacommands_enabled;
+  bfc_allocator_enabled_ = bfc_allocator_enabled;
   return S_OK;
 }
 
@@ -43,7 +49,7 @@ OnnxruntimeDmlSessionBuilder::CreateSessionOptions(
                           ort_api);
 
   // Request the dml ep
-  RETURN_HR_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device_.get(), queue_.get(), metacommands_enabled_),
+  RETURN_HR_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device_.get(), queue_.get(), metacommands_enabled_, bfc_allocator_enabled_),
                           ort_api);
 
 #ifndef _WIN64
@@ -105,4 +111,4 @@ HRESULT OnnxruntimeDmlSessionBuilder::Initialize(
   return S_OK;
 }
 
-#endif USE_DML
\ No newline at end of file
+#endif USE_DML
diff --git a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h
index 8ea4399ebfb35..261beb4191abe 100644
--- a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h
+++ b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h
@@ -13,7 +13,12 @@ class OnnxruntimeDmlSessionBuilder : public Microsoft::WRL::RuntimeClass<
                                          Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>,
                                          IOrtSessionBuilder> {
  public:
-  HRESULT RuntimeClassInitialize(OnnxruntimeEngineFactory* engine_factory, ID3D12Device* device, ID3D12CommandQueue* queue, bool metacommands_enabled_);
+  HRESULT RuntimeClassInitialize(
+    OnnxruntimeEngineFactory* engine_factory,
+    ID3D12Device* device,
+    ID3D12CommandQueue* queue,
+    bool metacommands_enabled,
+    bool bfc_allocator_enabled);
 
   HRESULT STDMETHODCALLTYPE CreateSessionOptions(
       OrtSessionOptions** options) override;
@@ -32,6 +37,7 @@ class OnnxruntimeDmlSessionBuilder : public Microsoft::WRL::RuntimeClass<
   winrt::com_ptr<ID3D12Device> device_;
   winrt::com_ptr<ID3D12CommandQueue> queue_;
   bool metacommands_enabled_ = true;
+  bool bfc_allocator_enabled_ = true;
 };
 
-}  // namespace _winml
\ No newline at end of file
+}  // namespace _winml
diff --git a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp
index f98a98476d0e6..b7ee7de25ea1e 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp
+++ b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp
@@ -28,7 +28,7 @@ STDMETHODIMP OnnxruntimeEngineBuilder::CreateEngine(_Outptr_ _winml::IEngine** o
     RETURN_IF_FAILED(Microsoft::WRL::MakeAndInitialize<OnnxruntimeCpuSessionBuilder>(&onnxruntime_session_builder, engine_factory_.Get()));
   } else {
 #ifdef USE_DML
-    RETURN_IF_FAILED(Microsoft::WRL::MakeAndInitialize<OnnxruntimeDmlSessionBuilder>(&onnxruntime_session_builder, engine_factory_.Get(), device_.Get(), queue_.Get(), metacommands_enabled_));
+    RETURN_IF_FAILED(Microsoft::WRL::MakeAndInitialize<OnnxruntimeDmlSessionBuilder>(&onnxruntime_session_builder, engine_factory_.Get(), device_.Get(), queue_.Get(), metacommands_enabled_, bfc_allocator_enabled_));
 #endif
   }
 
@@ -86,6 +86,11 @@ STDMETHODIMP OnnxruntimeEngineBuilder::SetMetacommandsEnabled(int enabled) {
   return S_OK;
 }
 
+STDMETHODIMP OnnxruntimeEngineBuilder::SetBfcAllocatorEnabled(int enabled) {
+  bfc_allocator_enabled_ = static_cast<bool>(enabled);
+  return S_OK;
+}
+
 STDMETHODIMP OnnxruntimeEngineBuilder::GetID3D12CommandQueue(_Outptr_ ID3D12CommandQueue** queue) {
   *queue = queue_.Get();
   return S_OK;
@@ -100,7 +105,7 @@ STDMETHODIMP OnnxruntimeEngineBuilder::SetNamedDimensionOverrides(wfc::IMapView<
   named_dimension_overrides_ = std::move(named_dimension_overrides);
   return S_OK;
 }
-  
+
 STDMETHODIMP OnnxruntimeEngineBuilder::SetIntraOpNumThreadsOverride(uint32_t intra_op_num_threads) {
   intra_op_num_threads_override_ = intra_op_num_threads;
   return S_OK;
@@ -114,4 +119,4 @@ STDMETHODIMP OnnxruntimeEngineBuilder::SetIntraOpThreadSpinning(bool allow_spinn
 STDMETHODIMP OnnxruntimeEngineBuilder::SetThreadPool(IThreading* thread_pool) {
   thread_pool_ = thread_pool;
   return S_OK;
-}
\ No newline at end of file
+}
diff --git a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.h b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.h
index 42cb57190e93f..4bed120df8809 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.h
+++ b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.h
@@ -17,6 +17,9 @@ class OnnxruntimeEngineBuilder : public Microsoft::WRL::RuntimeClass<
   STDMETHOD(SetMetacommandsEnabled)
   (int enabled);
 
+  STDMETHOD(SetBfcAllocatorEnabled)
+  (int enabled);
+
   STDMETHOD(GetD3D12Device)
   (_Outptr_ ID3D12Device** device);
 
@@ -47,10 +50,11 @@ class OnnxruntimeEngineBuilder : public Microsoft::WRL::RuntimeClass<
   Microsoft::WRL::ComPtr<ID3D12CommandQueue> queue_ = nullptr;
   Microsoft::WRL::ComPtr<IThreading> thread_pool_ = nullptr;
   bool metacommands_enabled_ = true;
+  bool bfc_allocator_enabled_ = true;
   std::optional<uint32_t> batch_size_override_;
   wfc::IMapView<winrt::hstring, uint32_t> named_dimension_overrides_;
   std::optional<uint32_t> intra_op_num_threads_override_;
   bool allow_thread_spinning_ = true;
 };
 
-}  // namespace _winml
\ No newline at end of file
+}  // namespace _winml
diff --git a/winml/lib/Api/LearningModelSession.cpp b/winml/lib/Api/LearningModelSession.cpp
index f086f756ed990..24b3aff92824e 100644
--- a/winml/lib/Api/LearningModelSession.cpp
+++ b/winml/lib/Api/LearningModelSession.cpp
@@ -13,6 +13,7 @@
 #include "LearningModelSessionOptions.h"
 #include "TensorFeatureDescriptor.h"
 #include "TelemetryEvent.h"
+#include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h"
 
 #include "D3DDeviceCache.h"
 
@@ -31,7 +32,7 @@ LearningModelSession::LearningModelSession(_winml::IEngine* engine) : operator_r
                                                                       model_(nullptr),
                                                                       device_(LearningModelDeviceKind::Cpu),
                                                                       session_options_(nullptr)
-{ 
+{
     engine_.copy_from(engine);
 }
 
@@ -117,6 +118,10 @@ void LearningModelSession::Initialize() {
   if (device_impl->IsCpuDevice() == false) {
     WINML_THROW_IF_FAILED(engine_builder->SetD3D12Resources(device_impl->GetD3DDevice(), device_impl->GetDeviceQueue()));
     WINML_THROW_IF_FAILED(engine_builder->SetMetacommandsEnabled(device_impl->MetacommandsEnabled()));
+
+    winrt::com_ptr<IMLOperatorRegistryPrivate> registryPrivate;
+    WINML_THROW_IF_FAILED(model_impl->GetOperatorRegistry()->QueryInterface(IID_PPV_ARGS(registryPrivate.put())));
+    WINML_THROW_IF_FAILED(engine_builder->SetBfcAllocatorEnabled(!registryPrivate->HasExternalOperators()));
   }
 
   auto num_intra_op_threads = device_impl->NumberOfIntraOpThreads();
@@ -137,7 +142,7 @@ void LearningModelSession::Initialize() {
     allow_spinning = session_options_impl->GetIntraOpThreadSpinning();
     num_intra_op_threads = session_options_impl->GetIntraOpNumThreads();
   }
-  
+
   bool create_local_thread_pool = allow_spinning != device_impl->AllowSpinning() ||
                                   num_intra_op_threads != device_impl->NumberOfIntraOpThreads();
   if (create_local_thread_pool) {
@@ -464,4 +469,4 @@ winml::LearningModelSession LearningModelSession::CreateInertSession(_winml::IEn
   return winrt::make<winmlp::LearningModelSession>(engine);
 }
 
-}  // namespace WINMLP
\ No newline at end of file
+}  // namespace WINMLP
diff --git a/winml/lib/Common/inc/iengine.h b/winml/lib/Common/inc/iengine.h
index a686b585841c3..66fffcd1dd043 100644
--- a/winml/lib/Common/inc/iengine.h
+++ b/winml/lib/Common/inc/iengine.h
@@ -209,7 +209,7 @@ IThreading : IUnknown {
 
 };
 
-MIDL_INTERFACE("8ac0b6b9-4561-492b-b63d-a07bdd8292c6")
+MIDL_INTERFACE("edf7b6d1-f788-4057-9f99-28f9b05360e8")
 IEngineBuilder : IUnknown {
   STDMETHOD(SetD3D12Resources)
   (ID3D12Device * device, ID3D12CommandQueue * queue) PURE;
@@ -217,6 +217,9 @@ IEngineBuilder : IUnknown {
   STDMETHOD(SetMetacommandsEnabled)
   (int enabled) PURE;
 
+  STDMETHOD(SetBfcAllocatorEnabled)
+  (int enabled) PURE;
+
   STDMETHOD(GetD3D12Device)
   (_Outptr_ ID3D12Device * *device) PURE;
 
diff --git a/winml/test/adapter/AdapterDmlEpTest.cpp b/winml/test/adapter/AdapterDmlEpTest.cpp
index 3069c618b1738..d57418b6e686f 100644
--- a/winml/test/adapter/AdapterDmlEpTest.cpp
+++ b/winml/test/adapter/AdapterDmlEpTest.cpp
@@ -71,7 +71,7 @@ UniqueOrtSession CreateDmlSession() {
   command_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
   WINML_EXPECT_HRESULT_SUCCEEDED(device->CreateCommandQueue(&command_queue_desc, IID_PPV_ARGS(queue.put())));
 
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), false), ort_api);
+  THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), false, true), ort_api);
   return CreateUniqueOrtSession(FileHelpers::GetModulePath() + L"fns-candy.onnx", session_options);
 }
 
@@ -218,7 +218,7 @@ void DmlCopyTensor() {
   command_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
   WINML_EXPECT_HRESULT_SUCCEEDED(device->CreateCommandQueue(&command_queue_desc, IID_PPV_ARGS(queue.put())));
 
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), false), ort_api);
+  THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), false, true), ort_api);
   auto session = CreateUniqueOrtSession(FileHelpers::GetModulePath() + L"fns-candy.onnx", session_options);
 
   OrtExecutionProvider* dml_provider;
diff --git a/winml/test/adapter/AdapterSessionTest.cpp b/winml/test/adapter/AdapterSessionTest.cpp
index 941157f4f9ecb..bb90ca3656192 100644
--- a/winml/test/adapter/AdapterSessionTest.cpp
+++ b/winml/test/adapter/AdapterSessionTest.cpp
@@ -92,7 +92,7 @@ void AppendExecutionProvider_DML() {
 
   const auto device = CreateD3DDevice();
   const auto queue = CreateD3DQueue(device.get());
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true), ort_api);
+  THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true, true), ort_api);
 }
 
 void CreateWithoutModel() {
@@ -114,7 +114,7 @@ void GetExecutionProvider_DML() {
   THROW_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api);
   const auto device = CreateD3DDevice();
   const auto queue = CreateD3DQueue(device.get());
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true), ort_api);
+  THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true, true), ort_api);
 
   const auto model_path = FileHelpers::GetModulePath() + L"fns-candy.onnx";
   auto session = CreateUniqueOrtSession(model_path, session_options);
@@ -254,7 +254,7 @@ void CopyInputAcrossDevices_DML() {
   THROW_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api);
   const auto device = CreateD3DDevice();
   const auto queue = CreateD3DQueue(device.get());
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true), ort_api);
+  THROW_IF_NOT_OK_MSG(winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(session_options.get(), device.get(), queue.get(), true, true), ort_api);
   auto session = CreateUniqueOrtSession(session_options);
 
   LoadAndPurloinModel(session, "fns-candy.onnx");

From b06678a9cb986e0e9e102d5f15951d5ede3b6cf9 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 13 Jul 2023 08:08:04 -0700
Subject: [PATCH 50/76] Fix crash when the model has no operator registry

---
 winml/lib/Api/LearningModelSession.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/winml/lib/Api/LearningModelSession.cpp b/winml/lib/Api/LearningModelSession.cpp
index 24b3aff92824e..8dfb0769d3adf 100644
--- a/winml/lib/Api/LearningModelSession.cpp
+++ b/winml/lib/Api/LearningModelSession.cpp
@@ -119,9 +119,12 @@ void LearningModelSession::Initialize() {
     WINML_THROW_IF_FAILED(engine_builder->SetD3D12Resources(device_impl->GetD3DDevice(), device_impl->GetDeviceQueue()));
     WINML_THROW_IF_FAILED(engine_builder->SetMetacommandsEnabled(device_impl->MetacommandsEnabled()));
 
-    winrt::com_ptr<IMLOperatorRegistryPrivate> registryPrivate;
-    WINML_THROW_IF_FAILED(model_impl->GetOperatorRegistry()->QueryInterface(IID_PPV_ARGS(registryPrivate.put())));
-    WINML_THROW_IF_FAILED(engine_builder->SetBfcAllocatorEnabled(!registryPrivate->HasExternalOperators()));
+    if (model_impl->GetOperatorRegistry())
+    {
+      winrt::com_ptr<IMLOperatorRegistryPrivate> registryPrivate;
+      WINML_THROW_IF_FAILED(model_impl->GetOperatorRegistry()->QueryInterface(IID_PPV_ARGS(registryPrivate.put())));
+      WINML_THROW_IF_FAILED(engine_builder->SetBfcAllocatorEnabled(!registryPrivate->HasExternalOperators()));
+    }
   }
 
   auto num_intra_op_threads = device_impl->NumberOfIntraOpThreads();

From bf177f656590d8f011995136649d44b61ec8490e Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 13 Jul 2023 10:33:20 -0700
Subject: [PATCH 51/76] Fix prefast error: compute bufferSize as uint64_t

---
 winml/lib/Api.Image/VideoFrameToTensorConverter.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
index 5536f7df203b7..4767228579b0b 100644
--- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
+++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
@@ -498,8 +498,8 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
 
   assert(convertedSoftwareBitmap != nullptr);
 
-  uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2;
-  uint32_t bufferSize = static_cast<uint32_t>(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize);
+  uint64_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2;
+  uint64_t bufferSize = tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize;
 
   // TODO: Make an allocator for upload heaps
   if (!upload_heap_ || upload_heap_->GetDesc().Width < bufferSize) {

From cb2e420ce9de26a7af38f26f6490144070499f64 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 13 Jul 2023 11:42:57 -0700
Subject: [PATCH 52/76] Fix Bucketized allocator crash

---
 .../DmlExecutionProvider/src/BucketizedBufferAllocator.cpp   | 5 +++++
 .../src/DmlReservedResourceSubAllocator.cpp                  | 3 +--
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp       | 3 ++-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index c071e4bf0b8d3..675f17e4c28af 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -96,6 +96,11 @@ namespace Dml
     D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) const
     {
         auto allocationInfo = static_cast<AllocationInfo*>(opaquePointer);
+
+        // Make sure that we are aligned to 4 bytes to satisfy DML's requirements
+        constexpr uint64_t DML_ALIGNMENT = 4;
+        size_in_bytes = (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT;
+
         return D3D12BufferRegion(0, size_in_bytes, allocationInfo->GetD3D12Resource());
     }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index e58303a0bfbfc..0a63146286336 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -359,8 +359,7 @@ namespace Dml
 
         // Make sure that we are aligned to 4 bytes to satisfy DML's requirements
         constexpr uint64_t DML_ALIGNMENT = 4;
-        size_in_bytes =
-            (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT;
+        size_in_bytes = (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT;
 
         // Make sure the region we're trying to create fits entirely in the resource
         assert(it->second->GetD3D12Resource()->GetDesc().Width >= taggedPointer.offset + size_in_bytes);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 9ac474474f8b8..888f672d34d0c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -152,7 +152,7 @@ namespace Dml
         : m_d3d12Device(d3d12Device),
           m_dmlDevice(dmlDevice),
           m_areMetacommandsEnabled(enableMetacommands),
-          m_bfcAllocatorEnabled(enableBfcAllocator),
+          m_bfcAllocatorEnabled(false), // TODO (pavignol): Revert
           m_queue(queue)
     {
 
@@ -223,6 +223,7 @@ namespace Dml
                 D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
                 D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
+            // TODO (pavignol): Remove
             if (!m_bfcAllocatorEnabled)
             {
                 printf("*************BFC ALLOCATOR DISABLED!\n");

From 9c79b1b30d76beefd3e3119b425588578dabb553 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 13 Jul 2023 13:32:38 -0700
Subject: [PATCH 53/76] Address prefast errors

---
 .../src/BucketizedBufferAllocator.h           |  5 ++--
 .../src/DmlAllocationInfo.h                   |  2 +-
 .../DmlExecutionProvider/src/DmlBuffer.cpp    |  8 +++----
 .../dml/DmlExecutionProvider/src/DmlBuffer.h  |  7 +++---
 .../src/DmlBufferRegion.cpp                   |  6 ++---
 .../src/DmlBufferRegion.h                     |  6 ++---
 .../src/DmlGpuAllocator.cpp                   |  2 ++
 .../src/DmlReservedResourceSubAllocator.cpp   |  6 ++---
 .../src/DmlReservedResourceSubAllocator.h     |  5 ++--
 .../src/DmlResourceWrapper.h                  |  1 -
 .../src/ExecutionContext.cpp                  |  2 --
 .../src/ExecutionProvider.cpp                 | 24 +++++++------------
 .../src/FusedGraphKernel.cpp                  |  4 ++--
 .../src/MLOperatorAuthorImpl.cpp              | 10 ++++----
 .../src/Operators/DmlDFT.h                    | 22 ++++++++---------
 .../src/Operators/DmlGridSample.h             | 16 ++++++-------
 .../src/Operators/DmlSTFT.h                   |  6 ++---
 .../providers/dml/dml_provider_factory.cc     |  2 +-
 .../Api.Image/TensorToVideoFrameConverter.cpp |  6 ++---
 .../Api.Image/VideoFrameToTensorConverter.cpp |  2 +-
 20 files changed, 68 insertions(+), 74 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index f0fc570d4e1c4..17524c83c6094 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -19,8 +19,6 @@ namespace Dml
     class BucketizedBufferAllocator : public onnxruntime::IAllocator, public DmlSubAllocator
     {
     public:
-        ~BucketizedBufferAllocator();
-
         // Constructs a BucketizedBufferAllocator which allocates D3D12 committed resources with the specified heap properties,
         // resource flags, and initial resource state.
         BucketizedBufferAllocator(
@@ -46,6 +44,9 @@ namespace Dml
         void* Alloc(size_t size) final;
         void Free(void* p) final;
 
+    protected:
+        ~BucketizedBufferAllocator();
+
     private:
         static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
index ee203ba47056e..f61e59edd5159 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlAllocationInfo.h
@@ -38,7 +38,7 @@ namespace Dml
             return m_resourceWrapper->GetD3D12Resource();
         }
 
-        ComPtr<DmlResourceWrapper> DetachResourceWrapper() const
+        ComPtr<DmlResourceWrapper> DetachResourceWrapper()
         {
             return std::move(m_resourceWrapper);
         }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
index 0b670a22f9cbd..21b5da96ce236 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
@@ -25,7 +25,7 @@ DmlBuffer::~DmlBuffer()
     }
 }
 
-DmlBuffer::DmlBuffer(DmlBuffer&& other)
+DmlBuffer::DmlBuffer(DmlBuffer&& other) noexcept
 {
     m_opaqueData = other.m_opaqueData;
     allocator_ = other.allocator_;
@@ -33,7 +33,7 @@ DmlBuffer::DmlBuffer(DmlBuffer&& other)
     other.m_opaqueData = nullptr;
 }
 
-DmlBuffer& DmlBuffer::operator=(DmlBuffer&& other)
+DmlBuffer& DmlBuffer::operator=(DmlBuffer&& other) noexcept
 {
     m_opaqueData = other.m_opaqueData;
     allocator_ = other.allocator_;
@@ -42,9 +42,9 @@ DmlBuffer& DmlBuffer::operator=(DmlBuffer&& other)
     return *this;
 }
 
-ID3D12Resource* DmlBuffer::ResourceInUavState() const
+ID3D12Resource* DmlBuffer::GetD3D12Resource() const
 {
-    return buffer_region_.ResourceInUavState();
+    return buffer_region_.GetD3D12Resource();
 }
 
 uint64_t DmlBuffer::Offset() const
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
index 4b0dd58ce4467..019d186441da5 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
@@ -24,11 +24,10 @@ class DmlBuffer
     // Move-only
     DmlBuffer(const DmlBuffer&) = delete;
     DmlBuffer& operator=(const DmlBuffer&) = delete;
-    DmlBuffer(DmlBuffer&&);
-    DmlBuffer& operator=(DmlBuffer&&);
+    DmlBuffer(DmlBuffer&&) noexcept;
+    DmlBuffer& operator=(DmlBuffer&&) noexcept;
 
-    // TODO (pavignol): Rename to Resource()
-    ID3D12Resource* ResourceInUavState() const;
+    ID3D12Resource* GetD3D12Resource() const;
     uint64_t Offset() const;
     uint64_t SizeInBytes() const;
     const D3D12BufferRegion& Region() const { return buffer_region_; }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
index c33cc5491c7f0..57c4d5b342bb8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
@@ -27,14 +27,14 @@ namespace Dml
         assert(m_resource->GetDesc().Width == buffer_size);
     }
 
-    D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that)
+    D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) noexcept
     {
         std::swap(this->m_resource, that.m_resource);
         std::swap(this->offset_, that.offset_);
         std::swap(this->size_in_bytes_, that.size_in_bytes_);
     }
 
-    D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that)
+    D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) noexcept
     {
         std::swap(this->m_resource, that.m_resource);
         std::swap(this->offset_, that.offset_);
@@ -42,7 +42,7 @@ namespace Dml
         return *this;
     }
 
-    ID3D12Resource* D3D12BufferRegion::ResourceInUavState() const
+    ID3D12Resource* D3D12BufferRegion::GetD3D12Resource() const
     {
         return m_resource;
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
index 6c5cb37297caa..40c41f980b011 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
@@ -24,9 +24,9 @@ namespace Dml
         // Move-only
         D3D12BufferRegion(const D3D12BufferRegion&) = default;
         D3D12BufferRegion& operator=(const D3D12BufferRegion&) = default;
-        D3D12BufferRegion(D3D12BufferRegion&&);
-        D3D12BufferRegion& operator=(D3D12BufferRegion&&);
-        ID3D12Resource* ResourceInUavState() const;
+        D3D12BufferRegion(D3D12BufferRegion&&) noexcept;
+        D3D12BufferRegion& operator=(D3D12BufferRegion&&) noexcept;
+        ID3D12Resource* GetD3D12Resource() const;
 
         uint64_t Offset() const;
         uint64_t SizeInBytes() const;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
index 5ac6485a041ec..881478c3e874f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
@@ -89,8 +89,10 @@ namespace Dml
         {
         case ActiveAllocator::BfcAllocator:
             m_bfcSubAllocator->SetDefaultRoundingMode(roundingMode);
+            break;
         case ActiveAllocator::BucketizedBufferAllocator:
             m_bucketizedBufferAllocator->SetDefaultRoundingMode(roundingMode);
+            break;
         default:
             ORT_THROW_HR(E_UNEXPECTED);
         }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index 0a63146286336..4fdd6411d555a 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -148,9 +148,9 @@ namespace Dml
             // Target range in the current heap to map.
             const D3D12_TILE_RANGE_FLAGS tile_range_flags =
                 D3D12_TILE_RANGE_FLAG_NONE;
-            const uint32_t heap_range_start_offset = 0;
             const uint32_t heap_range_tile_count = static_cast<uint32_t>(heap_size_in_tiles);
 
+            constexpr uint32_t heap_range_start_offset = 0;
             constexpr uint32_t numResourceRegions = 1;
             constexpr uint32_t numHeapRanges = 1;
 
@@ -262,8 +262,8 @@ namespace Dml
     #endif
 
         // DML only has a single device in ORT at the moment
-        const uint64_t device_id = 0;
-        const uint64_t offset = 0;
+        constexpr uint64_t device_id = 0;
+        constexpr uint64_t offset = 0;
         return TaggedPointer::Pack(device_id, *allocationId, offset);
     }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index 3f2f1c9210c64..249de73de6487 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -70,8 +70,6 @@ namespace Dml
         bool TilingEnabled() const { return tiling_enabled_; };
         uint64_t GetUniqueId(void* opaquePointer);
 
-        ~DmlReservedResourceSubAllocator();
-
         // Constructs a DmlReservedResourceSubAllocator which allocates D3D12 committed resources with the specified heap properties,
         // resource flags, and initial resource state.
         DmlReservedResourceSubAllocator(
@@ -83,6 +81,9 @@ namespace Dml
         void* Alloc(size_t size);
         void Free(void* p);
 
+    protected:
+        ~DmlReservedResourceSubAllocator();
+
     private:
         static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
index 6ad57b055023c..876487242aa37 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlResourceWrapper.h
@@ -11,7 +11,6 @@ namespace Dml
     DmlResourceWrapper : public IUnknown
     {
     public:
-        // TODO (pavignol): Rename to GetResource()
         virtual ID3D12Resource* GetD3D12Resource() const = 0;
         virtual ~DmlResourceWrapper(){}
     };
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index 4ff464e0eef42..86f964651b638 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -43,8 +43,6 @@ namespace Dml
             // cannot be both in a source and destination state at the same time (without aliasing), we copy
             // the source resource to an intermediate one, and then copy the intermediate resource to the
             // destination resource.
-            // TODO (pavignol): Only do the intermediate copy when both resources at the same
-
             D3D12_HEAP_PROPERTIES heapProperties = {
                 D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0};
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 888f672d34d0c..e8b198d369c01 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -129,7 +129,7 @@ namespace Dml
     {
         ORT_TRY
         {
-            return GetBufferForTensor(tensor).ResourceInUavState();
+            return GetBufferForTensor(tensor).GetD3D12Resource();
         }
         ORT_CATCH_GENERIC
         {
@@ -152,7 +152,7 @@ namespace Dml
         : m_d3d12Device(d3d12Device),
           m_dmlDevice(dmlDevice),
           m_areMetacommandsEnabled(enableMetacommands),
-          m_bfcAllocatorEnabled(false), // TODO (pavignol): Revert
+          m_bfcAllocatorEnabled(enableBfcAllocator),
           m_queue(queue)
     {
 
@@ -223,12 +223,6 @@ namespace Dml
                 D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS,
                 D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
-            // TODO (pavignol): Remove
-            if (!m_bfcAllocatorEnabled)
-            {
-                printf("*************BFC ALLOCATOR DISABLED!\n");
-            }
-
             // Wrap the BFC allocator into our own allocator
             m_gpuAllocator = std::make_shared<DmlGpuAllocator>(
                 m_bfcAllocator.get(),
@@ -469,7 +463,7 @@ namespace Dml
             // CPU -> GPU copy (upload)
             //
             auto dstBufferRegion = GetBufferForTensor(dst);
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
+            ID3D12Resource* dstData = dstBufferRegion.GetD3D12Resource();
             const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t dstOffset = dstBufferRegion.Offset();
             m_uploadHeap->BeginUploadToGpu(dstData, dstOffset, dstState, AsByteSpan(src->GetData(), dataSizeInBytes));
@@ -481,7 +475,7 @@ namespace Dml
             // GPU -> CPU copy (readback)
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
+            ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource();
             const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t srcOffset = srcBufferRegion.Offset();
             m_readbackHeap->ReadbackFromGpu(AsByteSpan(dst->GetData(), dataSizeInBytes), srcData, srcOffset, srcState);
@@ -492,12 +486,12 @@ namespace Dml
             // GPU -> GPU copy
             //
             auto srcBufferRegion = GetBufferForTensor(src);
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
+            ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource();
             const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t srcOffset = srcBufferRegion.Offset();
 
             auto dstBufferRegion = GetBufferForTensor(dst);
-            ID3D12Resource* dstData = dstBufferRegion.ResourceInUavState();
+            ID3D12Resource* dstData = dstBufferRegion.GetD3D12Resource();
             const auto dstState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
             const uint64_t dstOffset = dstBufferRegion.Offset();
 
@@ -554,7 +548,7 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(src[i]);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
+            ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource();
             const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
 
             srcDatas.push_back(srcData);
@@ -581,7 +575,7 @@ namespace Dml
         if (mlTensor != nullptr)
         {
             auto dstBufferRegion = GetBufferForTensor(dst);
-            m_context->FillBufferWithPattern(dstBufferRegion.ResourceInUavState(), dstBufferRegion.Offset(), rawValue);
+            m_context->FillBufferWithPattern(dstBufferRegion.GetD3D12Resource(), dstBufferRegion.Offset(), rawValue);
         }
 
         return S_OK;
@@ -982,7 +976,7 @@ namespace Dml
 
             auto srcBufferRegion = GetBufferForTensor(&srcWrapper);
 
-            ID3D12Resource* srcData = srcBufferRegion.ResourceInUavState();
+            ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource();
             const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
 
             srcDatas.push_back(srcData);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index b00b8f8e19f52..e60845f8bb146 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -65,7 +65,7 @@ namespace Dml
             if (persistentResourceSize > 0)
             {
                 auto buffer = m_provider->AllocatePooledResource(persistentResourceSize);
-                m_persistentResource = buffer.ResourceInUavState();
+                m_persistentResource = buffer.GetD3D12Resource();
                 m_persistentResourceBinding = buffer.GetBufferBinding();
                 m_managedPersistentBuffer = wil::MakeOrThrow<DmlManagedBuffer>(std::move(buffer));
                 m_winmlProvider->QueueReference(m_managedPersistentBuffer.Get());
@@ -187,7 +187,7 @@ namespace Dml
                 {
                     bufferBindings.push_back(bufferRegion.GetBufferBinding());
 
-                    if (bufferRegion.ResourceInUavState() != nullptr)
+                    if (bufferRegion.GetD3D12Resource() != nullptr)
                     {
                         bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() });
                     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 1547bd99b6e20..86069139d7f69 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1710,8 +1710,8 @@ namespace Windows::AI::MachineLearning::Adapter
         else
         {
             auto bufferRegion = GetBufferRegion();
-            bufferRegion.ResourceInUavState()->AddRef();
-            *dataInterface = bufferRegion.ResourceInUavState();
+            bufferRegion.GetD3D12Resource()->AddRef();
+            *dataInterface = bufferRegion.GetD3D12Resource();
         }
     }
 
@@ -1768,7 +1768,7 @@ namespace Windows::AI::MachineLearning::Adapter
 
             for (auto& tempBuffer : m_temporaryBuffers)
             {
-                resourcesToTransition.push_back(tempBuffer.ResourceInUavState());
+                resourcesToTransition.push_back(tempBuffer.GetD3D12Resource());
             }
 
             m_winmlProvider->TransitionResourcesForOperator(
@@ -2146,8 +2146,8 @@ namespace Windows::AI::MachineLearning::Adapter
 
             auto dml_gpu_allocator = static_cast<Dml::DmlGpuAllocator*>(alloc.get());
             auto buffer = dml_gpu_allocator->AllocateDefaultBuffer(size);
-            buffer.ResourceInUavState()->AddRef();
-            *abiAllocation = buffer.ResourceInUavState();
+            buffer.GetD3D12Resource()->AddRef();
+            *abiAllocation = buffer.GetD3D12Resource();
 
             // Ensure the allocation is freed and transitioned when the context destructs
             m_temporaryBuffers.push_back(std::move(buffer));
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index 1e3035648adcb..9c8d8b4d539c9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -736,13 +736,13 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         D3D12_RESOURCE_BARRIER barriers[2];
 
         barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
+            inputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_COMMON,
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS
         );
 
         barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
+            outputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_COMMON,
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS
         );
@@ -786,13 +786,13 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
 
         // Transition resources to common state
         barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
+            inputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
             D3D12_RESOURCE_STATE_COMMON
         );
 
         barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
+            outputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
             D3D12_RESOURCE_STATE_COMMON
         );
@@ -821,13 +821,13 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         D3D12_RESOURCE_BARRIER barriers[2];
 
         barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
+            inputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_COMMON,
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS
         );
 
         barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
+            outputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_COMMON,
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS
         );
@@ -870,7 +870,7 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             constants.ElementCount = totalElementCount / constants.OutputSizes[3];
             constants.DFTIteration = index + 1;
             constants.ChirpLength = isLastPass ? chirpLength : 0;
-            constants.HasWindow = isFirstPass && windowBufferRegion.ResourceInUavState() != nullptr;
+            constants.HasWindow = isFirstPass && windowBufferRegion.GetD3D12Resource() != nullptr;
             auto window = constants.HasWindow ? windowBufferRegion : out;
             std::array<Dml::D3D12BufferRegion, 3> uav_resources = { in, out, window };
             Dispatch(uav_resources, constants, commandList);
@@ -878,13 +878,13 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
 
         // Transition resources to common state
         barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
+            inputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
             D3D12_RESOURCE_STATE_COMMON
         );
 
         barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
+            outputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
             D3D12_RESOURCE_STATE_COMMON
         );
@@ -911,7 +911,7 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
         std::transform(
             bufferRegions.begin(), bufferRegions.end(),
             uav_barriers,
-            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
+            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.GetD3D12Resource()); } );
         commandList->ResourceBarrier(TSize, uav_barriers);
 
         for (uint32_t i = 0; i < TSize; i++)
@@ -920,7 +920,7 @@ class GpuDFTOperator : public WRL::Base<IMLOperatorKernel>
             if (bufferRegions[i]) {
                 commandList->SetComputeRootUnorderedAccessView(
                     i, // root parameter index
-                    bufferRegions[i].ResourceInUavState()->GetGPUVirtualAddress() + bufferRegions[i].Offset()
+                    bufferRegions[i].GetD3D12Resource()->GetGPUVirtualAddress() + bufferRegions[i].Offset()
                 );
             }
             else
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
index 0611c4b7bf7f7..29cf439284479 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
@@ -687,19 +687,19 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         D3D12_RESOURCE_BARRIER barriers[3];
 
         barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
+            inputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_COMMON,
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS
         );
 
         barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            gridBufferRegion.ResourceInUavState(),
+            gridBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_COMMON,
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS
         );
 
         barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
+            outputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_COMMON,
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS
         );
@@ -729,19 +729,19 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
 
         // Transition resources to common state
         barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(
-            inputBufferRegion.ResourceInUavState(),
+            inputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
             D3D12_RESOURCE_STATE_COMMON
         );
 
         barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(
-            gridBufferRegion.ResourceInUavState(),
+            gridBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
             D3D12_RESOURCE_STATE_COMMON
         );
 
         barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(
-            outputBufferRegion.ResourceInUavState(),
+            outputBufferRegion.GetD3D12Resource(),
             D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
             D3D12_RESOURCE_STATE_COMMON
         );
@@ -768,7 +768,7 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
         std::transform(
             bufferRegions.begin(), bufferRegions.end(),
             uav_barriers,
-            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.ResourceInUavState()); } );
+            [](auto& bufferRegion) { return CD3DX12_RESOURCE_BARRIER::UAV(bufferRegion.GetD3D12Resource()); } );
         commandList->ResourceBarrier(TSize, uav_barriers);
 
         for (uint32_t i = 0; i < TSize; i++)
@@ -777,7 +777,7 @@ class DmlGridSampleOperator : public WRL::Base<IMLOperatorKernel>
             if (bufferRegions[i]) {
                 commandList->SetComputeRootUnorderedAccessView(
                     i, // root parameter index
-                    bufferRegions[i].ResourceInUavState()->GetGPUVirtualAddress() + bufferRegions[i].Offset()
+                    bufferRegions[i].GetD3D12Resource()->GetGPUVirtualAddress() + bufferRegions[i].Offset()
                 );
             }
             else
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
index 945b58965cf2f..780a6fe0f5223 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
@@ -413,7 +413,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
         Dml::D3D12BufferRegion signalBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Signal);
         inputBuffers[0] = signalBufferRegion.GetBufferBinding();
         inputBindings[0] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[0] };
-        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(signalBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         Dml::D3D12BufferRegion windowBufferRegion;
         if (m_framingOperator.hasWindowTensor)
@@ -421,7 +421,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
             windowBufferRegion = DmlSTFTHelpers::GetInputBufferRegionFromKernelContext(context, DmlSTFTKernelInputIndex::Window);
             inputBuffers[1] = windowBufferRegion.GetBufferBinding();
             inputBindings[1] = { DML_BINDING_TYPE_BUFFER, &inputBuffers[1] };
-            barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+            barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(windowBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
             inputBindingsCount++;
         }
 
@@ -429,7 +429,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
 
         DML_BUFFER_BINDING outputBuffer = outputBufferRegion.GetBufferBinding();
         DML_BINDING_DESC outputBinding = { DML_BINDING_TYPE_BUFFER, &outputBuffer };
-        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.ResourceInUavState(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+        barriers[barrierCount++] = CD3DX12_RESOURCE_BARRIER::Transition(outputBufferRegion.GetD3D12Resource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
 
         m_framingOperator.bindingTable->BindOutputs(1, &outputBinding);
 
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index b2d02715bb91d..6d723a6a6b948 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -266,7 +266,7 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceRegionFromAllocation,
     ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT);
     auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), allocation, size_in_bytes);
     *offset = bufferRegion.Offset();
-    *d3d_resource = bufferRegion.ResourceInUavState();
+    *d3d_resource = bufferRegion.GetD3D12Resource();
   }
 
   (*d3d_resource)->AddRef();
diff --git a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp
index a2d7ff6947f9a..5b0cbd414dd7a 100644
--- a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp
+++ b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp
@@ -564,8 +564,8 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap(
     telemetryLogger.emplace(tensorDesc);
   }
 
-  uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2;
-  uint32_t singleVideoFramebufferSize = static_cast<uint32_t>(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize);
+  uint64_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2;
+  uint64_t singleVideoFramebufferSize = tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize;
 
   // TODO: Make an allocator for readback heaps
   if (!readback_heap_ || readback_heap_->GetDesc().Width < singleVideoFramebufferSize) {
@@ -582,7 +582,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap(
 
   auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pInputTensor, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
   command_list_->ResourceBarrier(1, &barrier);
-  command_list_->CopyBufferRegion(readback_heap_.Get(), 0, pInputTensor, inputTensorOffset + singleVideoFramebufferSize * batchIdx, singleVideoFramebufferSize);
+  command_list_->CopyBufferRegion(readback_heap_.Get(), 0, pInputTensor, inputTensorOffset + singleVideoFramebufferSize * static_cast<uint64_t>(batchIdx), singleVideoFramebufferSize);
 
   WINML_THROW_IF_FAILED(command_list_->Close());
   ID3D12CommandList* ppCommandLists[] = {command_list_.Get()};
diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
index 4767228579b0b..becb47ed6d56c 100644
--- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
+++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
@@ -527,7 +527,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
   auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST);
   command_list_->ResourceBarrier(1, &barrier);
 
-  command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx + outputResourceOffset, upload_heap_.Get(), 0, bufferSize);
+  command_list_->CopyBufferRegion(pOutputResource, bufferSize * static_cast<uint64_t>(batchIdx) + outputResourceOffset, upload_heap_.Get(), 0, bufferSize);
 
   WINML_THROW_IF_FAILED(command_list_->Close());
   ID3D12CommandList* ppCommandLists[] = {command_list_.Get()};

From 8f37e382260c91019aa8a53f7b8a24834b973d24 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 13 Jul 2023 13:41:49 -0700
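Subject: [PATCH 54/76] Fix destructors

Declare the destructors of BucketizedBufferAllocator and
DmlReservedResourceSubAllocator as public instead of protected, and add
a defaulted virtual destructor to the DmlSubAllocator interface.
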
---
 .../dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h | 5 ++---
 .../src/DmlReservedResourceSubAllocator.h                    | 5 ++---
 .../providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h | 1 +
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index 17524c83c6094..f0fc570d4e1c4 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -19,6 +19,8 @@ namespace Dml
     class BucketizedBufferAllocator : public onnxruntime::IAllocator, public DmlSubAllocator
     {
     public:
+        ~BucketizedBufferAllocator();
+
         // Constructs a BucketizedBufferAllocator which allocates D3D12 committed resources with the specified heap properties,
         // resource flags, and initial resource state.
         BucketizedBufferAllocator(
@@ -44,9 +46,6 @@ namespace Dml
         void* Alloc(size_t size) final;
         void Free(void* p) final;
 
-    protected:
-        ~BucketizedBufferAllocator();
-
     private:
         static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index 249de73de6487..3f2f1c9210c64 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -70,6 +70,8 @@ namespace Dml
         bool TilingEnabled() const { return tiling_enabled_; };
         uint64_t GetUniqueId(void* opaquePointer);
 
+        ~DmlReservedResourceSubAllocator();
+
         // Constructs a DmlReservedResourceSubAllocator which allocates D3D12 committed resources with the specified heap properties,
         // resource flags, and initial resource state.
         DmlReservedResourceSubAllocator(
@@ -81,9 +83,6 @@ namespace Dml
         void* Alloc(size_t size);
         void Free(void* p);
 
-    protected:
-        ~DmlReservedResourceSubAllocator();
-
     private:
         static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h
index 580830ea1a90f..d6aa49d51c3f8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlSubAllocator.h
@@ -11,5 +11,6 @@ namespace Dml
     {
     public:
         virtual void FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) = 0;
+        virtual ~DmlSubAllocator() = default;
     };
 }

From a67641c259aa94fab8417ce9d6b167cae96363ff Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 13 Jul 2023 17:11:25 -0700
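Subject: [PATCH 55/76] Fix typo

The comment in CommandQueue.cpp about queueing references while the
queue is closing now names BucketizedBufferAllocator, rather than
DmlReservedResourceSubAllocator, as its example allocator.
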
---
 .../providers/dml/DmlExecutionProvider/src/CommandQueue.cpp     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
index e5084772d4063..95190e9dca2a2 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
@@ -51,7 +51,7 @@ namespace Dml
         // If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK
         // to queue additional references at this time, since those references would be leaked. This
         // affects any objects in m_queuedReferences whose destructors indirectly call QueueReference;
-        // for example, an allocation from DmlReservedResourceSubAllocator attempts to queue a reference
+        // for example, an allocation from BucketizedBufferAllocator attempts to queue a reference
         // to its underlying D3D resource when freed. Furthermore, these references are unnecessary
         // since Close() already blocks for scheduled GPU work before clearing m_queuedReferences.
         if (!m_closing)

From e4e34e0cb513dfdf4bf0e665d1032bbf1e0d6acc Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Sat, 29 Jul 2023 14:08:07 -0700
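Subject: [PATCH 56/76] Fix build break

Remove the leftover merge-conflict markers from
OnnxruntimeDmlSessionBuilder.cpp, keeping the
OrtSessionOptionsAppendExecutionProvider_DML call that passes
bfc_allocator_enabled_, and fix the indentation of the neighbouring
comments. Also delete seven lines from SqueezeNetValidator.cpp.
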
---
 winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp | 14 +++-----------
 winml/test/common/SqueezeNetValidator.cpp          |  7 -------
 2 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp
index 5c96cb750f2b9..a985aa002c915 100644
--- a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp
+++ b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp
@@ -39,29 +39,21 @@ OnnxruntimeDmlSessionBuilder::CreateSessionOptions(OrtSessionOptions** options)
 
   auto session_options = UniqueOrtSessionOptions(ort_options, ort_api->ReleaseSessionOptions);
 
-    // set the graph optimization level to all (used to be called level 3)
+  // set the graph optimization level to all (used to be called level 3)
   RETURN_HR_IF_NOT_OK_MSG(
     ort_api->SetSessionGraphOptimizationLevel(session_options.get(), GraphOptimizationLevel::ORT_ENABLE_ALL), ort_api
   );
 
-    // Disable the mem pattern session option for DML. It will cause problems with how memory is allocated.
+  // Disable the mem pattern session option for DML. It will cause problems with how memory is allocated.
   RETURN_HR_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api);
 
-    // Request the dml ep
-<<<<<<< HEAD
+  // Request the dml ep
   RETURN_HR_IF_NOT_OK_MSG(
     winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(
       session_options.get(), device_.get(), queue_.get(), metacommands_enabled_, bfc_allocator_enabled_
     ),
     ort_api
   );
-  == == == = RETURN_HR_IF_NOT_OK_MSG(
-             winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(
-               session_options.get(), device_.get(), queue_.get(), metacommands_enabled_
-             ),
-             ort_api
-           );
->>>>>>> 92b6e10d37c50638d59620c5a315c6e75b47131c
 
 #ifndef _WIN64
   auto use_arena = false;
diff --git a/winml/test/common/SqueezeNetValidator.cpp b/winml/test/common/SqueezeNetValidator.cpp
index 5c50651d92e6b..d0c43c1c9775e 100644
--- a/winml/test/common/SqueezeNetValidator.cpp
+++ b/winml/test/common/SqueezeNetValidator.cpp
@@ -190,13 +190,6 @@ void ModelValidator::SqueezeNet(
   LearningModelSession modelSession = nullptr;
   modelSession = LearningModelSession(model, LearningModelDevice(deviceKind));
 
-  // WinML model creation
-  LearningModel model = nullptr;
-  model = LearningModel::LoadFromFilePath(fullModelPath);
-
-  LearningModelSession modelSession = nullptr;
-  modelSession = LearningModelSession(model, LearningModelDevice(deviceKind));
-
   LearningModelBinding modelBinding(modelSession);
 
   if (bindAsImage) {

From 0b4cee09972706f17f73a86fd550fa7a09ebe96a Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Sat, 29 Jul 2023 20:55:27 -0700
Subject: [PATCH 57/76] Fix lint errors

---
 winml/adapter/winml_adapter_c_api.h                   | 6 +++++-
 winml/adapter/winml_adapter_dml.cpp                   | 2 +-
 winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h | 4 +---
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/winml/adapter/winml_adapter_c_api.h b/winml/adapter/winml_adapter_c_api.h
index d8fd6276a75f7..fac53da21b668 100644
--- a/winml/adapter/winml_adapter_c_api.h
+++ b/winml/adapter/winml_adapter_c_api.h
@@ -294,7 +294,11 @@ struct WinmlAdapterApi {
 	 * This api is used to add the DML EP to OrtSessionOptions.
     */
   OrtStatus*(ORT_API_CALL* OrtSessionOptionsAppendExecutionProvider_DML)(
-    _In_ OrtSessionOptions* options, ID3D12Device* device, ID3D12CommandQueue* queue, bool metacommands_enabled, bool bfc_allocator_enabled
+    _In_ OrtSessionOptions* options,
+    ID3D12Device* device,
+    ID3D12CommandQueue* queue,
+    bool metacommands_enabled,
+    bool bfc_allocator_enabled
   )NO_EXCEPTION;
 
   // OrtSession methods
diff --git a/winml/adapter/winml_adapter_dml.cpp b/winml/adapter/winml_adapter_dml.cpp
index 25bbe95a66a2c..f32507a308e3c 100644
--- a/winml/adapter/winml_adapter_dml.cpp
+++ b/winml/adapter/winml_adapter_dml.cpp
@@ -73,7 +73,7 @@ void DmlConfigureProviderFactoryDefaultRoundingMode(
 );
 void DmlConfigureProviderFactoryMetacommandsEnabled(IExecutionProviderFactory* factory, bool metacommandsEnabled);
 void DmlConfigureProviderFactoryBfcAllocatorEnabled(IExecutionProviderFactory* factory, bool bfc_allocator_enabled);
-}
+} // namespace onnxruntime
 
 #endif  // USE_DML
 
diff --git a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h
index 167aeffb483ec..4433dfaab299d 100644
--- a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h
+++ b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h
@@ -86,9 +86,7 @@ class VideoFrameToTensorConverter : public ImageConverter {
   );
 
   static D3D12_UNORDERED_ACCESS_VIEW_DESC CreateUAVDescription(
-    uint64_t offset,
-    const UINT32 batch_index,
-    const ImageTensorDescription& description
+    uint64_t offset, const UINT32 batch_index, const ImageTensorDescription& description
   );
 
   static void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor(

From e14797c5e84ab4cd536036353e84ac286820a1fd Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Sat, 29 Jul 2023 22:15:32 -0700
Subject: [PATCH 58/76] Fix iobinding crash

---
 .../src/DmlExternalGpuAllocator.cpp           | 43 ++++++++++++++++---
 .../src/DmlExternalGpuAllocator.h             |  5 ++-
 .../src/ExecutionProvider.cpp                 |  2 +-
 3 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp
index 0ebe2c3d00e5e..c30b4d19d2f73 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp
@@ -5,29 +5,58 @@
 
 #include "precomp.h"
 #include "DmlExternalGpuAllocator.h"
+#include "DmlResourceWrapper.h"
+#include "DmlCommittedResourceWrapper.h"
+#include "DmlAllocationInfo.h"
 
 namespace Dml
 {
-    DmlExternalGpuAllocator::DmlExternalGpuAllocator()
+    DmlExternalGpuAllocator::DmlExternalGpuAllocator(ID3D12Device* device)
     : onnxruntime::IAllocator(
         OrtMemoryInfo(
             onnxruntime::DML,
             OrtAllocatorType::OrtDeviceAllocator,
             OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, 0),
             -1
-        )
-    ) {}
+        )),
+        m_device(device)
+    {
+    }
 
     void* DmlExternalGpuAllocator::Alloc(size_t size_in_bytes)
     {
-        // This allocator should never be used to allocate memory; it should only be use to decode the opaque data pointer
-        THROW_HR(E_INVALIDARG);
+        Microsoft::WRL::ComPtr<ID3D12Resource> resource;
+        auto buffer = CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
+        auto props = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT);
+        ORT_THROW_IF_FAILED(m_device->CreateCommittedResource(
+            &props,
+            D3D12_HEAP_FLAG_NONE,
+            &buffer,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            nullptr,
+            IID_GRAPHICS_PPV_ARGS(resource.GetAddressOf())
+        ));
+
+        const uint64_t resourceWidth = resource->GetDesc().Width;
+        constexpr uint64_t pooledResourceId = 0; // Not a pooled resource
+
+        Microsoft::WRL::ComPtr<DmlResourceWrapper> resourceWrapper;
+        wil::MakeOrThrow<DmlCommittedResourceWrapper>(std::move(resource)).As(&resourceWrapper);
+
+        Microsoft::WRL::ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
+            nullptr,
+            0,
+            pooledResourceId,
+            resourceWrapper.Get(),
+            static_cast<size_t>(resourceWidth));
+
+        return allocInfo.Detach();
     }
 
     void DmlExternalGpuAllocator::Free(void* ptr)
     {
-        // This allocator should never be used to free memory; it should only be use to decode the opaque data pointer
-        THROW_HR(E_INVALIDARG);
+        Microsoft::WRL::ComPtr<AllocationInfo> resource;
+        resource.Attach(static_cast<AllocationInfo*>(ptr));
     }
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
index 1c4d4b36628eb..9dbb87ef04aa2 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
@@ -14,9 +14,12 @@ namespace Dml
     class DmlExternalGpuAllocator : public onnxruntime::IAllocator
     {
     public:
-        DmlExternalGpuAllocator();
+        DmlExternalGpuAllocator(ID3D12Device* device);
 
         void* Alloc(size_t size_in_bytes) final;
         void Free(void* ptr) final;
+
+    private:
+        Microsoft::WRL::ComPtr<ID3D12Device> m_device;
     };
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index e8b198d369c01..d824aa8185705 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -232,7 +232,7 @@ namespace Dml
             m_context->SetAllocator(m_gpuAllocator);
             // CPU Allocator used to create buffers for the MemcpyFromHost, Shape and Size operators.
             m_cpuInputAllocator = std::make_shared<DmlCpuAllocator>(OrtMemType::OrtMemTypeCPUInput);
-            m_externalGpuAllocator = std::make_shared<DmlExternalGpuAllocator>();
+            m_externalGpuAllocator = std::make_shared<DmlExternalGpuAllocator>(m_d3d12Device.Get());
         }
 
         return std::vector<onnxruntime::AllocatorPtr>{m_gpuAllocator, m_externalGpuAllocator, m_cpuInputAllocator};

From 16c9524ba865f2d16192963a227c8e1db0af01c7 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Mon, 31 Jul 2023 18:05:17 -0700
Subject: [PATCH 59/76] Add aliasing support to DmlOperatorCopy

---
 .../src/MLOperatorAuthorImpl.cpp              | 18 ++++++-
 .../src/MLOperatorAuthorImpl.h                | 12 +++--
 .../src/Operators/DmlOperatorCopy.cpp         | 49 ++++++++++++++++---
 .../MLOperatorAuthorPrivate.h                 | 12 +++++
 4 files changed, 80 insertions(+), 11 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 86069139d7f69..320f527f7ea46 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -557,7 +557,7 @@ namespace Windows::AI::MachineLearning::Adapter
         const AttributeMap* defaultAttributes,
         gsl::span<const uint32_t> requiredConstantCpuInputs,
         MLOperatorTensorGetter& constantInputGetter,
-        const onnxruntime::OpKernelContext* kernelContext
+        onnxruntime::OpKernelContext* kernelContext
         )
     :   OpNodeInfoWrapper(kerneInfo, inputShapeOverrides, defaultAttributes, requiredConstantCpuInputs, constantInputGetter, kernelContext),
         m_inferredOutputShapes(inferredOutputShapes),
@@ -1335,6 +1335,22 @@ namespace Windows::AI::MachineLearning::Adapter
         return m_allowOutputShapeQuery;
     }
 
+    HRESULT STDMETHODCALLTYPE OpKernelInfoWrapper::InputAliasesOutput(
+        uint32_t inputIndex,
+        uint32_t outputIndex,
+        const onnxruntime::TensorShape& outputShape,
+        bool* aliasing) noexcept
+    {
+        ORT_TRY
+        {
+            auto inputData = m_kernelContext->Input<onnxruntime::Tensor>(inputIndex)->DataRaw();
+            auto outputData = m_kernelContext->Output(outputIndex, outputShape)->DataRaw();
+            *aliasing = inputData == outputData;
+            return S_OK;
+        }
+        ORT_CATCH_RETURN
+    }
+
     DmlGraphOpKernelInfoWrapper::DmlGraphOpKernelInfoWrapper(
         const onnxruntime::OpNodeProtoHelper<onnxruntime::ProtoHelperNodeContext>* protoHelper,
         const void* executionHandle,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index 85b6b197fe511..c378db886f690 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -176,7 +176,7 @@ class OpNodeInfoWrapper : public Base1_t, public Base2_t, public Closable
         const AttributeMap* defaultAttributes,
         gsl::span<const uint32_t> requiredConstantCpuInputs,
         MLOperatorTensorGetter& constantInputGetter,
-        const onnxruntime::OpKernelContext* kernelContext = nullptr
+        onnxruntime::OpKernelContext* kernelContext = nullptr
         )
     :   m_impl(impl),
         m_kernelContext(kernelContext),
@@ -245,7 +245,7 @@ class OpNodeInfoWrapper : public Base1_t, public Base2_t, public Closable
  protected:
     // Lifetime is managed by the caller and guaranteed to outlive this class
     const onnxruntime::OpNodeProtoHelper<NodeInfoImpl_t>* m_impl = nullptr;
-    const onnxruntime::OpKernelContext* m_kernelContext = nullptr;
+    mutable onnxruntime::OpKernelContext* m_kernelContext = nullptr;
 
  private:
     template <MLOperatorAttributeType T>
@@ -362,7 +362,7 @@ class OpKernelInfoWrapper : public OpNodeInfoWrapper<
         const AttributeMap* defaultAttributes,
         gsl::span<const uint32_t> requiredConstantCpuInputs,
         MLOperatorTensorGetter& constantInputGetter,
-        const onnxruntime::OpKernelContext* kernelContext = nullptr
+        onnxruntime::OpKernelContext* kernelContext = nullptr
     );
 
     // HasTensorShapeDescription returns false if and only if the kernel is registered using
@@ -405,6 +405,12 @@ class OpKernelInfoWrapper : public OpNodeInfoWrapper<
         return m_winmlProvider.CopyTo(executionProvider);
     }
 
+    HRESULT STDMETHODCALLTYPE InputAliasesOutput(
+        uint32_t inputIndex,
+        uint32_t outputIndex,
+        const onnxruntime::TensorShape& outputShape,
+        bool* aliasing) noexcept override;
+
 private:
     // For shape info, in addition to the info
     const EdgeShapes* m_inferredOutputShapes = nullptr;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
index f8ef496b74d9b..7ac944616d26b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
@@ -16,7 +16,7 @@ class DmlOperatorCopy : public DmlOperator
         ML_CHECK_VALID_ARGUMENT(kernelInfo.GetInputCount() >= 1);
         ML_CHECK_VALID_ARGUMENT(kernelInfo.GetOutputCount() == 1);
 
-        std::vector<std::optional<uint32_t>> kernelInputOutputIndices  = {0};
+        std::vector<std::optional<uint32_t>> kernelInputOutputIndices = {0};
 
         Initialize(kernelInfo, kernelInputOutputIndices);
 
@@ -29,14 +29,49 @@ class DmlOperatorCopy : public DmlOperator
         ComPtr<IMLOperatorKernelCreationContextPrivate> contextPrivate;
         ORT_THROW_IF_FAILED(kernelInfo.GetInterface()->QueryInterface(contextPrivate.GetAddressOf()));
 
-        std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
-        std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
+        // We don't need to compile any operator if the input aliases the output as it is essentially a no-op
+        // (e.g. squeeze/unsqueeze/reshape). An exception to this rule is when the operator is part of the graph,
+        // in which case we always need to compile and execute the operator (although this is something that we
+        // could optimize in the future).
 
-        DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {};
-        opDesc.InputTensor = inputDescs.data();
-        opDesc.OutputTensor = outputDescs.data();
+        bool aliasing = false;
 
-        SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo);
+        if (!contextPrivate->IsDmlGraphNode())
+        {
+            std::vector<uint32_t> outputSizes = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0);
+            std::vector<int64_t> outputSizesInt64(outputSizes.begin(), outputSizes.end());
+            onnxruntime::TensorShape outputShape(outputSizesInt64);
+            ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputAliasesOutput(0, 0, outputShape, &aliasing));
+        }
+
+        if (!aliasing)
+        {
+            std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
+            std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
+
+            DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {};
+            opDesc.InputTensor = inputDescs.data();
+            opDesc.OutputTensor = outputDescs.data();
+
+            SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo);
+        }
+    }
+
+    void Compute(const MLOperatorKernelContext& kernelContext)
+    {
+        MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0);
+
+        // Reshape the output tensor.
+        MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0);
+
+        // Avoid self copying.
+        if (inputTensor.GetByteData() != outputTensor.GetByteData())
+        {
+            // Copy elements from input tensor to output tensor.
+            ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor(
+                outputTensor.GetInterface().Get(),
+                inputTensor.GetInterface().Get()));
+        }
     }
 };
 
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
index 9909be1f8337f..d86fdff8ac7e1 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
@@ -10,6 +10,11 @@ struct DML_INPUT_GRAPH_EDGE_DESC;
 struct DML_OUTPUT_GRAPH_EDGE_DESC;
 struct DML_INTERMEDIATE_GRAPH_EDGE_DESC;
 
+namespace onnxruntime
+{
+    class TensorShape;
+}
+
 // Either nodesAsOpDesc or nodesAsIDMLOperator is present.
 //  1) Operator kernels which implement operators using only a single DML operator will pass a DML_OPERATOR_DESC.
 //     These kernels pass DML_OPERATOR_DESC, because while building Dml graph (inside FusedGraphKernel.cpp) we can change the
@@ -106,6 +111,13 @@ IMLOperatorKernelCreationContextNodeWrapperPrivate : public IMLOperatorKernelCre
     STDMETHOD(GetExecutionProvider)(
         _Outptr_result_maybenull_ IUnknown** executionProvider
         ) const noexcept PURE;
+
+    STDMETHOD(InputAliasesOutput)(
+        _In_ uint32_t inputIndex,
+        _In_ uint32_t outputIndex,
+        _In_ const onnxruntime::TensorShape& outputShape,
+        _Out_ bool* aliasing
+    ) noexcept PURE;
 };
 
 //! \interface IMLOperatorAttributes1

From a95505f024b18d3cfe68e41e3751951bfb66aca2 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Mon, 31 Jul 2023 19:57:56 -0700
Subject: [PATCH 60/76] Use identity instead of 2 copies

---
 .../src/Operators/DmlOperatorCopy.cpp                    | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
index 7ac944616d26b..bb45cdfef25c1 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
@@ -57,20 +57,15 @@ class DmlOperatorCopy : public DmlOperator
         }
     }
 
-    void Compute(const MLOperatorKernelContext& kernelContext)
+    void Compute(const MLOperatorKernelContext& kernelContext) final
     {
         MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0);
-
-        // Reshape the output tensor.
         MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0);
 
         // Avoid self copying.
         if (inputTensor.GetByteData() != outputTensor.GetByteData())
         {
-            // Copy elements from input tensor to output tensor.
-            ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor(
-                outputTensor.GetInterface().Get(),
-                inputTensor.GetInterface().Get()));
+            DmlOperator::Compute(kernelContext);
         }
     }
 };

From 759442b0f7a32a0a8ed5c81ee98193015c20428e Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 2 Aug 2023 15:17:17 -0700
Subject: [PATCH 61/76] Enable copy-less I/O binding

---
 onnxruntime/core/framework/utils.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc
index d762211f7816b..3d67e49e155d9 100644
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@@ -161,6 +161,14 @@ static Status BatchOrCopyMLValue(const SessionState& session_state,
     return Status::OK();
   }
 
+#ifdef USE_DML
+  // The DML EP supports binding external allocations directly, even if the memory types don't match, as long as they are on the same D3D12 device
+  if (copy_info.source_device.Type() == OrtDevice::GPU && copy_info.target_device.Type() == OrtDevice::GPU && copy_info.source_device.Id() == copy_info.target_device.Id()) {
+    target_mlvalue = source_mlvalue;
+    return Status::OK();
+  }
+#endif
+
   auto allocator = session_state.GetAllocator(copy_info.target_device);
   if (!target_mlvalue.IsAllocated()) {
     ORT_ENFORCE(allocator != nullptr, "Failed to find allocator for device ", copy_info.target_device.ToString());

From 9f3e430aa220d8d69e25ed50333d027d728012f9 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 2 Aug 2023 15:17:55 -0700
Subject: [PATCH 62/76] Fix nonzero coordinates operator

---
 .../src/Operators/DmlOperatorNonZero.cpp                     | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp
index 61623dfe2b4dd..c9d215d097b4e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp
@@ -122,11 +122,8 @@ class DmlOperatorNonZero: public DmlOperator
 
         if (!m_emptyInput && nonzeroElementCount > 0)
         {
-            std::vector<DimensionType> outputCoordinatesStrides = {nonzeroElementCount * 2, 2};
-            TensorDesc stridedOutputTensorDesc(DML_TENSOR_DATA_TYPE_UINT32, outputSizes, outputCoordinatesStrides);
-
             // TODO: Remove this hack when DML supports native int64 for NonZero
-            m_zeroOperator = InitializeZeroInt64Tensor(stridedOutputTensorDesc.GetBufferSizeInBytes());
+            m_zeroOperator = InitializeZeroInt64Tensor(m_rank * nonzeroElementCount * sizeof(int64_t));
             ExecuteZeroInt64Tensor(m_zeroOperator.Get(), outputTensor.GetInterface().Get());
 
             ComPtr<IDMLCompiledOperator> sliceOperator = InitializeSlice(m_intermediateTensorDescs[1], nonzeroElementCount);

From 2da8999b4d57d92ec949eaf6a5f10dab1e59e5af Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 2 Aug 2023 23:11:10 -0700
Subject: [PATCH 63/76] Fix If test crash

---
 .../src/MLOperatorAuthorImpl.cpp              | 45 +++++++++++++++++++
 .../src/MLOperatorAuthorImpl.h                |  6 +++
 .../src/Operators/DmlOperatorCopy.cpp         | 35 +++++++++++----
 .../MLOperatorAuthorPrivate.h                 |  7 +++
 4 files changed, 84 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 320f527f7ea46..0f29409df1e2b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1343,6 +1343,12 @@ namespace Windows::AI::MachineLearning::Adapter
     {
         ORT_TRY
         {
+            if (!m_kernelContext)
+            {
+                *aliasing = false;
+                return S_OK;
+            }
+
             auto inputData = m_kernelContext->Input<onnxruntime::Tensor>(inputIndex)->DataRaw();
             auto outputData = m_kernelContext->Output(outputIndex, outputShape)->DataRaw();
             *aliasing = inputData == outputData;
@@ -1351,6 +1357,45 @@ namespace Windows::AI::MachineLearning::Adapter
         ORT_CATCH_RETURN
     }
 
+    HRESULT STDMETHODCALLTYPE OpKernelInfoWrapper::InputSharesOutputBuffer(
+        uint32_t inputIndex,
+        uint32_t outputIndex,
+        const onnxruntime::TensorShape& outputShape,
+        bool* sharesOutputBuffer) noexcept
+    {
+        ORT_TRY
+        {
+            if (!m_kernelContext)
+            {
+                *sharesOutputBuffer = false;
+                return S_OK;
+            }
+
+            auto inputTensor = const_cast<onnxruntime::Tensor*>(m_kernelContext->Input<onnxruntime::Tensor>(inputIndex));
+            auto outputTensor = m_kernelContext->Output(outputIndex, outputShape);
+
+            // Null input or output data means that the tensors are empty (i.e. one of the dimensions is 0)
+            if (inputTensor->DataRaw() == nullptr || outputTensor->DataRaw() == nullptr)
+            {
+                *sharesOutputBuffer = false;
+                return S_OK;
+            }
+
+            auto inputWrapper = wil::MakeOrThrow<TensorWrapper>(inputTensor, true, m_winmlProvider.Get(), true);
+            auto outputWrapper = wil::MakeOrThrow<TensorWrapper>(outputTensor, true, m_winmlProvider.Get(), true);
+
+            ComPtr<IUnknown> inputResource;
+            inputWrapper->GetDataInterface(inputResource.GetAddressOf());
+
+            ComPtr<IUnknown> outputResource;
+            outputWrapper->GetDataInterface(outputResource.GetAddressOf());
+
+            *sharesOutputBuffer = inputResource.Get() == outputResource.Get();
+            return S_OK;
+        }
+        ORT_CATCH_RETURN
+    }
+
     DmlGraphOpKernelInfoWrapper::DmlGraphOpKernelInfoWrapper(
         const onnxruntime::OpNodeProtoHelper<onnxruntime::ProtoHelperNodeContext>* protoHelper,
         const void* executionHandle,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index f9c0afd20f7e4..e12a4435bd747 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -411,6 +411,12 @@ class OpKernelInfoWrapper : public OpNodeInfoWrapper<
         const onnxruntime::TensorShape& outputShape,
         bool* aliasing) noexcept override;
 
+    HRESULT STDMETHODCALLTYPE InputSharesOutputBuffer(
+        uint32_t inputIndex,
+        uint32_t outputIndex,
+        const onnxruntime::TensorShape& outputShape,
+        bool* sharesOutputBuffer) noexcept override;
+
 private:
     // For shape info, in addition to the info
     const EdgeShapes* m_inferredOutputShapes = nullptr;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
index bb45cdfef25c1..c55cf60abb873 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
@@ -33,18 +33,16 @@ class DmlOperatorCopy : public DmlOperator
         // (e.g. squeeze/unsqueeze/reshape). An exception to this rule is when the operator is part of the graph,
         // in which case we always need to compile and execute the operator (although this is something that we
         // could optimize in the future).
-
-        bool aliasing = false;
-
         if (!contextPrivate->IsDmlGraphNode())
         {
             std::vector<uint32_t> outputSizes = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0);
             std::vector<int64_t> outputSizesInt64(outputSizes.begin(), outputSizes.end());
             onnxruntime::TensorShape outputShape(outputSizesInt64);
-            ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputAliasesOutput(0, 0, outputShape, &aliasing));
+            ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputAliasesOutput(0, 0, outputShape, &m_aliasing));
+            ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputSharesOutputBuffer(0, 0, outputShape, &m_inputSharesOutputBuffer));
         }
 
-        if (!aliasing)
+        if (contextPrivate->IsDmlGraphNode() || (!m_aliasing && m_inputSharesOutputBuffer))
         {
             std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
             std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
@@ -59,15 +57,34 @@ class DmlOperatorCopy : public DmlOperator
 
     void Compute(const MLOperatorKernelContext& kernelContext) final
     {
-        MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0);
-        MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0);
+        // If the input is aliasing the output, we don't need to do anything here
+        if (m_aliasing)
+        {
+            return;
+        }
 
-        // Avoid self copying.
-        if (inputTensor.GetByteData() != outputTensor.GetByteData())
+        // If the input and the output share the same buffer, we need to do an identity operation
+        if (m_inputSharesOutputBuffer)
         {
             DmlOperator::Compute(kernelContext);
+            return;
         }
+
+        // If the input and the output don't share the same buffer, we can do a standard copy operation instead
+        MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0);
+        MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0);
+
+        ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor(
+            outputTensor.GetInterface().Get(),
+            inputTensor.GetInterface().Get()));
     }
+
+private:
+    // Aliasing means that both the input and the output start at the same exact offset in the same buffer
+    bool m_aliasing = false;
+
+    // The choice of using Identity or a copy depends on whether the input and the output are located in the same buffer
+    bool m_inputSharesOutputBuffer = false;
 };
 
 DML_OP_DEFINE_CREATION_FUNCTION(Copy, DmlOperatorCopy);
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
index d86fdff8ac7e1..5640c9f30283b 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
@@ -118,6 +118,13 @@ IMLOperatorKernelCreationContextNodeWrapperPrivate : public IMLOperatorKernelCre
         _In_ const onnxruntime::TensorShape& outputShape,
         _Out_ bool* aliasing
     ) noexcept PURE;
+
+    STDMETHOD(InputSharesOutputBuffer)(
+        _In_ uint32_t inputIndex,
+        _In_ uint32_t outputIndex,
+        _In_ const onnxruntime::TensorShape& outputShape,
+        _Out_ bool* sharesBuffer
+    ) noexcept PURE;
 };
 
 //! \interface IMLOperatorAttributes1

From 26a94e17f96a2ed5629426a8f03a6a8ac3a275d4 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 3 Aug 2023 10:41:36 -0700
Subject: [PATCH 64/76] Fix output binding crash

---
 onnxruntime/core/framework/utils.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc
index 3d67e49e155d9..9ad0bbdac2305 100644
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@@ -162,8 +162,12 @@ static Status BatchOrCopyMLValue(const SessionState& session_state,
   }
 
 #ifdef USE_DML
+  const bool bothValuesOnGPU = copy_info.source_device.Type() == OrtDevice::GPU && copy_info.target_device.Type() == OrtDevice::GPU;
+  const bool targetIsInternalAlloc = copy_info.target_device.MemType() == OrtDevice::MemType::DEFAULT;
+  const bool bothValuesOnSameDevice = copy_info.source_device.Id() == copy_info.target_device.Id();
+
   // The DML EP supports binding external allocations directly, even if the memory types don't match, as long as they are on the same D3D12 device
-  if (copy_info.source_device.Type() == OrtDevice::GPU && copy_info.target_device.Type() == OrtDevice::GPU && copy_info.source_device.Id() == copy_info.target_device.Id()) {
+  if (bothValuesOnGPU && targetIsInternalAlloc && bothValuesOnSameDevice) {
     target_mlvalue = source_mlvalue;
     return Status::OK();
   }

From 31270e6fd594a03016aa184b4437848e38d743fe Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 3 Aug 2023 20:48:08 -0700
Subject: [PATCH 65/76] Fix test failures

---
 .../src/MLOperatorAuthorImpl.cpp              | 61 -------------------
 .../src/MLOperatorAuthorImpl.h                | 12 ----
 .../src/Operators/DmlOperatorCopy.cpp         | 59 ++++++++----------
 .../MLOperatorAuthorPrivate.h                 | 14 -----
 4 files changed, 24 insertions(+), 122 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 0f29409df1e2b..4b749acf4ae33 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1335,67 +1335,6 @@ namespace Windows::AI::MachineLearning::Adapter
         return m_allowOutputShapeQuery;
     }
 
-    HRESULT STDMETHODCALLTYPE OpKernelInfoWrapper::InputAliasesOutput(
-        uint32_t inputIndex,
-        uint32_t outputIndex,
-        const onnxruntime::TensorShape& outputShape,
-        bool* aliasing) noexcept
-    {
-        ORT_TRY
-        {
-            if (!m_kernelContext)
-            {
-                *aliasing = false;
-                return S_OK;
-            }
-
-            auto inputData = m_kernelContext->Input<onnxruntime::Tensor>(inputIndex)->DataRaw();
-            auto outputData = m_kernelContext->Output(outputIndex, outputShape)->DataRaw();
-            *aliasing = inputData == outputData;
-            return S_OK;
-        }
-        ORT_CATCH_RETURN
-    }
-
-    HRESULT STDMETHODCALLTYPE OpKernelInfoWrapper::InputSharesOutputBuffer(
-        uint32_t inputIndex,
-        uint32_t outputIndex,
-        const onnxruntime::TensorShape& outputShape,
-        bool* sharesOutputBuffer) noexcept
-    {
-        ORT_TRY
-        {
-            if (!m_kernelContext)
-            {
-                *sharesOutputBuffer = false;
-                return S_OK;
-            }
-
-            auto inputTensor = const_cast<onnxruntime::Tensor*>(m_kernelContext->Input<onnxruntime::Tensor>(inputIndex));
-            auto outputTensor = m_kernelContext->Output(outputIndex, outputShape);
-
-            // Null input or output data means that the tensors are empty (i.e. one of the dimensions is 0)
-            if (inputTensor->DataRaw() == nullptr || outputTensor->DataRaw() == nullptr)
-            {
-                *sharesOutputBuffer = false;
-                return S_OK;
-            }
-
-            auto inputWrapper = wil::MakeOrThrow<TensorWrapper>(inputTensor, true, m_winmlProvider.Get(), true);
-            auto outputWrapper = wil::MakeOrThrow<TensorWrapper>(outputTensor, true, m_winmlProvider.Get(), true);
-
-            ComPtr<IUnknown> inputResource;
-            inputWrapper->GetDataInterface(inputResource.GetAddressOf());
-
-            ComPtr<IUnknown> outputResource;
-            outputWrapper->GetDataInterface(outputResource.GetAddressOf());
-
-            *sharesOutputBuffer = inputResource.Get() == outputResource.Get();
-            return S_OK;
-        }
-        ORT_CATCH_RETURN
-    }
-
     DmlGraphOpKernelInfoWrapper::DmlGraphOpKernelInfoWrapper(
         const onnxruntime::OpNodeProtoHelper<onnxruntime::ProtoHelperNodeContext>* protoHelper,
         const void* executionHandle,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index e12a4435bd747..4f982c80c4c5c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -405,18 +405,6 @@ class OpKernelInfoWrapper : public OpNodeInfoWrapper<
         return m_winmlProvider.CopyTo(executionProvider);
     }
 
-    HRESULT STDMETHODCALLTYPE InputAliasesOutput(
-        uint32_t inputIndex,
-        uint32_t outputIndex,
-        const onnxruntime::TensorShape& outputShape,
-        bool* aliasing) noexcept override;
-
-    HRESULT STDMETHODCALLTYPE InputSharesOutputBuffer(
-        uint32_t inputIndex,
-        uint32_t outputIndex,
-        const onnxruntime::TensorShape& outputShape,
-        bool* sharesOutputBuffer) noexcept override;
-
 private:
     // For shape info, in addition to the info
     const EdgeShapes* m_inferredOutputShapes = nullptr;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
index c55cf60abb873..8fa3c74674776 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
@@ -29,54 +29,43 @@ class DmlOperatorCopy : public DmlOperator
         ComPtr<IMLOperatorKernelCreationContextPrivate> contextPrivate;
         ORT_THROW_IF_FAILED(kernelInfo.GetInterface()->QueryInterface(contextPrivate.GetAddressOf()));
 
-        // We don't need to compile any operator if the input aliases the output as it is essentially a no-op
-        // (e.g. squeeze/unsqueeze/reshape). An exception to this rule is when the operator is part of the graph,
-        // in which case we always need to compile and execute the operator (although this is something that we
-        // could optimize in the future).
-        if (!contextPrivate->IsDmlGraphNode())
-        {
-            std::vector<uint32_t> outputSizes = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0);
-            std::vector<int64_t> outputSizesInt64(outputSizes.begin(), outputSizes.end());
-            onnxruntime::TensorShape outputShape(outputSizesInt64);
-            ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputAliasesOutput(0, 0, outputShape, &m_aliasing));
-            ORT_THROW_IF_FAILED(kernelInfo.GetNodeWrapperInterface()->InputSharesOutputBuffer(0, 0, outputShape, &m_inputSharesOutputBuffer));
-        }
-
-        if (contextPrivate->IsDmlGraphNode() || (!m_aliasing && m_inputSharesOutputBuffer))
-        {
-            std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
-            std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
+        // Although we always compile the operator because we don't know where the memory will be allocated in the future,
+        // we may not always end up executing it.
+        std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
+        std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
 
-            DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {};
-            opDesc.InputTensor = inputDescs.data();
-            opDesc.OutputTensor = outputDescs.data();
+        DML_ELEMENT_WISE_IDENTITY_OPERATOR_DESC opDesc = {};
+        opDesc.InputTensor = inputDescs.data();
+        opDesc.OutputTensor = outputDescs.data();
 
-            SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo);
-        }
+        SetDmlOperatorDesc({ DML_OPERATOR_ELEMENT_WISE_IDENTITY, &opDesc }, kernelInfo);
     }
 
     void Compute(const MLOperatorKernelContext& kernelContext) final
     {
-        // If the input is aliasing the output, we don't need to do anything here
-        if (m_aliasing)
+        MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0);
+        MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0);
+
+        // If the input is aliasing the output (i.e. they share the same resource at the same offset),
+        // we don't need to do anything. This is essentially a no-op.
+        if (inputTensor.GetByteData() == outputTensor.GetByteData())
         {
             return;
         }
 
-        // If the input and the output share the same buffer, we need to do an identity operation
-        if (m_inputSharesOutputBuffer)
+        // If the input is not aliasing the output but shares the same resource, we have to use an Identity operation
+        // because the resource cannot simultaneously be in both the COPY_SOURCE and COPY_DEST states.
+        if (inputTensor.GetDataInterface().Get() == outputTensor.GetDataInterface().Get())
         {
             DmlOperator::Compute(kernelContext);
-            return;
         }
-
-        // If the input and the output don't share the same buffer, we can do a standard copy operation instead
-        MLOperatorTensor inputTensor = kernelContext.GetInputTensor(0);
-        MLOperatorTensor outputTensor = kernelContext.GetOutputTensor(0);
-
-        ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor(
-            outputTensor.GetInterface().Get(),
-            inputTensor.GetInterface().Get()));
+        else
+        {
+            // The input and the output don't share the same resource, so we can do a simple copy.
+            ORT_THROW_IF_FAILED(m_executionProvider->CopyTensor(
+                outputTensor.GetInterface().Get(),
+                inputTensor.GetInterface().Get()));
+        }
     }
 
 private:
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
index 5640c9f30283b..9b4536b6218b2 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
@@ -111,20 +111,6 @@ IMLOperatorKernelCreationContextNodeWrapperPrivate : public IMLOperatorKernelCre
     STDMETHOD(GetExecutionProvider)(
         _Outptr_result_maybenull_ IUnknown** executionProvider
         ) const noexcept PURE;
-
-    STDMETHOD(InputAliasesOutput)(
-        _In_ uint32_t inputIndex,
-        _In_ uint32_t outputIndex,
-        _In_ const onnxruntime::TensorShape& outputShape,
-        _Out_ bool* aliasing
-    ) noexcept PURE;
-
-    STDMETHOD(InputSharesOutputBuffer)(
-        _In_ uint32_t inputIndex,
-        _In_ uint32_t outputIndex,
-        _In_ const onnxruntime::TensorShape& outputShape,
-        _Out_ bool* sharesBuffer
-    ) noexcept PURE;
 };
 
 //! \interface IMLOperatorAttributes1

From 738efb7dc3b5804ef43ec65faa832585cc5b4234 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Fri, 4 Aug 2023 19:07:58 -0700
Subject: [PATCH 66/76] Fix upload heap regression

---
 .../src/PooledUploadHeap.cpp                  | 34 +++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp
index 442b3e7ddf746..4a222d183bcfd 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.cpp
@@ -118,8 +118,23 @@ namespace Dml
 
     std::pair<PooledUploadHeap::Chunk*, size_t> PooledUploadHeap::Reserve(size_t sizeInBytes)
     {
+        // Try to find a chunk with enough free space to accommodate the requested allocation size
+        for (Chunk& chunk : m_chunks)
+        {
+            std::optional<size_t> offsetForAllocation = FindOffsetForAllocation(chunk, sizeInBytes);
+            if (offsetForAllocation)
+            {
+                // There's enough space in this chunk - return
+                return std::make_pair(&chunk, *offsetForAllocation);
+            }
+        }
+
+        // No chunks were able to accommodate the allocation - create a new chunk and return that instead
+
         // At least double the capacity of the pool
-        m_chunks.push_back(CreateChunk(m_device.Get(), sizeInBytes));
+        const size_t newChunkSize = std::max({ m_totalCapacity, c_minChunkSize, sizeInBytes });
+        m_chunks.push_back(CreateChunk(m_device.Get(), newChunkSize));
+        m_totalCapacity += newChunkSize;
 
         // Allocate from the beginning of the new chunk
         return std::make_pair(&m_chunks.back(), 0);
@@ -197,6 +212,13 @@ namespace Dml
             return c.allocations.empty();
         });
         m_chunks.erase(it, m_chunks.end());
+
+        // Re-calculate total capacity
+        m_totalCapacity = 0;
+        for (const auto& chunk : m_chunks)
+        {
+            m_totalCapacity += chunk.capacityInBytes;
+        }
     }
 
     void PooledUploadHeap::AssertInvariants()
@@ -208,7 +230,7 @@ namespace Dml
         };
 
         // Chunks should be sorted by ascending capacity
-        // assert(std::is_sorted(m_chunks.begin(), m_chunks.end(), chunkCapacityComparer));
+        assert(std::is_sorted(m_chunks.begin(), m_chunks.end(), chunkCapacityComparer));
 
         // Allocations in a chunk should be sorted by ascending fence value
         for (const auto& chunk : m_chunks)
@@ -254,6 +276,14 @@ namespace Dml
             }
         }
 
+        // Validate total capacity of pool
+        size_t calculatedCapacity = 0;
+        for (const auto& chunk : m_chunks)
+        {
+            calculatedCapacity += chunk.capacityInBytes;
+        }
+        assert(calculatedCapacity == m_totalCapacity);
+
     #endif // #ifdef _DEBUG
     }
 } // namespace Dml

From f64ed2b5082ff21dfe7246f9055dded0c06a6df0 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Sun, 6 Aug 2023 14:03:09 -0700
Subject: [PATCH 67/76] Address PR comments

---
 onnxruntime/core/framework/utils.cc           |   3 +-
 .../inc/DmlExecutionProvider.h                |   2 +-
 .../src/BucketizedBufferAllocator.cpp         |  14 +-
 .../src/BucketizedBufferAllocator.h           |   4 +-
 .../src/DmlBfcAllocator.h                     |   2 +-
 .../DmlExecutionProvider/src/DmlBuffer.cpp    |  26 +-
 .../dml/DmlExecutionProvider/src/DmlBuffer.h  |  11 +-
 .../src/DmlBufferRegion.cpp                   |  30 +--
 .../src/DmlBufferRegion.h                     |  19 +-
 .../src/DmlExternalGpuAllocator.cpp           |   4 +-
 .../src/DmlExternalGpuAllocator.h             |   2 +-
 .../src/DmlGpuAllocator.cpp                   |  12 +-
 .../src/DmlGpuAllocator.h                     |   4 +-
 .../src/DmlHeapAllocation.h                   |   2 +-
 .../src/DmlReservedResourceSubAllocator.cpp   | 226 ++++++++----------
 .../src/DmlReservedResourceSubAllocator.h     |  44 ++--
 .../src/DmlReservedResourceWrapper.h          |   2 +-
 .../src/DmlTaggedPointer.cpp                  |  34 ++-
 .../src/DmlTaggedPointer.h                    |  25 +-
 19 files changed, 220 insertions(+), 246 deletions(-)

diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc
index 9ad0bbdac2305..7e11cba608f4d 100644
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@@ -163,11 +163,12 @@ static Status BatchOrCopyMLValue(const SessionState& session_state,
 
 #ifdef USE_DML
   const bool bothValuesOnGPU = copy_info.source_device.Type() == OrtDevice::GPU && copy_info.target_device.Type() == OrtDevice::GPU;
+  const bool sourceIsDmlAlloc = copy_info.source_device.MemType() == OrtDevice::MemType::DEFAULT || copy_info.source_device.MemType() == OrtDevice::MemType::DML_EXTERNAL;
   const bool targetIsInternalAlloc = copy_info.target_device.MemType() == OrtDevice::MemType::DEFAULT;
   const bool bothValuesOnSameDevice = copy_info.source_device.Id() == copy_info.target_device.Id();
 
   // The DML EP supports binding external allocations directly, even if the memory types don't match, as long as they are on the same D3D12 device
-  if (bothValuesOnGPU && targetIsInternalAlloc && bothValuesOnSameDevice) {
+  if (bothValuesOnGPU && sourceIsDmlAlloc && targetIsInternalAlloc && bothValuesOnSameDevice) {
     target_mlvalue = source_mlvalue;
     return Status::OK();
   }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
index 755bf60195e2e..9ecfec4139756 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
@@ -32,7 +32,7 @@ namespace Dml
         bool enableMetacommands,
         bool enableBfcAllocator);
 
-    D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t size_in_bytes);
+    D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t sizeInBytes);
     void FlushContext(onnxruntime::IExecutionProvider* provider);
     void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode);
     void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 675f17e4c28af..f8851c1b87a4f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -58,16 +58,16 @@ namespace Dml
         gsl::index index = static_cast<gsl::index>(ceil(log2(size)));
         assert((1ull << index) >= size); // This must be true unless there were some strange rounding issues
 
-        // The smallest bucket is 2^n bytes large, where n = c_minResourceSizeExponent
-        index = std::max<gsl::index>(index, c_minResourceSizeExponent);
-        index -= c_minResourceSizeExponent;
+        // The smallest bucket is 2^n bytes large, where n = MinResourceSizeExponent
+        index = std::max<gsl::index>(index, MinResourceSizeExponent);
+        index -= MinResourceSizeExponent;
 
         return index;
     }
 
     /*static*/ uint64_t BucketizedBufferAllocator::GetBucketSizeFromIndex(gsl::index index)
     {
-        return (1ull << (index + c_minResourceSizeExponent));
+        return (1ull << (index + MinResourceSizeExponent));
     }
 
     ComPtr<DmlResourceWrapper> BucketizedBufferAllocator::AllocCommittedResource(size_t size)
@@ -93,15 +93,15 @@ namespace Dml
         return static_cast<AllocationInfo*>(opaquePointer);
     }
 
-    D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) const
+    D3D12BufferRegion BucketizedBufferAllocator::CreateBufferRegion(void* opaquePointer, uint64_t sizeInBytes) const
     {
         auto allocationInfo = static_cast<AllocationInfo*>(opaquePointer);
 
         // Make sure that we are aligned to 4 bytes to satisfy DML's requirements
         constexpr uint64_t DML_ALIGNMENT = 4;
-        size_in_bytes = (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT;
+        sizeInBytes = (1 + (sizeInBytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT;
 
-        return D3D12BufferRegion(0, size_in_bytes, allocationInfo->GetD3D12Resource());
+        return D3D12BufferRegion(0, sizeInBytes, allocationInfo->GetD3D12Resource());
     }
 
     void* BucketizedBufferAllocator::Alloc(size_t size)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index 899c8dd44182d..d0b905c45c3c7 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -37,7 +37,7 @@ namespace Dml
         void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);
 
         AllocationInfo* GetAllocationInfo(void* opaquePointer);
-        D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes) const;
+        D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t sizeInBytes) const;
         uint64_t GetUniqueId(void* opaquePointer);
 
     public: // onnxruntime::IAllocator
@@ -45,7 +45,7 @@ namespace Dml
         void Free(void* p) final;
 
     private:
-        static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB
+        static const uint32_t MinResourceSizeExponent = 16; // 2^16 = 64KB
 
         // The pool consists of a number of buckets, and each bucket contains a number of resources of the same size.
         // The resources in each bucket are always sized as a power of two, and each bucket contains resources twice
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
index 43e093538fcb6..d8631c1e9c1d0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
@@ -21,7 +21,7 @@ namespace Dml
         ),
         m_subAllocator(std::move(subAllocator)) {}
 
-        void* Alloc(size_t size_in_bytes) final { return m_subAllocator->Alloc(size_in_bytes); }
+        void* Alloc(size_t sizeInBytes) final { return m_subAllocator->Alloc(sizeInBytes); }
         void Free(void* ptr) final { m_subAllocator->Free(ptr); }
     private:
         std::shared_ptr<DmlReservedResourceSubAllocator> m_subAllocator;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
index 21b5da96ce236..298227b54d947 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.cpp
@@ -8,58 +8,58 @@
 namespace Dml
 {
 
-/*explicit*/ DmlBuffer::DmlBuffer(DmlGpuAllocator* allocator, uint64_t size_in_bytes)
-    : allocator_(allocator)
+/*explicit*/ DmlBuffer::DmlBuffer(DmlGpuAllocator* allocator, uint64_t sizeInBytes)
+    : m_allocator(allocator)
 {
-    m_opaqueData = allocator_->Alloc(size_in_bytes);
+    m_opaqueData = m_allocator->Alloc(sizeInBytes);
     ORT_THROW_HR_IF(E_OUTOFMEMORY, m_opaqueData == nullptr);
 
-    buffer_region_ = allocator_->CreateBufferRegion(m_opaqueData, size_in_bytes);
+    m_bufferRegion = m_allocator->CreateBufferRegion(m_opaqueData, sizeInBytes);
 }
 
 DmlBuffer::~DmlBuffer()
 {
     if (m_opaqueData != nullptr)
     {
-        allocator_->Free(m_opaqueData);
+        m_allocator->Free(m_opaqueData);
     }
 }
 
 DmlBuffer::DmlBuffer(DmlBuffer&& other) noexcept
 {
     m_opaqueData = other.m_opaqueData;
-    allocator_ = other.allocator_;
-    buffer_region_ = std::move(other.buffer_region_);
+    m_allocator = other.m_allocator;
+    m_bufferRegion = std::move(other.m_bufferRegion);
     other.m_opaqueData = nullptr;
 }
 
 DmlBuffer& DmlBuffer::operator=(DmlBuffer&& other) noexcept
 {
     m_opaqueData = other.m_opaqueData;
-    allocator_ = other.allocator_;
-    buffer_region_ = std::move(other.buffer_region_);
+    m_allocator = other.m_allocator;
+    m_bufferRegion = std::move(other.m_bufferRegion);
     other.m_opaqueData = nullptr;
     return *this;
 }
 
 ID3D12Resource* DmlBuffer::GetD3D12Resource() const
 {
-    return buffer_region_.GetD3D12Resource();
+    return m_bufferRegion.GetD3D12Resource();
 }
 
 uint64_t DmlBuffer::Offset() const
 {
-    return buffer_region_ ? buffer_region_.Offset() : 0;
+    return m_bufferRegion ? m_bufferRegion.Offset() : 0;
 }
 
 uint64_t DmlBuffer::SizeInBytes() const
 {
-    return buffer_region_ ? buffer_region_.SizeInBytes() : 0;
+    return m_bufferRegion ? m_bufferRegion.SizeInBytes() : 0;
 }
 
 DML_BUFFER_BINDING DmlBuffer::GetBufferBinding() const
 {
-    return buffer_region_ ? buffer_region_.GetBufferBinding()
+    return m_bufferRegion ? m_bufferRegion.GetBufferBinding()
                           : DML_BUFFER_BINDING{};
 }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
index 019d186441da5..e7b570d365a62 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBuffer.h
@@ -18,7 +18,7 @@ class OpKernelContext;
 class DmlBuffer
 {
   public:
-    explicit DmlBuffer(DmlGpuAllocator* allocator, uint64_t size_in_bytes);
+    explicit DmlBuffer(DmlGpuAllocator* allocator, uint64_t sizeInBytes);
     ~DmlBuffer();
 
     // Move-only
@@ -30,15 +30,14 @@ class DmlBuffer
     ID3D12Resource* GetD3D12Resource() const;
     uint64_t Offset() const;
     uint64_t SizeInBytes() const;
-    const D3D12BufferRegion& Region() const { return buffer_region_; }
-
+    const D3D12BufferRegion& Region() const { return m_bufferRegion; }
     DML_BUFFER_BINDING GetBufferBinding() const;
 
-    explicit operator bool() const { return !!buffer_region_; }
+    explicit operator bool() const { return !!m_bufferRegion; }
 
   private:
-    DmlGpuAllocator* allocator_;
-    D3D12BufferRegion buffer_region_;
+    DmlGpuAllocator* m_allocator;
+    D3D12BufferRegion m_bufferRegion;
     void* m_opaqueData;
 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
index 57c4d5b342bb8..627e383a17195 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.cpp
@@ -7,38 +7,38 @@
 namespace Dml
 {
 
-    D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t size_in_bytes, ID3D12Resource* resource)
+    D3D12BufferRegion::D3D12BufferRegion(uint64_t offset, uint64_t sizeInBytes, ID3D12Resource* resource)
         : m_resource(resource),
-        offset_(offset),
-        size_in_bytes_(size_in_bytes)
+        m_offset(offset),
+        m_sizeInBytes(sizeInBytes)
     {
         ORT_THROW_HR_IF(E_INVALIDARG, m_resource == nullptr);
 
         // Regions cannot be empty.
-        ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ == 0);
+        ORT_THROW_HR_IF(E_INVALIDARG, m_sizeInBytes == 0);
 
         // Regions cannot extend beyond the size of the resource.
-        uint64_t buffer_size = m_resource->GetDesc().Width;
-        ORT_THROW_HR_IF(E_INVALIDARG, offset_ >= buffer_size);
-        ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes_ > buffer_size - offset);
+        uint64_t bufferSize = m_resource->GetDesc().Width;
+        ORT_THROW_HR_IF(E_INVALIDARG, m_offset >= bufferSize);
+        ORT_THROW_HR_IF(E_INVALIDARG, m_sizeInBytes > bufferSize - offset);
 
         // All three resources, if provided, must be identical aside from state.
         assert(m_resource->GetDesc().Dimension == D3D12_RESOURCE_DIMENSION_BUFFER);
-        assert(m_resource->GetDesc().Width == buffer_size);
+        assert(m_resource->GetDesc().Width == bufferSize);
     }
 
     D3D12BufferRegion::D3D12BufferRegion(D3D12BufferRegion&& that) noexcept
     {
         std::swap(this->m_resource, that.m_resource);
-        std::swap(this->offset_, that.offset_);
-        std::swap(this->size_in_bytes_, that.size_in_bytes_);
+        std::swap(this->m_offset, that.m_offset);
+        std::swap(this->m_sizeInBytes, that.m_sizeInBytes);
     }
 
     D3D12BufferRegion& D3D12BufferRegion::operator=(D3D12BufferRegion&& that) noexcept
     {
         std::swap(this->m_resource, that.m_resource);
-        std::swap(this->offset_, that.offset_);
-        std::swap(this->size_in_bytes_, that.size_in_bytes_);
+        std::swap(this->m_offset, that.m_offset);
+        std::swap(this->m_sizeInBytes, that.m_sizeInBytes);
         return *this;
     }
 
@@ -49,12 +49,12 @@ namespace Dml
 
     uint64_t D3D12BufferRegion::Offset() const
     {
-        return m_resource ? offset_ : 0;
+        return m_resource ? m_offset : 0;
     }
 
     uint64_t D3D12BufferRegion::SizeInBytes() const
     {
-        return m_resource ? size_in_bytes_ : 0;
+        return m_resource ? m_sizeInBytes : 0;
     }
 
     DML_BUFFER_BINDING D3D12BufferRegion::GetBufferBinding() const
@@ -64,7 +64,7 @@ namespace Dml
             return DML_BUFFER_BINDING{};
         }
 
-        return DML_BUFFER_BINDING{m_resource, offset_, size_in_bytes_};
+        return DML_BUFFER_BINDING{m_resource, m_offset, m_sizeInBytes};
     }
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
index 40c41f980b011..d14ff1b51b3f9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBufferRegion.h
@@ -16,10 +16,7 @@ namespace Dml
         // References a region of a buffer. The respective ID3D12Resource objects
         // must be in the appropriate states. Each resource is optional, but if more
         // than one are provided they must map to the same region of memory.
-        D3D12BufferRegion(
-            uint64_t offset,
-            uint64_t size_in_bytes,
-            ID3D12Resource* resource);
+        D3D12BufferRegion(uint64_t offset, uint64_t sizeInBytes, ID3D12Resource* resource);
 
         // Move-only
         D3D12BufferRegion(const D3D12BufferRegion&) = default;
@@ -37,21 +34,21 @@ namespace Dml
 
         // Creates a subregion at an offset from the start of this region. If no
         // size is provided the region runs to the end of the current region.
-        inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t size_in_bytes = 0) const
+        inline D3D12BufferRegion Subregion(uint64_t offset, uint64_t sizeInBytes = 0) const // offset is relative to this region; throws E_INVALIDARG if the request does not fit
         {
             // start of subregion must be within current region
-            ORT_THROW_HR_IF(E_INVALIDARG, offset >= size_in_bytes_);
-            size_in_bytes = size_in_bytes == 0 ? size_in_bytes_ - offset : size_in_bytes;
+            ORT_THROW_HR_IF(E_INVALIDARG, offset >= m_sizeInBytes); // also rejects every call made on a zero-sized region
+            sizeInBytes = sizeInBytes == 0 ? m_sizeInBytes - offset : sizeInBytes; // 0 is the "rest of this region" sentinel
             // end of subregion must be within current region
-            ORT_THROW_HR_IF(E_INVALIDARG, size_in_bytes > size_in_bytes_ - offset);
+            ORT_THROW_HR_IF(E_INVALIDARG, sizeInBytes > m_sizeInBytes - offset); // subtraction cannot wrap: offset < m_sizeInBytes was checked above
 
-            return D3D12BufferRegion(offset_ + offset, size_in_bytes, m_resource);
+            return D3D12BufferRegion(m_offset + offset, sizeInBytes, m_resource); // new offset is expressed in the same base as this region's own m_offset
         }
 
     private:
         ID3D12Resource* m_resource = nullptr;
-        uint64_t offset_ = 0;
-        uint64_t size_in_bytes_ = 0;
+        uint64_t m_offset = 0;
+        uint64_t m_sizeInBytes = 0;
     };
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp
index 0cb8e36581672..3882823629854 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.cpp
@@ -36,10 +36,10 @@ namespace Dml
         m_device = onnxruntime::DMLProviderFactoryCreator::CreateD3D12Device(device_id, false);
     }
 
-    void* DmlExternalGpuAllocator::Alloc(size_t size_in_bytes)
+    void* DmlExternalGpuAllocator::Alloc(size_t sizeInBytes)
     {
         Microsoft::WRL::ComPtr<ID3D12Resource> resource;
-        auto buffer = CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
+        auto buffer = CD3DX12_RESOURCE_DESC::Buffer(sizeInBytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
         auto props = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT);
         ORT_THROW_IF_FAILED(m_device->CreateCommittedResource(
             &props,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
index 7ac1cc9510b10..3d61bee211949 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalGpuAllocator.h
@@ -17,7 +17,7 @@ namespace Dml
         DmlExternalGpuAllocator(ID3D12Device* device);
         DmlExternalGpuAllocator(int device_id);
 
-        void* Alloc(size_t size_in_bytes) final;
+        void* Alloc(size_t sizeInBytes) final;
         void Free(void* ptr) final;
 
     private:
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
index 881478c3e874f..b0ddbd2b155ff 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.cpp
@@ -31,14 +31,14 @@ namespace Dml
     m_bfcSubAllocator(bfcSubAllocator),
     m_activeAllocator(activeAllocator) {}
 
-    void* DmlGpuAllocator::Alloc(size_t size_in_bytes)
+    void* DmlGpuAllocator::Alloc(size_t sizeInBytes)
     {
         switch(m_activeAllocator)
         {
         case ActiveAllocator::BfcAllocator:
-            return m_bfcAllocator->Alloc(size_in_bytes);
+            return m_bfcAllocator->Alloc(sizeInBytes);
         case ActiveAllocator::BucketizedBufferAllocator:
-            return m_bucketizedBufferAllocator->Alloc(size_in_bytes);
+            return m_bucketizedBufferAllocator->Alloc(sizeInBytes);
         default:
             ORT_THROW_HR(E_UNEXPECTED);
         }
@@ -57,14 +57,14 @@ namespace Dml
         }
     }
 
-    D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes)
+    D3D12BufferRegion DmlGpuAllocator::CreateBufferRegion(void* opaquePointer, uint64_t sizeInBytes)
     {
         switch(m_activeAllocator)
         {
         case ActiveAllocator::BfcAllocator:
-            return m_bfcSubAllocator->CreateBufferRegion(opaquePointer, size_in_bytes);
+            return m_bfcSubAllocator->CreateBufferRegion(opaquePointer, sizeInBytes);
         case ActiveAllocator::BucketizedBufferAllocator:
-            return m_bucketizedBufferAllocator->CreateBufferRegion(opaquePointer, size_in_bytes);
+            return m_bucketizedBufferAllocator->CreateBufferRegion(opaquePointer, sizeInBytes);
         default:
             ORT_THROW_HR(E_UNEXPECTED);
         }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
index e8b020a85767b..dda5f1984da69 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGpuAllocator.h
@@ -29,9 +29,9 @@ namespace Dml
             std::shared_ptr<DmlReservedResourceSubAllocator> bfcSubAllocator,
             ActiveAllocator activeAllocator);
 
-        void* Alloc(size_t size_in_bytes) final;
+        void* Alloc(size_t sizeInBytes) final;
         void Free(void* ptr) final;
-        D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes);
+        D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t sizeInBytes);
         AllocationInfo* GetAllocationInfo(void* opaquePointer);
         void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);
         DmlBuffer AllocateDefaultBuffer(uint64_t num_bytes);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h
index ab75b7d322120..5ecf135a9ee43 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlHeapAllocation.h
@@ -13,6 +13,6 @@ namespace Dml
         // an allocation may comprise multiple heaps. If tiling is not supported
         // an allocation will only have a single heap.
         std::vector<Microsoft::WRL::ComPtr<ID3D12Heap>> heaps;
-        Microsoft::WRL::ComPtr<ID3D12Resource> resource_uav_state;
+        Microsoft::WRL::ComPtr<ID3D12Resource> resourceUavState;
     };
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
index 4fdd6411d555a..cb58c30283e95 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.cpp
@@ -33,16 +33,16 @@ namespace Dml
         gsl::index index = static_cast<gsl::index>(ceil(log2(size)));
         assert((1ull << index) >= size); // This must be true unless there were some strange rounding issues
 
-        // The smallest bucket is 2^n bytes large, where n = c_minResourceSizeExponent
-        index = std::max<gsl::index>(index, c_minResourceSizeExponent);
-        index -= c_minResourceSizeExponent;
+        // The smallest bucket is 2^n bytes large, where n = MinResourceSizeExponent
+        index = std::max<gsl::index>(index, MinResourceSizeExponent);
+        index -= MinResourceSizeExponent;
 
         return index;
     }
 
     /*static*/ uint64_t DmlReservedResourceSubAllocator::GetBucketSizeFromIndex(gsl::index index)
     {
-        return (1ull << (index + c_minResourceSizeExponent));
+        return (1ull << (index + MinResourceSizeExponent)); // bucket `index` corresponds to 2^(index + MinResourceSizeExponent) bytes
     }
 
     void DmlReservedResourceSubAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
@@ -53,10 +53,7 @@ namespace Dml
     static bool GetTilingEnabled(ID3D12Device* device)
     {
         D3D12_FEATURE_DATA_D3D12_OPTIONS options = {};
-        if (SUCCEEDED(device->CheckFeatureSupport(
-                D3D12_FEATURE_D3D12_OPTIONS,
-                &options,
-                sizeof(options))))
+        if (SUCCEEDED(device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS, &options, sizeof(options))))
         {
             return options.TiledResourcesTier >= D3D12_TILED_RESOURCES_TIER_1;
         }
@@ -66,91 +63,86 @@ namespace Dml
 
     static uint64_t GetMaxHeapSizeInTiles()
     {
-        return DmlReservedResourceSubAllocator::kDefaultMaxHeapSizeInTiles;
+        return DmlReservedResourceSubAllocator::DefaultMaxHeapSizeInTiles; // always the compile-time default; no runtime override is consulted here
     }
 
     DmlReservedResourceSubAllocator::DmlReservedResourceSubAllocator(
         ID3D12Device* device,
         std::shared_ptr<ExecutionContext> context,
         ID3D12CommandQueue* queue,
-        const D3D12_HEAP_PROPERTIES& heap_props,
-        D3D12_HEAP_FLAGS heap_flags,
-        D3D12_RESOURCE_FLAGS resource_flags,
-        D3D12_RESOURCE_STATES initial_state)
+        const D3D12_HEAP_PROPERTIES& heapProps, // copied; used for every heap this allocator creates
+        D3D12_HEAP_FLAGS heapFlags, // passed to every CD3DX12_HEAP_DESC built by this allocator
+        D3D12_RESOURCE_FLAGS resourceFlags, // passed to CD3DX12_RESOURCE_DESC::Buffer for each backing resource
+        D3D12_RESOURCE_STATES initialState) // initial state given to CreateReservedResource / CreatePlacedResource
         : m_device(device),
         m_context(context),
-        queue_(queue),
-        heap_properties_(heap_props),
-        heap_flags_(heap_flags),
-        resource_flags_(resource_flags),
-        initial_state_(initial_state),
-        tiling_enabled_(GetTilingEnabled(device)),
-        max_heap_size_in_tiles_(GetMaxHeapSizeInTiles())
+        m_queue(queue), // used to call UpdateTileMappings for tiled allocations
+        m_heapProperties(heapProps),
+        m_heapFlags(heapFlags),
+        m_resourceFlags(resourceFlags),
+        m_initialState(initialState),
+        m_tilingEnabled(GetTilingEnabled(device)), // decided once here; selects the tiled or untiled path in Alloc
+        m_maxHeapSizeInTiles(GetMaxHeapSizeInTiles()) // upper bound on tiles per heap when an allocation is tiled
     {
     }
 
-    absl::optional<DmlHeapAllocation> DmlReservedResourceSubAllocator::TryCreateTiledAllocation(uint64_t size_in_bytes)
+    absl::optional<DmlHeapAllocation> DmlReservedResourceSubAllocator::TryCreateTiledAllocation(uint64_t sizeInBytes)
     {
         DmlHeapAllocation allocation = {};
 
         // The allocation may be larger than the requested size to ensure a whole
         // number of tiles.
-        const uint64_t resource_size_in_tiles = 1 + (size_in_bytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-        const uint64_t resource_size_in_bytes = resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-        auto resource_desc = CD3DX12_RESOURCE_DESC::Buffer(resource_size_in_bytes, resource_flags_);
+        const uint64_t resourceSizeInTiles = 1 + (sizeInBytes - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+        const uint64_t resourceSizeInBytes = resourceSizeInTiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+        auto resourceDesc = CD3DX12_RESOURCE_DESC::Buffer(resourceSizeInBytes, m_resourceFlags);
 
-        HRESULT create_resource_hr = m_device->CreateReservedResource(
-            &resource_desc,
-            initial_state_,
+        HRESULT createResourceHr = m_device->CreateReservedResource(
+            &resourceDesc,
+            m_initialState,
             nullptr,
-            IID_PPV_ARGS(&allocation.resource_uav_state));
+            IID_PPV_ARGS(&allocation.resourceUavState));
 
-        if (create_resource_hr == E_OUTOFMEMORY)
+        if (createResourceHr == E_OUTOFMEMORY)
         {
             return absl::nullopt;
         }
-        ORT_THROW_IF_FAILED(create_resource_hr);
+        ORT_THROW_IF_FAILED(createResourceHr);
 
         // Reserve enough heaps to store all tiles in the resource.
-        const uint64_t heap_count = 1 + (resource_size_in_tiles - 1) / max_heap_size_in_tiles_;
-        allocation.heaps.resize(heap_count);
+        const uint64_t heapCount = 1 + (resourceSizeInTiles - 1) / m_maxHeapSizeInTiles;
+        allocation.heaps.resize(heapCount);
 
         // Create heaps and map them to the primary reserved resource.
-        D3D12_TILED_RESOURCE_COORDINATE resource_region_start_coordinates = {};
-        uint64_t unmapped_resource_tiles = resource_size_in_tiles;
-        for (uint64_t i = 0; i < heap_count; i++)
+        D3D12_TILED_RESOURCE_COORDINATE resourceRegionStartCoordinates = {};
+        uint64_t unmappedResourceTiles = resourceSizeInTiles;
+        for (uint64_t i = 0; i < heapCount; i++)
         {
             // Create heap. The last heap of the allocation may have fewer tiles to
             // avoid wasting space.
-            uint64_t heap_size_in_tiles = std::min<uint64_t>(
-                unmapped_resource_tiles,
-                max_heap_size_in_tiles_);
-            uint64_t heap_size_in_bytes =
-                heap_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+            uint64_t heapSizeInTiles = std::min<uint64_t>(unmappedResourceTiles, m_maxHeapSizeInTiles);
+            uint64_t heapSizeInBytes = heapSizeInTiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
             auto heap_desc = CD3DX12_HEAP_DESC(
-                heap_size_in_bytes,
-                heap_properties_,
+                heapSizeInBytes,
+                m_heapProperties,
                 0,
-                heap_flags_);
+                m_heapFlags);
 
-            HRESULT create_heap_hr =
-                m_device->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i]));
-            if (create_heap_hr == E_OUTOFMEMORY)
+            HRESULT createHeapHr = m_device->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps[i]));
+            if (createHeapHr == E_OUTOFMEMORY)
             {
                 return absl::nullopt;
             }
-            ORT_THROW_IF_FAILED(create_heap_hr);
+            ORT_THROW_IF_FAILED(createHeapHr);
 
             // Source region in the resource to map.
-            D3D12_TILE_REGION_SIZE resource_region_size = {};
-            resource_region_size.NumTiles = static_cast<uint32_t>(heap_size_in_tiles);
+            D3D12_TILE_REGION_SIZE resourceRegionSize = {};
+            resourceRegionSize.NumTiles = static_cast<uint32_t>(heapSizeInTiles);
 
             // Target range in the current heap to map.
-            const D3D12_TILE_RANGE_FLAGS tile_range_flags =
-                D3D12_TILE_RANGE_FLAG_NONE;
-            const uint32_t heap_range_tile_count = static_cast<uint32_t>(heap_size_in_tiles);
+            constexpr D3D12_TILE_RANGE_FLAGS tileRangeFlags = D3D12_TILE_RANGE_FLAG_NONE;
+            const uint32_t heapRangeTileCount = static_cast<uint32_t>(heapSizeInTiles);
 
-            constexpr uint32_t heap_range_start_offset = 0;
+            constexpr uint32_t heapRangeStartOffset = 0;
             constexpr uint32_t numResourceRegions = 1;
             constexpr uint32_t numHeapRanges = 1;
 
@@ -158,88 +150,83 @@ namespace Dml
             // guaranteed to be set (on the GPU timeline) by the time any code can
             // reference the returned resource. We only execute operations on a
             // single hardware queue so there is no need to wait or signal.
-            queue_->UpdateTileMappings(
-                allocation.resource_uav_state.Get(),
+            m_queue->UpdateTileMappings(
+                allocation.resourceUavState.Get(),
                 numResourceRegions,
-                &resource_region_start_coordinates,
-                &resource_region_size,
+                &resourceRegionStartCoordinates,
+                &resourceRegionSize,
                 allocation.heaps[i].Get(),
                 numHeapRanges,
-                &tile_range_flags,
-                &heap_range_start_offset,
-                &heap_range_tile_count,
+                &tileRangeFlags,
+                &heapRangeStartOffset,
+                &heapRangeTileCount,
                 D3D12_TILE_MAPPING_FLAG_NONE);
 
-            resource_region_start_coordinates.X += static_cast<uint32_t>(heap_size_in_tiles);
-            unmapped_resource_tiles -= heap_size_in_tiles;
+            resourceRegionStartCoordinates.X += static_cast<uint32_t>(heapSizeInTiles);
+            unmappedResourceTiles -= heapSizeInTiles;
         }
 
-        assert(unmapped_resource_tiles == 0);
+        assert(unmappedResourceTiles == 0);
 
         return allocation;
     }
 
-    absl::optional<DmlHeapAllocation> DmlReservedResourceSubAllocator::TryCreateUntiledAllocation(uint64_t size_in_bytes)
+    absl::optional<DmlHeapAllocation> DmlReservedResourceSubAllocator::TryCreateUntiledAllocation(uint64_t sizeInBytes) // returns nullopt only on E_OUTOFMEMORY; any other failure throws
     {
         DmlHeapAllocation allocation = {};
 
         // Create the allocation's sole heap. The allocation may be larger than the
         // requested size to ensure a whole number of tiles.
         allocation.heaps.resize(1);
-        D3D12_HEAP_DESC heap_desc =
-            CD3DX12_HEAP_DESC(size_in_bytes, heap_properties_, 0, heap_flags_);
-        HRESULT create_heap_hr = m_device->CreateHeap(
-            &heap_desc,
-            IID_PPV_ARGS(&allocation.heaps.front()));
-        if (create_heap_hr == E_OUTOFMEMORY)
+        D3D12_HEAP_DESC heap_desc = CD3DX12_HEAP_DESC(sizeInBytes, m_heapProperties, 0, m_heapFlags); // sizeInBytes is used as given; no rounding happens in this function
+        HRESULT createHeapHr = m_device->CreateHeap(&heap_desc, IID_PPV_ARGS(&allocation.heaps.front()));
+        if (createHeapHr == E_OUTOFMEMORY) // out-of-memory is reported to the caller as nullopt instead of an exception
         {
             return absl::nullopt;
         }
+        ORT_THROW_IF_FAILED(createHeapHr); // every other heap-creation failure throws here, before the heap is handed to CreatePlacedResource
 
         // Create large placed resource that spans the heap.
-        D3D12_RESOURCE_DESC resource_desc = CD3DX12_RESOURCE_DESC::Buffer(size_in_bytes, resource_flags_);
+        D3D12_RESOURCE_DESC resourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeInBytes, m_resourceFlags);
 
-        HRESULT create_resource_hr = m_device->CreatePlacedResource(
+        HRESULT createResourceHr = m_device->CreatePlacedResource( // placed at heap offset 0 (second argument)
             allocation.heaps.front().Get(),
             0,
-            &resource_desc,
-            initial_state_,
+            &resourceDesc,
+            m_initialState,
             nullptr,
-            IID_PPV_ARGS(&allocation.resource_uav_state));
-        if (create_resource_hr == E_OUTOFMEMORY)
+            IID_PPV_ARGS(&allocation.resourceUavState));
+        if (createResourceHr == E_OUTOFMEMORY) // same convention as for the heap: OOM becomes nullopt
         {
             return absl::nullopt;
         }
-        ORT_THROW_IF_FAILED(create_resource_hr);
+        ORT_THROW_IF_FAILED(createResourceHr);
 
         return allocation;
     }
 
     uint64_t DmlReservedResourceSubAllocator::ComputeRequiredSize(size_t size)
     {
-        const uint64_t resource_size_in_tiles =
-            1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-        const uint64_t resource_size_in_bytes =
-            resource_size_in_tiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-
-        return resource_size_in_bytes;
+        const uint64_t resourceSizeInTiles = 1 + (size - 1) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; // tiles needed, rounded up; NOTE(review): size == 0 makes (size - 1) wrap - confirm callers never pass 0
+        const uint64_t resourceSizeInBytes = resourceSizeInTiles * D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; // back to bytes: a whole number of tiles
+        return resourceSizeInBytes;
     }
 
-    void* DmlReservedResourceSubAllocator::Alloc(size_t size_in_bytes)
+    void* DmlReservedResourceSubAllocator::Alloc(size_t sizeInBytes)
     {
         // For some reason lotus likes requesting 0 bytes of memory
-        size_in_bytes = std::max<size_t>(1, size_in_bytes);
+        sizeInBytes = std::max<size_t>(1, sizeInBytes);
 
         // The D3D12 device is thread-safe so we don't need to hold the lock while
         // creating an allocation.
         absl::optional<DmlHeapAllocation> allocation =
-            tiling_enabled_ ? TryCreateTiledAllocation(size_in_bytes)
-                            : TryCreateUntiledAllocation(size_in_bytes);
+            m_tilingEnabled ? TryCreateTiledAllocation(sizeInBytes)
+                            : TryCreateUntiledAllocation(sizeInBytes);
 
         ORT_THROW_HR_IF(E_INVALIDARG, !allocation);
 
         // We need to access (mutable) state after this point, so we need to lock
-        std::unique_lock<std::mutex> lock(mutex_);
+        std::unique_lock<std::mutex> lock(m_mutex);
 
         absl::optional<uint32_t> allocationId = TryReserveAllocationID();
         ORT_THROW_HR_IF(E_INVALIDARG, !allocationId);
@@ -247,13 +234,13 @@ namespace Dml
         auto resourceWrapper = wil::MakeOrThrow<DmlReservedResourceWrapper>(std::move(*allocation));
         ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
             this,
-            ++m_currentAllocationId,
+            ++m_currentUniqueAllocationId,
             0,
             resourceWrapper.Get(),
-            size_in_bytes
+            sizeInBytes
         );
 
-        allocations_by_id_.emplace(*allocationId, allocInfo);
+        m_allocationsById.emplace(*allocationId, allocInfo);
 
         lock.unlock();
 
@@ -262,28 +249,28 @@ namespace Dml
     #endif
 
         // DML only has a single device in ORT at the moment
-        constexpr uint64_t device_id = 0;
+        constexpr uint64_t deviceId = 0;
         constexpr uint64_t offset = 0;
-        return TaggedPointer::Pack(device_id, *allocationId, offset);
+        return TaggedPointer::Pack(deviceId, *allocationId, offset);
     }
 
     void DmlReservedResourceSubAllocator::Free(void* ptr)
     {
         ORT_THROW_HR_IF(E_INVALIDARG, ptr == nullptr);
 
-        TaggedPointer tagged_ptr = TaggedPointer::Unpack(ptr);
-        ORT_THROW_HR_IF(E_INVALIDARG, tagged_ptr.offset != 0);
+        TaggedPointer taggedPtr = TaggedPointer::Unpack(ptr); // ptr is a packed (device id, allocation id, offset) value, decoded here rather than dereferenced
+        ORT_THROW_HR_IF(E_INVALIDARG, taggedPtr.offset != 0); // only a pointer to the start of an allocation may be freed
 
         // We need to access (mutable) state after this point, so we need to lock
-        std::unique_lock<std::mutex> lock(mutex_);
+        std::unique_lock<std::mutex> lock(m_mutex);
 
-        auto it = allocations_by_id_.find(tagged_ptr.allocation_id);
-        ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end());
+        auto it = m_allocationsById.find(taggedPtr.allocationId);
+        ORT_THROW_HR_IF(E_INVALIDARG, it == m_allocationsById.end()); // id is not currently registered (unknown or already freed)
 
-        ReleaseAllocationID(tagged_ptr.allocation_id);
+        ReleaseAllocationID(taggedPtr.allocationId); // the id goes back to the free pool so a later reservation can reuse it
 
         // Frees the ID3D12Heap
-        allocations_by_id_.erase(it);
+        m_allocationsById.erase(it); // drops the map's ComPtr reference to the AllocationInfo
     }
 
     uint64_t DmlReservedResourceSubAllocator::GetUniqueId(void* opaquePointer)
@@ -313,60 +300,59 @@ namespace Dml
     absl::optional<uint32_t> DmlReservedResourceSubAllocator::TryReserveAllocationID()
     {
         // The mutex must already be held
-        assert(!mutex_.try_lock());
+        assert(!m_mutex.try_lock()); // NOTE(review): std::mutex::try_lock() on a mutex the calling thread already owns is undefined behavior per the C++ standard - confirm this check is acceptable
 
-        if (!free_allocation_ids_.empty())
+        if (!m_freeAllocationIds.empty()) // released ids are reused (most recently released first) before a new one is minted
         {
             // Return a free ID from the pool
-            uint32_t id = free_allocation_ids_.back();
-            free_allocation_ids_.pop_back();
+            uint32_t id = m_freeAllocationIds.back();
+            m_freeAllocationIds.pop_back();
             return id;
         }
 
-        static constexpr uint32_t kMaxAllocationID =
-            (1 << TaggedPointer::kAllocationIDBits) - 1;
-        if (current_allocation_id_ == kMaxAllocationID)
+        static constexpr uint32_t maxAllocationID = (1 << TaggedPointer::AllocationIDBits) - 1; // largest value that fits in the allocation-id field of a TaggedPointer
+        if (m_currentAllocationId == maxAllocationID)
         {
             // We've reached the maximum number of allocations!
             return absl::nullopt;
         }
 
-        ++current_allocation_id_;
-        return current_allocation_id_;
+        ++m_currentAllocationId; // pre-increment from an initial 0, so the first id handed out is 1 and 0 is never used
+        return m_currentAllocationId;
     }
 
     void DmlReservedResourceSubAllocator::ReleaseAllocationID(uint32_t id)
     {
         // The mutex must already be held
-        assert(!mutex_.try_lock());
+        assert(!m_mutex.try_lock()); // NOTE(review): std::mutex::try_lock() on a mutex the calling thread already owns is undefined behavior per the C++ standard - confirm this check is acceptable
 
         // Add it to the pool of free IDs
-        free_allocation_ids_.push_back(id);
+        m_freeAllocationIds.push_back(id); // no duplicate check: the caller must release each id at most once
     }
 
     D3D12BufferRegion DmlReservedResourceSubAllocator::CreateBufferRegion(
         void* opaquePointer,
-        uint64_t size_in_bytes)
+        uint64_t sizeInBytes) // requested size; rounded up to a multiple of 4 below, so the returned region may be slightly larger
     {
         auto taggedPointer = TaggedPointer::Unpack(opaquePointer);
 
         // We need to access (mutable) state after this point, so we need to lock
-        std::unique_lock<std::mutex> lock(mutex_);
+        std::unique_lock<std::mutex> lock(m_mutex);
 
         // Find the allocation corresponding to this pointer
-        auto it = allocations_by_id_.find(taggedPointer.allocation_id);
-        ORT_THROW_HR_IF(E_INVALIDARG, it == allocations_by_id_.end());
+        auto it = m_allocationsById.find(taggedPointer.allocationId);
+        ORT_THROW_HR_IF(E_INVALIDARG, it == m_allocationsById.end()); // pointer does not refer to a live allocation of this allocator
 
         // Make sure that we are aligned to 4 bytes to satisfy DML's requirements
         constexpr uint64_t DML_ALIGNMENT = 4;
-        size_in_bytes = (1 + (size_in_bytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT;
+        sizeInBytes = (1 + (sizeInBytes - 1) / DML_ALIGNMENT) * DML_ALIGNMENT; // round up to a multiple of DML_ALIGNMENT; a request of 0 wraps around and ends up as 0
 
         // Make sure the region we're trying to create fits entirely in the resource
-        assert(it->second->GetD3D12Resource()->GetDesc().Width >= taggedPointer.offset + size_in_bytes);
+        assert(it->second->GetD3D12Resource()->GetDesc().Width >= taggedPointer.offset + sizeInBytes); // bounds are verified only in builds where assert is active
 
         return D3D12BufferRegion(
             taggedPointer.offset,
-            size_in_bytes,
+            sizeInBytes, // the rounded size, not the caller's original value
             it->second->GetD3D12Resource());
     }
 
@@ -375,10 +361,10 @@ namespace Dml
         auto taggedPointer = TaggedPointer::Unpack(opaquePointer);
 
         // We need to access (mutable) state after this point, so we need to lock
-        std::unique_lock<std::mutex> lock(mutex_);
+        std::unique_lock<std::mutex> lock(m_mutex);
 
         // Find the allocation corresponding to this pointer
-        auto it = allocations_by_id_.find(taggedPointer.allocation_id);
+        auto it = m_allocationsById.find(taggedPointer.allocationId);
         return it->second.Get();
     }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index 3f2f1c9210c64..62b1f5b113ae4 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -43,16 +43,16 @@ namespace Dml
         // Maximum size of a heap (in tiles) when allocations are tiled. Each tile
         // is 64KB. A default size of 512 tiles (32MB) does a good job of handling
         // local video memory fragmentation without requiring lots of heaps.
-        static constexpr uint64_t kDefaultMaxHeapSizeInTiles = 512;
+        static constexpr uint64_t DefaultMaxHeapSizeInTiles = 512;
 
         DmlReservedResourceSubAllocator(
             ID3D12Device* device,
             std::shared_ptr<ExecutionContext> context,
             ID3D12CommandQueue* queue,
-            const D3D12_HEAP_PROPERTIES& heap_props,
-            D3D12_HEAP_FLAGS heap_flags,
-            D3D12_RESOURCE_FLAGS resource_flags,
-            D3D12_RESOURCE_STATES initial_state);
+            const D3D12_HEAP_PROPERTIES& heapProps,
+            D3D12_HEAP_FLAGS heapFlags,
+            D3D12_RESOURCE_FLAGS resourceFlags,
+            D3D12_RESOURCE_STATES initialState);
 
         // Creates a reserved or placed resource buffer over the given memory range.
         // The physical D3D12 resource may be larger than the requested size, so
@@ -61,13 +61,13 @@ namespace Dml
         // the ID3D12Resource is cached, so this call typically has a lower cost
         // than a call to ID3D12Device::CreatePlacedResource or
         // CreateReservedResource.
-        D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t size_in_bytes);
+        D3D12BufferRegion CreateBufferRegion(void* opaquePointer, uint64_t sizeInBytes);
 
         AllocationInfo* GetAllocationInfo(void* opaquePointer);
 
         void FreeResource(AllocationInfo* allocInfo, uint64_t resourceId) final;
         uint64_t ComputeRequiredSize(size_t size);
-        bool TilingEnabled() const { return tiling_enabled_; };
+        bool TilingEnabled() const { return m_tilingEnabled; }; // value computed once in the constructor from the device's reported tiled-resources tier
         uint64_t GetUniqueId(void* opaquePointer);
 
         ~DmlReservedResourceSubAllocator();
@@ -84,7 +84,7 @@ namespace Dml
         void Free(void* p);
 
     private:
-        static const uint32_t c_minResourceSizeExponent = 16; // 2^16 = 64KB
+        static constexpr uint32_t MinResourceSizeExponent = 16; // 2^16 = 64KB
 
         // The pool consists of a number of buckets, and each bucket contains a number of resources of the same size.
         // The resources in each bucket are always sized as a power of two, and each bucket contains resources twice
@@ -106,7 +106,7 @@ namespace Dml
         friend class AllocationInfo;
 
         std::vector<Bucket> m_pool;
-        size_t m_currentAllocationId = 0;
+        size_t m_currentUniqueAllocationId = 0;
         uint64_t m_currentResourceId = 0;
         std::unique_ptr<DmlReservedResourceSubAllocator> m_subAllocator;
 
@@ -115,35 +115,35 @@ namespace Dml
         std::map<size_t, AllocationInfo*> m_outstandingAllocationsById;
     #endif
 
-        std::mutex mutex_;
+        std::mutex m_mutex;
 
         Microsoft::WRL::ComPtr<ID3D12Device> m_device;
         std::shared_ptr<ExecutionContext> m_context;
-        Microsoft::WRL::ComPtr<ID3D12CommandQueue> queue_;
-        const D3D12_HEAP_PROPERTIES heap_properties_;
-        const D3D12_HEAP_FLAGS heap_flags_;
-        const D3D12_RESOURCE_FLAGS resource_flags_;
-        const D3D12_RESOURCE_STATES initial_state_;
-        bool tiling_enabled_;
-        uint64_t max_heap_size_in_tiles_;
+        Microsoft::WRL::ComPtr<ID3D12CommandQueue> m_queue;
+        const D3D12_HEAP_PROPERTIES m_heapProperties;
+        const D3D12_HEAP_FLAGS m_heapFlags;
+        const D3D12_RESOURCE_FLAGS m_resourceFlags;
+        const D3D12_RESOURCE_STATES m_initialState;
+        bool m_tilingEnabled;
+        uint64_t m_maxHeapSizeInTiles;
 
         // The largest allocation ID we've returned so far (or 0 if we've never done
         // so). Note that our allocation IDs start at 1 (not 0) to ensure that it
         // isn't possible for a valid allocation to have a pointer value of
         // 0x00000000.
-        uint32_t current_allocation_id_ = 0;
+        uint32_t m_currentAllocationId = 0;
 
         // A list of unused allocation IDs. This is for re-use of IDs once they get
-        // freed. We only bump the max_allocation_id_ once there are no more free
+        // freed. We only bump m_currentAllocationId once there are no more free
         // IDs.
-        std::vector<uint32_t> free_allocation_ids_;
+        std::vector<uint32_t> m_freeAllocationIds;
 
-        absl::optional<DmlHeapAllocation> TryCreateTiledAllocation(uint64_t size_in_bytes);
-        absl::optional<DmlHeapAllocation> TryCreateUntiledAllocation(uint64_t size_in_bytes);
+        absl::optional<DmlHeapAllocation> TryCreateTiledAllocation(uint64_t sizeInBytes);
+        absl::optional<DmlHeapAllocation> TryCreateUntiledAllocation(uint64_t sizeInBytes);
 
         friend class D3D12BufferRegion;
 
-        absl::flat_hash_map<uint32_t, Microsoft::WRL::ComPtr<AllocationInfo>> allocations_by_id_;
+        absl::flat_hash_map<uint32_t, Microsoft::WRL::ComPtr<AllocationInfo>> m_allocationsById;
 
         // Retrieves a free allocation ID, or nullopt if no more IDs are available.
         absl::optional<uint32_t> TryReserveAllocationID();
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
index de42157645bba..e278ecbeb7415 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceWrapper.h
@@ -17,7 +17,7 @@ namespace Dml
         {
         }
 
-        ID3D12Resource* GetD3D12Resource() const final { return m_allocation.resource_uav_state.Get(); }
+        ID3D12Resource* GetD3D12Resource() const final { return m_allocation.resourceUavState.Get(); }
 
     private:
         DmlHeapAllocation m_allocation;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
index 8f503566768a1..f823d05c45382 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.cpp
@@ -8,40 +8,36 @@ namespace Dml
 {
 /*static*/ TaggedPointer TaggedPointer::Unpack(const void* ptr)
 {
-    uint64_t ptr_val = reinterpret_cast<uint64_t>(ptr);
+    uint64_t ptrVal = reinterpret_cast<uint64_t>(ptr);  // work on the pointer's raw 64-bit value
 
-    static constexpr uint64_t kAllocationIDMask =
-        (1ull << kAllocationIDBits) - 1;
-    static constexpr uint64_t kOffsetMask = (1ull << kOffsetBits) - 1;
+    static constexpr uint64_t allocationIDMask = (1ull << AllocationIDBits) - 1;  // low AllocationIDBits bits set
+    static constexpr uint64_t offsetMask = (1ull << OffsetBits) - 1;  // low OffsetBits bits set
 
-    TaggedPointer tagged_ptr;
-    tagged_ptr.device_id = (ptr_val >> (kAllocationIDBits + kOffsetBits));
-    tagged_ptr.allocation_id = (ptr_val >> kOffsetBits) & kAllocationIDMask;
-    tagged_ptr.offset = (ptr_val & kOffsetMask);
+    TaggedPointer taggedPtr;
+    taggedPtr.deviceId = (ptrVal >> (AllocationIDBits + OffsetBits));  // topmost field: everything above allocation ID + offset
+    taggedPtr.allocationId = (ptrVal >> OffsetBits) & allocationIDMask;  // middle field: drop the offset, mask off the device ID
+    taggedPtr.offset = (ptrVal & offsetMask);  // lowest field: the low OffsetBits bits
 
-    return tagged_ptr;
+    return taggedPtr;
 }
 
-/*static*/ void* TaggedPointer::Pack(
-    uint32_t device_id,
-    uint32_t allocation_id,
-    uint64_t offset)
+/*static*/ void* TaggedPointer::Pack(uint32_t deviceId, uint32_t allocationId, uint64_t offset)
 {
-    assert(device_id < (1ull << kDeviceIDBits));
-    assert(allocation_id < (1ull << kAllocationIDBits));
-    assert(offset < (1ull << kOffsetBits));
+    assert(deviceId < (1ull << DeviceIDBits));  // each field must fit in its bit width (checked only when assert is enabled)
+    assert(allocationId < (1ull << AllocationIDBits));
+    assert(offset < (1ull << OffsetBits));
 
     // Store the device ID in the upper bits of the pointer, followed by the
     // allocation id and the offset in the lower bits
-    uint64_t ptr = ((uint64_t)device_id << (kAllocationIDBits + kOffsetBits)) |
-                   ((uint64_t)allocation_id << kOffsetBits) | offset;
+    uint64_t ptr = (static_cast<uint64_t>(deviceId) << (AllocationIDBits + OffsetBits)) |  // widen to 64 bits before shifting
+                   (static_cast<uint64_t>(allocationId) << OffsetBits) | offset;
 
     return reinterpret_cast<void*>(ptr);
 }
 
 uint64_t TaggedPointer::GetUniqueId() const
 {
-    return reinterpret_cast<uint64_t>(TaggedPointer::Pack(device_id, allocation_id, offset));
+    return reinterpret_cast<uint64_t>(TaggedPointer::Pack(deviceId, allocationId, offset));  // the packed pointer value, as an integer
 }
 
 } // namespace tfdml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
index ee58e23a6396f..d49e9d92eeb82 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlTaggedPointer.h
@@ -16,18 +16,15 @@ namespace Dml
 // must be done using masks and shifts.
 struct TaggedPointer
 {
-    static constexpr uint64_t kDeviceIDBits = 4;
-    static constexpr uint64_t kAllocationIDBits = 20;
-    static constexpr uint64_t kOffsetBits = 40;
-
-    uint64_t device_id : kDeviceIDBits;
-    uint64_t allocation_id : kAllocationIDBits;
-    uint64_t offset : kOffsetBits;
-
-    static void* Pack(
-        uint32_t device_id,
-        uint32_t allocation_id,
-        uint64_t offset);
+    static constexpr uint64_t DeviceIDBits = 4;  // width of the device ID field
+    static constexpr uint64_t AllocationIDBits = 20;  // width of the allocation ID field
+    static constexpr uint64_t OffsetBits = 40;  // width of the offset field
+
+    uint64_t deviceId : DeviceIDBits;
+    uint64_t allocationId : AllocationIDBits;
+    uint64_t offset : OffsetBits;
+
+    static void* Pack(uint32_t deviceId, uint32_t allocationId, uint64_t offset);  // encode the three fields into one pointer-sized value
     static TaggedPointer Unpack(const void* ptr);
     uint64_t GetUniqueId() const;
 };
@@ -36,9 +33,7 @@ static_assert(
     sizeof(TaggedPointer) == sizeof(void*),
     "DML requires a 64-bit architecture");
 static_assert(
-    TaggedPointer::kDeviceIDBits + TaggedPointer::kAllocationIDBits +
-            TaggedPointer::kOffsetBits ==
-        sizeof(void*) * CHAR_BIT,
+    TaggedPointer::DeviceIDBits + TaggedPointer::AllocationIDBits + TaggedPointer::OffsetBits == sizeof(void*) * CHAR_BIT,
     "DML requires a 64-bit architecture");
 
 } // namespace tfdml

From 216fc395c045bf1f38430fc860aae0bb1efe3202 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Sun, 6 Aug 2023 14:04:52 -0700
Subject: [PATCH 68/76] Fix indentation

---
 .../src/ExecutionContext.cpp                  | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index 86f964651b638..9a8ad4b4e6745 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -46,16 +46,17 @@ namespace Dml
             D3D12_HEAP_PROPERTIES heapProperties = {
                 D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0};
 
-            D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER,
-                                                0,
-                                                byteCount,
-                                                1,
-                                                1,
-                                                1,
-                                                DXGI_FORMAT_UNKNOWN,
-                                                {1, 0},
-                                                D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
-                                                D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS};
+            D3D12_RESOURCE_DESC resourceDesc = {
+                D3D12_RESOURCE_DIMENSION_BUFFER,
+                0,
+                byteCount,
+                1,
+                1,
+                1,
+                DXGI_FORMAT_UNKNOWN,
+                {1, 0},
+                D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
+                D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS};
 
             ComPtr<ID3D12Resource> intermediateBuffer;
             ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommittedResource(

From a25b40cfe7adf67e73d68d3b6711e51f2ceb3e94 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Mon, 7 Aug 2023 17:09:58 -0700
Subject: [PATCH 69/76] WIP

---
 .../core/providers/dml/dml_provider_factory.h |  12 --
 .../inc/DmlExecutionProvider.h                |   1 -
 .../src/ExecutionProvider.cpp                 |   6 -
 .../providers/dml/dml_provider_factory.cc     |  35 -----
 .../Api.Image/TensorToVideoFrameConverter.cpp | 109 ++++++-------
 .../Api.Image/VideoFrameToTensorConverter.cpp |  53 +++----
 .../inc/TensorToVideoFrameConverter.h         |  19 +--
 .../inc/VideoFrameToTensorConverter.h         |   5 +-
 winml/lib/Api.Ort/OnnxruntimeEngine.cpp       |  26 ++-
 winml/lib/Api.Ort/OnnxruntimeEngine.h         |   2 +-
 winml/lib/Api/ImageFeatureValue.cpp           | 148 +++++++++---------
 winml/lib/Api/ImageFeatureValue.h             |   2 +-
 winml/lib/Api/impl/TensorBase.h               |  86 +++++-----
 winml/lib/Common/inc/iengine.h                |   2 +-
 winml/test/common/SqueezeNetValidator.cpp     |  34 ++--
 .../cppwinrt/scenariotestscppwinrt.cpp        |   1 -
 16 files changed, 223 insertions(+), 318 deletions(-)

diff --git a/include/onnxruntime/core/providers/dml/dml_provider_factory.h b/include/onnxruntime/core/providers/dml/dml_provider_factory.h
index 2ec3a10b08aed..0782d2d9ed760 100644
--- a/include/onnxruntime/core/providers/dml/dml_provider_factory.h
+++ b/include/onnxruntime/core/providers/dml/dml_provider_factory.h
@@ -99,18 +99,6 @@ struct OrtDmlApi {
    * This API gets the D3D12 resource when an OrtValue has been allocated by the DML EP.
    */
   ORT_API2_STATUS(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* provider, _In_ void* dml_resource, _Out_ ID3D12Resource** d3d_resource);
-
-  /**
-   * GetD3D12ResourceRegionFromAllocation
-   * This API gets the region of a D3D12 resource at a given offset when an OrtValue has been allocated by the DML EP.
-   * Note: Only the subregion of the resource delimited by `offset` and `offset + size_in_bytes` should be accessed
-   */
-  ORT_API2_STATUS(GetD3D12ResourceRegionFromAllocation,
-                  _In_ OrtAllocator* provider,
-                  _In_ void* dml_resource,
-                  _In_ uint64_t size_in_bytes,
-                  _Out_ ID3D12Resource** d3d_resource,
-                  _Out_ uint64_t* offset);
 };
 
 #ifdef __cplusplus
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
index 9ecfec4139756..decf15b194d64 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
@@ -32,7 +32,6 @@ namespace Dml
         bool enableMetacommands,
         bool enableBfcAllocator);
 
-    D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t sizeInBytes);
     void FlushContext(onnxruntime::IExecutionProvider* provider);
     void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode);
     void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index d824aa8185705..5c5f8ebf2c3d1 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -1137,12 +1137,6 @@ namespace Dml
         return std::make_unique<Dml::ExecutionProvider>(dmlDevice, commandQueue, enableMetacommands, enableBfcAllocator);
     }
 
-    D3D12BufferRegion GetD3D12ResourceRegionFromAllocation(onnxruntime::IAllocator* allocator, void* opaquePointer, uint64_t sizeInBytes)
-    {
-        Dml::DmlGpuAllocator* gpuAllocator = static_cast<Dml::DmlGpuAllocator*>(allocator);
-        return gpuAllocator->CreateBufferRegion(opaquePointer, sizeInBytes);
-    }
-
     void FlushContext(onnxruntime::IExecutionProvider* provider)
     {
         ExecutionProvider* dmlexecutionprovider = static_cast<Dml::ExecutionProvider*>(provider);
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index c5f12558e2f63..a3e1b9b040e6e 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -252,47 +252,12 @@ ORT_API_STATUS_IMPL(GetD3D12ResourceFromAllocation, _In_ OrtAllocator* ort_alloc
   API_IMPL_END
 }
 
-ORT_API_STATUS_IMPL(GetD3D12ResourceRegionFromAllocation,
-    _In_ OrtAllocator* ort_allocator,
-    _In_ void* allocation,
-    _In_ uint64_t size_in_bytes,
-    _Out_ ID3D12Resource** d3d_resource,
-    _Out_ uint64_t* offset) {
-  API_IMPL_BEGIN
-#ifdef USE_DML
-  auto wrapping_allocator = static_cast<onnxruntime::OrtAllocatorImplWrappingIAllocator*>(ort_allocator);
-  auto allocator = wrapping_allocator->GetWrappedIAllocator();
-  if (!allocator) {
-    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "No requested allocator available");
-  }
-
-  if (wrapping_allocator->Info()->device.MemType() == OrtDevice::MemType::DML_EXTERNAL) {
-    *d3d_resource = static_cast<Dml::AllocationInfo*>(allocation)->GetD3D12Resource();
-    *offset = 0;
-  } else {
-    ORT_THROW_HR_IF(E_INVALIDARG, wrapping_allocator->Info()->device.MemType() != OrtDevice::MemType::DEFAULT);
-    auto bufferRegion = Dml::GetD3D12ResourceRegionFromAllocation(allocator.get(), allocation, size_in_bytes);
-    *offset = bufferRegion.Offset();
-    *d3d_resource = bufferRegion.GetD3D12Resource();
-  }
-
-  (*d3d_resource)->AddRef();
-
-#else
-  *d3d_resource = nullptr;
-  *offset = 0;
-#endif  // USE_DML
-  return nullptr;
-  API_IMPL_END
-}
-
 static constexpr OrtDmlApi ort_dml_api_10_to_x = {
   &OrtSessionOptionsAppendExecutionProvider_DML,
   &OrtSessionOptionsAppendExecutionProviderEx_DML,
   &CreateGPUAllocationFromD3DResource,
   &FreeGPUAllocation,
   &GetD3D12ResourceFromAllocation,
-  &GetD3D12ResourceRegionFromAllocation,
 };
 
 const OrtDmlApi* GetOrtDmlApi(_In_ uint32_t /*version*/) NO_EXCEPTION {
diff --git a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp
index 6978687f226bf..2654885d6bee8 100644
--- a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp
+++ b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
-  // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #include "lib/Api.Image/pch.h"
 
@@ -128,7 +128,6 @@ class ConvertCPUTensorToVideoFrameWithSoftwareBitmapTelemetryEvent {
 };
 
 void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
-  _In_ uint64_t inputTensorOffset,
   _In_ UINT32 batchIdx,
   _In_ winml::LearningModelSession& session,
   _In_ ID3D12Resource* pInputTensor,
@@ -144,20 +143,16 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
   wgi::SoftwareBitmap softwareBitmap = destVideoFrame.SoftwareBitmap();
 
   if (softwareBitmap) {
-    ConvertGPUTensorToSoftwareBitmap(
-      inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, softwareBitmap
-    );
+    ConvertGPUTensorToSoftwareBitmap(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, softwareBitmap);
   } else if (spDestDirect3DSurface) {
     bool isUAVSupportedFormat = _winmli::FormatSupportedForUAV(
       pDeviceCache->GetD3D12Device(),
       _winmli::GetDXGIFormatFromDirectXPixelFormat(spDestDirect3DSurface.Description().Format)
     );
 
-        // UAV support for formats is device dependent
+    // UAV support for formats is device dependent
     if (!isUAVSupportedFormat) {
-      ConvertDX12TensorToUnsupportedVideoFrameFormat(
-        inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, destVideoFrame
-      );
+      ConvertDX12TensorToUnsupportedVideoFrameFormat(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, destVideoFrame);
     } else {
       ComPtr<ID3D11Texture2D> spVideoFrameTexture =
         _winmli::GetTextureFromDirect3DSurface(destVideoFrame.Direct3DSurface());
@@ -167,7 +162,7 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
       wgi::BitmapBounds bounds = {0, 0, videoFrameTextureDesc.Width, videoFrameTextureDesc.Height};
 
       if (_winmli::TextureIsOnDevice(spVideoFrameTexture.Get(), pDeviceCache->GetD3D11Device())) {
-          // The texture is on our device, so we can just create own texture, share it and cache it
+        // The texture is on our device, so we can just create own texture, share it and cache it
         if (!output_resource_) {
           output_resource_ = CreateShareableD3D12Texture(videoFrameTextureDesc, pDeviceCache->GetD3D12Device());
           D3D11_cached_texture_ = ShareD3D12Texture(output_resource_.Get(), pDeviceCache->GetD3D11Device());
@@ -177,24 +172,22 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
           if (cachedTextureDesc.Width != videoFrameTextureDesc.Width ||
                         cachedTextureDesc.Height != videoFrameTextureDesc.Height ||
                         cachedTextureDesc.Format != videoFrameTextureDesc.Format) {
-              // The dimensions or format don't match, so we need to re-create our texture
+            // The dimensions or format don't match, so we need to re-create our texture
             output_resource_ = CreateShareableD3D12Texture(videoFrameTextureDesc, pDeviceCache->GetD3D12Device());
             D3D11_cached_texture_ = ShareD3D12Texture(output_resource_.Get(), pDeviceCache->GetD3D11Device());
           }
         }
 
-            // Detensorize
-        ConvertGPUTensorToDX12Texture(
-          inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get()
-        );
+        // Detensorize
+        ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get());
 
-            // Make sure that detensorization is done
+        // Make sure that detensorization is done
         SyncD3D12ToD3D11(*pDeviceCache, D3D11_cached_texture_.Get());
 
-            // Finally, copy the detensorized texture to the user's device
+        // Finally, copy the detensorized texture to the user's device
         CopyTextureIntoTexture(D3D11_cached_texture_.Get(), bounds, spVideoFrameTexture.Get());
       } else {
-          // We are not on the same device, so we can't rely on our own cached texture
+        // We are not on the same device, so we can't rely on our own cached texture
         ComPtr<ID3D11Device> spTextureDevice;
         spVideoFrameTexture->GetDevice(&spTextureDevice);
 
@@ -209,11 +202,11 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
                      !spSharedD3D11Texture.Get()) ||
                     (FAILED(spVideoFrameTexture->GetPrivateData(_handleGUID, &handleSize, &sharedHandle)) ||
                      sharedHandle != shared_handle_)) {
-            // Create a new shared texture that we cache on the video frame texture
+          // Create a new shared texture that we cache on the video frame texture
           output_resource_ = CreateShareableD3D12Texture(videoFrameTextureDesc, pDeviceCache->GetD3D12Device());
           spSharedD3D11Texture = ShareD3D12Texture(output_resource_.Get(), spTextureDevice.Get());
 
-              // Cache the shared texture on the video frame texture in order to tie their lifetime together
+          // Cache the shared texture on the video frame texture in order to tie their lifetime together
           WINML_THROW_IF_FAILED(
             spVideoFrameTexture->SetPrivateDataInterface(_d3d11TextureGUID, spSharedD3D11Texture.Get())
           );
@@ -222,20 +215,18 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame(
           );
         }
 
-            // Detensorize
-        ConvertGPUTensorToDX12Texture(
-          inputTensorOffset, batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get()
-        );
+        // Detensorize
+        ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, *pDeviceCache, tensorDesc, output_resource_.Get());
 
-            // Make sure that detensorization is done
+        // Make sure that detensorization is done
         SyncD3D12ToD3D11(*pDeviceCache, spSharedD3D11Texture.Get());
 
-            // Finally, copy the detensorized texture to the user's device
+        // Finally, copy the detensorized texture to the user's device
         CopyTextureIntoTexture(spSharedD3D11Texture.Get(), bounds, spVideoFrameTexture.Get());
       }
     }
   } else {
-      // Invalid video frame
+    // Invalid video frame
     WINML_THROW_HR(E_INVALIDARG);
   }
 }
@@ -266,7 +257,6 @@ ComPtr<ID3D12Resource> TensorToVideoFrameConverter::CreateShareableD3D12Texture(
 }
 
 void TensorToVideoFrameConverter::ConvertDX12TensorToUnsupportedVideoFrameFormat(
-  _In_ uint64_t input_tensor_offset,
   _In_ UINT32 batchIdx,
   _In_ ID3D12Resource* pInputTensor,
   _In_ _winml::D3DDeviceCache& device_cache,
@@ -275,7 +265,7 @@ void TensorToVideoFrameConverter::ConvertDX12TensorToUnsupportedVideoFrameFormat
 ) {
   assert(pInputTensor != nullptr);
 
-      // Find the first supported format and convert to it
+  // Find the first supported format and convert to it
   auto supportedFormatIter = std::find_if(
     _winmli::supportedWinMLFormats.begin(),
     _winmli::supportedWinMLFormats.end(),
@@ -321,15 +311,13 @@ void TensorToVideoFrameConverter::ConvertDX12TensorToUnsupportedVideoFrameFormat
   ));
   converted_video_frame_ = wm::VideoFrame::CreateWithDirect3D11Surface(surface);
 
-      // Detensorize
-  ConvertGPUTensorToDX12Texture(
-    input_tensor_offset, batchIdx, pInputTensor, device_cache, tensorDesc, output_resource_.Get()
-  );
+  // Detensorize
+  ConvertGPUTensorToDX12Texture(batchIdx, pInputTensor, device_cache, tensorDesc, output_resource_.Get());
 
-      // Wait for the D3D12 work to complete before using the resource
+  // Wait for the D3D12 work to complete before using the resource
   SyncD3D12ToD3D11(device_cache, spSharedD3D11Texture.Get());
 
-      // Finally, convert and copy the texture to the destination video frame
+  // Finally, convert and copy the texture to the destination video frame
   converted_video_frame_.CopyToAsync(unsupportedVideoFrame).get();
 }
 
@@ -371,13 +359,13 @@ void TensorToVideoFrameConverter::SoftwareTensorToVideoFrame(
 
   UINT32 tensorHeight = static_cast<UINT32>(tensorDesc.sizes[2]);
   UINT32 tensorWidth = static_cast<UINT32>(tensorDesc.sizes[3]);
-    // create a bitmap bounds for the whole image/tensor
+  // create a bitmap bounds for the whole image/tensor
   wgi::BitmapBounds inputBounds = {0, 0, tensorWidth, tensorHeight};
 
   wgi::SoftwareBitmap spOutputSoftwareBitmap = pDestVideoFrame.SoftwareBitmap();
   wgdx::Direct3D11::IDirect3DSurface spOutputSurface = pDestVideoFrame.Direct3DSurface();
 
-      // only one of softwarebitmap or direct3Dsurface should be non-null
+  // only one of softwarebitmap or direct3Dsurface should be non-null
   if ((spOutputSoftwareBitmap == nullptr && spOutputSurface == nullptr) || (spOutputSoftwareBitmap != nullptr && spOutputSurface != nullptr)) {
     WINML_THROW_HR(E_INVALIDARG);
   }
@@ -416,7 +404,6 @@ void TensorToVideoFrameConverter::SoftwareTensorToVideoFrame(
 }
 
 void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
-  _In_ uint64_t inputTensorOffset,
   _In_ UINT32 batchIdx,
   _In_ ID3D12Resource* pInputResource,
   _In_ _winml::D3DDeviceCache& device_cache,
@@ -433,7 +420,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
   CD3DX12_RECT scissorRect(0, 0, (LONG)outputDesc.Width, outputDesc.Height);
   ComPtr<ID3D12Device> spDx12Device = device_cache.GetD3D12Device();
 
-      // we're inside a lock from the caller of this function, so it's ok to use this static
+  // we're inside a lock from the caller of this function, so it's ok to use this static
   static EventTimer eventTimer;
   std::optional<GPUTensorToDX12TextureTelemetryEvent> telemetryLogger;
   if (eventTimer.Start()) {
@@ -448,7 +435,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
     outputDesc.Format
   );
 
-      // Validate input description
+  // Validate input description
   WINML_THROW_HR_IF_FALSE_MSG(
     E_INVALIDARG, inputDesc.Height != 0, "Invalid input image height provided. Height is set to zero."
   );
@@ -456,7 +443,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
     E_INVALIDARG, inputDesc.Width != 0, "Invalid input image height provided. Height is set to zero."
   );
 
-      // Validate output description
+  // Validate output description
   WINML_THROW_HR_IF_FALSE_MSG(
     E_INVALIDARG, outputDesc.Height != 0, "Invalid input image height provided. Height is set to zero."
   );
@@ -464,7 +451,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
     E_INVALIDARG, outputDesc.Width != 0, "Invalid input image height provided. Height is set to zero."
   );
 
-      // Validate Tensor description
+  // Validate Tensor description
   WINML_THROW_HR_IF_FALSE_MSG(
     E_INVALIDARG,
     tensorDesc.dataType == kImageTensorDataTypeFloat32 || tensorDesc.dataType == kImageTensorDataTypeFloat16,
@@ -504,10 +491,10 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
     (UINT)outputDesc.Width
   );
 
-      // Create descriptor heaps
+  // Create descriptor heaps
   UINT srvUavDescriptorSize = spDx12Device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
 
-      // Create a UAV resource for the shader
+  // Create a UAV resource for the shader
   D3D12_RESOURCE_DESC outputResourceDesc = output_resource_->GetDesc();
   outputResourceDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
 
@@ -524,7 +511,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
   }
 
   if (descriptor_heap_ == nullptr) {
-      // Describe and create a shader resource view (SRV) and unordered access view (UAV) descriptor heap.
+    // Describe and create a shader resource view (SRV) and unordered access view (UAV) descriptor heap.
     D3D12_DESCRIPTOR_HEAP_DESC srvUavHeapDesc = {};
     srvUavHeapDesc.NumDescriptors = DescriptorCount;
     srvUavHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
@@ -533,9 +520,9 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
     descriptor_heap_->SetName(L"Detensorize Descriptor Heap");
   }
 
-      // Create SRV and UAV for input and output respectively
+  // Create SRV and UAV for input and output respectively
   {
-    D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = CreateSRVDescriptor(inputTensorOffset, batchIdx, inputDesc, tensorDesc);
+    D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = CreateSRVDescriptor(batchIdx, inputDesc, tensorDesc);
     CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(
       descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), SrvBufferIdx, srvUavDescriptorSize
     );
@@ -550,15 +537,15 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
     spDx12Device->CreateUnorderedAccessView(UAV_resource_.Get(), nullptr, &uavDesc, uavHandle);
   }
 
-      //
-    // Pipeline setup for shader operation
-    //
+  //
+  // Pipeline setup for shader operation
+  //
   PipelineStateCacheType type = PipelineStateCacheType::kFloat32;
   if (tensorDesc.dataType == kImageTensorDataTypeFloat16) {
     type = PipelineStateCacheType::kFloat16;
   }
 
-      // Set the origin format
+  // Set the origin format
   PipelineStateCacheFormat formatFrom = PipelineStateCacheFormat::kBGR8;
   if (tensorDesc.channelType == kImageTensorChannelTypeRGB8) {
     formatFrom = PipelineStateCacheFormat::kRGB8;
@@ -566,7 +553,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
     formatFrom = PipelineStateCacheFormat::kGRAY8;
   }
 
-      // Set the destination format
+  // Set the destination format
   PipelineStateCacheFormat formatTo = PipelineStateCacheFormat::kBGR8;
   if (outputDesc.Format == DXGI_FORMAT_R8G8B8A8_UNORM) {
     formatTo = PipelineStateCacheFormat::kRGB8;
@@ -580,7 +567,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
 
   ResetCommandList(device_cache);
 
-      // Write compute commands into the command list and put it into the queue.
+  // Write compute commands into the command list and put it into the queue.
   {
     command_list_->SetComputeRootSignature(root_signature_.Get());
 
@@ -647,7 +634,6 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToDX12Texture(
 }
 
 void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap(
-  _In_ uint64_t inputTensorOffset,
   _In_ UINT32 batchIdx,
   _In_ ID3D12Resource* pInputTensor,
   _In_ _winml::D3DDeviceCache& device_cache,
@@ -664,9 +650,9 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap(
     telemetryLogger.emplace(tensorDesc);
   }
 
-  uint64_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2;
-  uint64_t singleVideoFramebufferSize =
-    tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize;
+  uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2;
+  uint32_t singleVideoFramebufferSize =
+    static_cast<uint32_t>(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize);
 
   // TODO: Make an allocator for readback heaps
   if (!readback_heap_ || readback_heap_->GetDesc().Width < singleVideoFramebufferSize) {
@@ -691,7 +677,7 @@ void TensorToVideoFrameConverter::ConvertGPUTensorToSoftwareBitmap(
     readback_heap_.Get(),
     0,
     pInputTensor,
-    inputTensorOffset + singleVideoFramebufferSize * static_cast<uint64_t>(batchIdx),
+    static_cast<uint64_t>(singleVideoFramebufferSize) * batchIdx,
     singleVideoFramebufferSize
   );
 
@@ -766,10 +752,7 @@ void TensorToVideoFrameConverter::ConvertBatchedDX12TensorToBuffers(
 }
 
 D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor(
-  uint64_t offset,
-  const UINT32 batchIdx,
-  const D3D12_RESOURCE_DESC& resourceDesc,
-  const _winml::ImageTensorDescription& desc
+  const UINT32 batchIdx, const D3D12_RESOURCE_DESC& resourceDesc, const _winml::ImageTensorDescription& desc
 ) {
   UINT uiTensorElementSize = desc.dataType == kImageTensorDataTypeFloat32 ? sizeof(UINT) : sizeof(uint16_t);
 
@@ -777,7 +760,7 @@ D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor
   srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
   srvDesc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
   UINT singleImageSize = static_cast<UINT>(desc.sizes[1] * desc.sizes[2] * desc.sizes[3]);
-  srvDesc.Buffer.FirstElement = offset + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
+  srvDesc.Buffer.FirstElement = batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
   srvDesc.Buffer.NumElements = singleImageSize;
   srvDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_NONE;
 
diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
index 5ed73a477b32b..b856c6bdbfeca 100644
--- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
+++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #include "lib/Api.Image/pch.h"
 
@@ -137,7 +137,7 @@ void VideoFrameToTensorConverter::VideoFrameToSoftwareTensor(
   wgi::SoftwareBitmap spInputSoftwareBitmap = inputVideoFrame.SoftwareBitmap();
   wgdx::Direct3D11::IDirect3DSurface spInputSurface = inputVideoFrame.Direct3DSurface();
 
-    // only one of softwarebitmap or direct3Dsurface should be non-null
+  // only one of softwarebitmap or direct3Dsurface should be non-null
   if ((spInputSoftwareBitmap == nullptr && spInputSurface == nullptr) || (spInputSoftwareBitmap != nullptr && spInputSurface != nullptr)) {
     WINML_THROW_IF_FAILED(E_INVALIDARG);
   }
@@ -151,7 +151,7 @@ void VideoFrameToTensorConverter::VideoFrameToSoftwareTensor(
       );
     }
 
-      // Resize the input VideoFrame to converted_video_frame_
+    // Resize the input VideoFrame to converted_video_frame_
     _winmli::ConvertVideoFrameToVideoFrame(
       inputVideoFrame, inputBounds, tensorWidth, tensorHeight, converted_video_frame_
     );
@@ -190,7 +190,6 @@ ComPtr<ID3D12Resource> VideoFrameToTensorConverter::ShareD3D11Texture(
 }
 
 void VideoFrameToTensorConverter::VideoFrameToDX12Tensor(
-  _In_ uint64_t outputTensorOffset,
   _In_ const UINT32 batchIdx,
   _In_ winml::LearningModelSession& session,
   _In_ const wm::IVideoFrame& inputVideoFrame,
@@ -198,7 +197,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor(
   _In_ const ImageTensorDescription& tensorDesc,
   _Inout_ ID3D12Resource* pOutputTensor
 ) {
-   // Validate Tensor description
+  // Validate Tensor description
   WINML_THROW_HR_IF_FALSE_MSG(
     E_INVALIDARG,
     tensorDesc.dataType == kImageTensorDataTypeFloat32 || tensorDesc.dataType == kImageTensorDataTypeFloat16,
@@ -230,9 +229,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor(
   wgdx::Direct3D11::IDirect3DSurface spDirect3DSurface = inputVideoFrame.Direct3DSurface();
 
   if (inputVideoFrame.SoftwareBitmap()) {
-    ConvertSoftwareBitmapToGPUTensor(
-      batchIdx, inputVideoFrame, *pDeviceCache, inputBounds, tensorDesc, outputTensorOffset, pOutputTensor
-    );
+    ConvertSoftwareBitmapToGPUTensor(batchIdx, inputVideoFrame, *pDeviceCache, inputBounds, tensorDesc, pOutputTensor);
   } else if (spDirect3DSurface) {
     ComPtr<ID3D11Texture2D> spVideoFrameTexture;
     wgi::BitmapBounds scaledBounds = inputBounds;
@@ -320,9 +317,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor(
 
     // We cropped the texture, shared it and converted it to a known color format, so it's time to tensorize
     // TODO: merge all videoframes to a single DX12Texture Resource before call ConvertDX12TextureToGPUTensor.
-    ConvertDX12TextureToGPUTensor(
-      outputTensorOffset, batchIdx, input_D3D12_resource_.Get(), *pDeviceCache, tensorDesc, pOutputTensor
-    );
+    ConvertDX12TextureToGPUTensor(batchIdx, input_D3D12_resource_.Get(), *pDeviceCache, tensorDesc, pOutputTensor);
   } else {
     // Invalid video frame
     WINML_THROW_IF_FAILED(E_INVALIDARG);
@@ -330,7 +325,6 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor(
 }
 
 void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor(
-  _In_ uint64_t output_resource_offset,
   _In_ UINT32 batchIdx,
   _In_ ID3D12Resource* pInputResource,
   _In_ _winml::D3DDeviceCache& device_cache,
@@ -412,6 +406,11 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor(
 
   // Validate Tensor Resource
   {
+    D3D12_HEAP_PROPERTIES outputHeapProperties;
+    D3D12_HEAP_FLAGS outputHeapFlags;
+
+    WINML_THROW_IF_FAILED(pOutputResource->GetHeapProperties(&outputHeapProperties, &outputHeapFlags));
+
     UINT64 ullNumElementsTensor = 1;
     for (UINT uiIdx = 0; uiIdx < kImageTensorDimensionCountMax; uiIdx++) {
       WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, tensorDesc.sizes[uiIdx], &ullNumElementsTensor));
@@ -423,10 +422,10 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor(
     UINT64 ullTensorSize = 0;
     WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, uiTensorElementSize, &ullTensorSize));
 
-    if (outputDesc.Width < output_resource_offset + ullTensorSize ||
-        outputDesc.Height != 1 ||
-        outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER ||
-        !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)) {
+    if (outputDesc.Width < ullTensorSize || outputDesc.Height != 1 ||
+            outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER ||
+            !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS) ||
+            outputHeapProperties.Type != D3D12_HEAP_TYPE_DEFAULT) {
       WINML_THROW_IF_FAILED(E_INVALIDARG);
     }
   }
@@ -467,7 +466,7 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor(
     );
     spDx12Device->CreateShaderResourceView(pInputResource, &srvDesc, srvHandle);
 
-    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = CreateUAVDescription(output_resource_offset, batchIdx, tensorDesc);
+    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = CreateUAVDescription(batchIdx, outputDesc, tensorDesc);
     CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(
       descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), UavBufferIdx, srvUavDescriptorSize
     );
@@ -550,7 +549,6 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
   _In_ _winml::D3DDeviceCache& device_cache,
   _In_ const wgi::BitmapBounds& inputBounds,
   _In_ const ImageTensorDescription& tensorDesc,
-  _In_ uint64_t outputResourceOffset,
   _Inout_ ID3D12Resource* pOutputResource
 ) {
   assert(pOutputResource != nullptr);
@@ -593,8 +591,11 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
 
   assert(convertedSoftwareBitmap != nullptr);
 
-  uint64_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2;
-  uint64_t bufferSize = tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize;
+  D3D12_RESOURCE_DESC outputDesc = pOutputResource->GetDesc();
+
+  uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2;
+  uint32_t bufferSize =
+    static_cast<uint32_t>(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize);
 
   // TODO: Make an allocator for upload heaps
   if (!upload_heap_ || upload_heap_->GetDesc().Width < bufferSize) {
@@ -625,13 +626,7 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
   );
   command_list_->ResourceBarrier(1, &barrier);
 
-  command_list_->CopyBufferRegion(
-    pOutputResource,
-    bufferSize * static_cast<uint64_t>(batchIdx) + outputResourceOffset,
-    upload_heap_.Get(),
-    0,
-    bufferSize
-  );
+  command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx, upload_heap_.Get(), 0, bufferSize);
 
   WINML_THROW_IF_FAILED(command_list_->Close());
   ID3D12CommandList* ppCommandLists[] = {command_list_.Get()};
@@ -689,14 +684,14 @@ void VideoFrameToTensorConverter::ConvertBuffersToBatchedGPUTensor(
 }
 
 D3D12_UNORDERED_ACCESS_VIEW_DESC VideoFrameToTensorConverter::CreateUAVDescription(
-  uint64_t offset, const UINT32 batchIdx, const _winml::ImageTensorDescription& desc
+  const UINT32 batchIdx, const D3D12_RESOURCE_DESC& resourceDesc, const _winml::ImageTensorDescription& desc
 ) {
   UINT uiTensorElementSize = desc.dataType == kImageTensorDataTypeFloat32 ? sizeof(UINT) : sizeof(uint16_t);
 
   D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
   uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
   UINT singleImageSize = static_cast<UINT>(desc.sizes[1] * desc.sizes[2] * desc.sizes[3]);
-  uavDesc.Buffer.FirstElement = offset / uiTensorElementSize + batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
+  uavDesc.Buffer.FirstElement = batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
   uavDesc.Buffer.NumElements = singleImageSize;
   uavDesc.Buffer.CounterOffsetInBytes = 0;
   uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
diff --git a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h
index b82fc8e7a5133..12f676459293b 100644
--- a/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h
+++ b/winml/lib/Api.Image/inc/TensorToVideoFrameConverter.h
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #pragma once
 
@@ -12,10 +12,9 @@ class TensorToVideoFrameConverter : public ImageConverter {
  public:
   TensorToVideoFrameConverter() : shared_handle_(nullptr) {}
 
-    // Function takes in a tensor DX12 Resource all compute ops should be completed
-   // converts it to a VideoFrame backed by either a SoftwareBitmap or D3DSurface
+  // Function takes in a tensor DX12 Resource all compute ops should be completed
+  // converts it to a VideoFrame backed by either a SoftwareBitmap or D3DSurface
   void DX12TensorToVideoFrame(
-    _In_ uint64_t inputTensorOffset,
     _In_ UINT32 batch_index,
     _In_ winml::LearningModelSession& session,
     _In_ ID3D12Resource* input_tensor,
@@ -23,8 +22,8 @@ class TensorToVideoFrameConverter : public ImageConverter {
     _Inout_ wm::VideoFrame& destination_video_frame
   );
 
-    // Function takes in a byte pointer to a CPUTensor
-   // converts it to VideoFrame backed by either a SoftwareBitmap or D3DSurface,
+  // Function takes in a byte pointer to a CPUTensor
+  // converts it to VideoFrame backed by either a SoftwareBitmap or D3DSurface,
   void SoftwareTensorToVideoFrame(
     _In_ winml::LearningModelSession& session,
     _In_ BYTE* CPU_tensor_to_convert,
@@ -58,7 +57,6 @@ class TensorToVideoFrameConverter : public ImageConverter {
   Microsoft::WRL::ComPtr<ID3D11Texture2D> ShareD3D12Texture(ID3D12Resource* pResource, ID3D11Device* pDevice);
 
   void ConvertGPUTensorToSoftwareBitmap(
-    _In_ uint64_t inputTensorOffset,
     _In_ UINT32 batch_index,
     _In_ ID3D12Resource* input_tensor,
     _In_ _winml::D3DDeviceCache& device_cache,
@@ -67,7 +65,6 @@ class TensorToVideoFrameConverter : public ImageConverter {
   );
 
   void ConvertGPUTensorToDX12Texture(
-    _In_ uint64_t inputTensorOffset,
     _In_ UINT32 batch_index,
     _In_ ID3D12Resource* input_resource,
     _In_ _winml::D3DDeviceCache& device_cache,
@@ -76,7 +73,6 @@ class TensorToVideoFrameConverter : public ImageConverter {
   );
 
   void ConvertDX12TensorToUnsupportedVideoFrameFormat(
-    _In_ uint64_t input_tensor_offset,
     _In_ UINT32 batch_index,
     _In_ ID3D12Resource* input_tensor,
     _In_ _winml::D3DDeviceCache& device_cache,
@@ -85,10 +81,7 @@ class TensorToVideoFrameConverter : public ImageConverter {
   );
 
   static D3D12_SHADER_RESOURCE_VIEW_DESC TensorToVideoFrameConverter::CreateSRVDescriptor(
-    uint64_t offset,
-    const UINT32 batch_index,
-    const D3D12_RESOURCE_DESC& resource_description,
-    const ImageTensorDescription& description
+    const UINT32 batch_index, const D3D12_RESOURCE_DESC& resource_description, const ImageTensorDescription& description
   );
 
   static void ConvertCPUTensorToSoftwareBitmap(
diff --git a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h
index 4433dfaab299d..e34030bbd6833 100644
--- a/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h
+++ b/winml/lib/Api.Image/inc/VideoFrameToTensorConverter.h
@@ -21,7 +21,6 @@ class VideoFrameToTensorConverter : public ImageConverter {
   // {upperleft X, upperleft Y, width, height} to be turned into a tensor.
   // If the region of interest is the entire VideoFrame, the input BitmapBounds should describe the entire image.
   void VideoFrameToDX12Tensor(
-    _In_ uint64_t output_tensor_offset,
     _In_ const UINT32 batch_index,
     _In_ winml::LearningModelSession& session,
     _In_ const wm::IVideoFrame& input_video_frame,
@@ -72,12 +71,10 @@ class VideoFrameToTensorConverter : public ImageConverter {
     _In_ _winml::D3DDeviceCache& device_cache,
     _In_ const wgi::BitmapBounds& input_bounds,
     _In_ const ImageTensorDescription& tensor_description,
-    _In_ uint64_t outputResourceOffset,
     _Inout_ ID3D12Resource* pOutputResource
   );
 
   void ConvertDX12TextureToGPUTensor(
-    _In_ uint64_t output_resource_offset,
     _In_ const UINT32 batch_index,
     _In_ ID3D12Resource* pInputResource,
     _In_ _winml::D3DDeviceCache& device_cache,
@@ -86,7 +83,7 @@ class VideoFrameToTensorConverter : public ImageConverter {
   );
 
   static D3D12_UNORDERED_ACCESS_VIEW_DESC CreateUAVDescription(
-    uint64_t offset, const UINT32 batch_index, const ImageTensorDescription& description
+    const UINT32 batch_index, const D3D12_RESOURCE_DESC& resource_description, const ImageTensorDescription& description
   );
 
   static void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor(
diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
index efaa3685d45cf..4d0915ab13af8 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
+++ b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 #include "lib/Api.Ort/pch.h"
 
 #include "OnnxruntimeEngine.h"
@@ -108,12 +108,12 @@ HRESULT OnnxruntimeValue::IsCpu(bool* out) {
 }
 
 static uint64_t ShapeSize(const int64_t* shape, size_t count) {
-   // for each dim
+  // for each dim
   int64_t size = 1;
   for (size_t i = 0; i < count; i++) {
-     // find out it's total size
+    // find out its total size
     size *= shape[i];
-     // make sure there are no invalid dimensions (-1 or any invalid shape)
+    // make sure there are no invalid dimensions (-1 or any invalid shape)
     THROW_HR_IF(E_INVALIDARG, shape[i] <= 0);
   }
   return size;
@@ -134,7 +134,7 @@ static auto GetStrings(
   }
   auto length = ShapeSize(shape.data(), shape.size());
 
-    // make a big buffer to hold all the string data
+  // make a big buffer to hold all the string data
   size_t buffer_length;
   THROW_IF_NOT_OK_MSG(ort_api->GetStringTensorDataLength(ort_value, &buffer_length), ort_api);
 
@@ -146,10 +146,10 @@ static auto GetStrings(
     ort_api->GetStringTensorContent(ort_value, buffer.get(), buffer_length, offsets.data(), offsets.size()), ort_api
   );
 
-    // now go build all the strings
+  // now go build all the strings
   for (size_t i = 0; i < length; ++i) {
     size_t str_len = 0;
-     // are we on the last one?
+    // are we on the last one?
     if (i == (length - 1)) {
       str_len = buffer_length - offsets[i];
     } else {
@@ -161,7 +161,7 @@ static auto GetStrings(
   return std::make_shared<std::pair<decltype(strings), decltype(buffer)>>(std::move(strings), std::move(buffer));
 }
 
-HRESULT OnnxruntimeValue::GetResource(uint64_t size_in_bytes, _winml::Resource& out, uint64_t& offset) {
+HRESULT OnnxruntimeValue::GetResource(_winml::Resource& out) {
   auto ort_api = engine_->GetEngineFactory()->UseOrtApi();
 
   void* mutable_data = nullptr;
@@ -185,10 +185,7 @@ HRESULT OnnxruntimeValue::GetResource(uint64_t size_in_bytes, _winml::Resource&
 
     winrt::com_ptr<ID3D12Resource> resource;
     RETURN_HR_IF_NOT_OK_MSG(
-      ort_dml_api->GetD3D12ResourceRegionFromAllocation(
-        allocator.get(), mutable_data, size_in_bytes, resource.put(), &offset
-      ),
-      ort_api
+      ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), mutable_data, resource.put()), ort_api
     );
     out = _winml::Resource(resource.get(), [](void*) { /*do nothing, as this pointer is actually a com pointer! */ });
   } else {
@@ -1406,11 +1403,10 @@ HRESULT OnnxruntimeEngine::FillFromMapValue(
   std::vector<int64_t> keys_shape;
   keys_value->GetTensorShape(keys_shape);
 
-  uint64_t offset = 0;
   _winml::Resource keys_data;
-  RETURN_IF_FAILED(keys_value->GetResource(0, keys_data, offset));
+  RETURN_IF_FAILED(keys_value->GetResource(keys_data));
   _winml::Resource values_data;
-  RETURN_IF_FAILED(values_value->GetResource(0, values_data, offset));
+  RETURN_IF_FAILED(values_value->GetResource(values_data));
 
   auto num_elements = static_cast<size_t>(ShapeSize(keys_shape.data(), keys_shape.size()));
   GetAbiMapFiller(key_kind, value_kind)(map, num_elements, keys_data.get(), values_data.get());
diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.h b/winml/lib/Api.Ort/OnnxruntimeEngine.h
index 0fb4aa73a1a96..5974d46b82c4f 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngine.h
+++ b/winml/lib/Api.Ort/OnnxruntimeEngine.h
@@ -31,7 +31,7 @@ class OnnxruntimeValue
   STDMETHOD(IsCpu)
   (bool* out) override;
   STDMETHOD(GetResource)
-  (uint64_t size_in_bytes, _winml::Resource& resource, uint64_t& offset) override;
+  (_winml::Resource& resource) override;
   STDMETHOD(IsTensor)
   (bool* out) override;
   STDMETHOD(IsOfTensorType)
diff --git a/winml/lib/Api/ImageFeatureValue.cpp b/winml/lib/Api/ImageFeatureValue.cpp
index 4f824c072e8a2..3e36092ad5ebe 100644
--- a/winml/lib/Api/ImageFeatureValue.cpp
+++ b/winml/lib/Api/ImageFeatureValue.cpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #include "lib/Api/pch/pch.h"
 #include "ImageFeatureValue.h"
@@ -47,7 +47,7 @@ WINML_CATCH_ALL
 void ImageFeatureValue::Initialize() {
   m_batchSize = m_videoFrames.Size();
   for (auto videoFrame : m_videoFrames) {
-     // TODO: Check all videoFrames come from either CPU or GPU.
+    // TODO: Check all videoFrames come from either CPU or GPU.
     if (auto surface = videoFrame.Direct3DSurface()) {
       wgdx::Direct3D11::Direct3DSurfaceDescription description = surface.Description();
       m_widths.emplace_back(description.Width);
@@ -148,15 +148,15 @@ wgi::BitmapBounds ImageFeatureValue::CenterAndCropBounds(uint32_t idx, uint32_t
   wgi::BitmapBounds bounds = {};
   float RequiredAspectRatio = static_cast<float>(desiredWidth) / static_cast<float>(desiredHeight);
 
-    // crop to center while maintaining size
+  // crop to center while maintaining size
   if (RequiredAspectRatio * m_heights[idx] < m_widths[idx]) {
-     // actual width is too wide. Cut off left and right of image
+    // actual width is too wide. Cut off left and right of image
     bounds.Width = std::min((UINT)(RequiredAspectRatio * m_heights[idx] + 0.5f), m_widths[idx]);
     bounds.Height = m_heights[idx];
     bounds.X = (m_widths[idx] - bounds.Width) / 2;
     bounds.Y = 0;
   } else {
-     // actual height is too long. Cut off top and bottom
+    // actual height is too long. Cut off top and bottom
     bounds.Width = m_widths[idx];
     bounds.Height = std::min((UINT)(m_widths[idx] / RequiredAspectRatio + 0.5f), m_heights[idx]);
     bounds.X = 0;
@@ -249,14 +249,14 @@ static void CPUTensorize(
 
   auto pooledConverter = _winml::PoolObjectWrapper::Create(spDevice->TensorizerStore()->Fetch(descriptor));
 
-    //apply tensorization
+  //apply tensorization
   pooledConverter->Get()->Tensorizer->VideoFrameToSoftwareTensor(
     videoFrame, bounds, tensorDescriptor, reinterpret_cast<BYTE*>(pResource)
   );
 
-    // Software tensorization doesnt need to hold onto any resources beyond its scope, so we can
-   // return the converter to the pool on tensorization completion.
-   // (This happens automatically in the destruction of PoolObjectWrapper)
+  // Software tensorization doesn't need to hold onto any resources beyond its scope, so we can
+  // return the converter to the pool on tensorization completion.
+  // (This happens automatically in the destruction of PoolObjectWrapper)
 }
 
 static void CPUTensorize(
@@ -267,7 +267,7 @@ static void CPUTensorize(
   BYTE* resource,
   unsigned int singleFrameBufferSize
 ) {
-   // Tensorize video frames one by one without extra copy.
+  // Tensorize video frames one by one without extra copy.
   for (uint32_t batchIdx = 0; batchIdx < videoFrames.Size(); ++batchIdx) {
     CPUTensorize(videoFrames.GetAt(batchIdx), bounds[batchIdx], tensorDescriptor, spSession, resource);
     resource += singleFrameBufferSize;
@@ -280,7 +280,6 @@ static void GPUTensorize(
   _winml::ImageTensorDescription tensorDescriptor,
   com_ptr<LearningModelSession> spSession,
   ID3D12Resource* d3dResource,
-  uint64_t resourceOffset,
   _winml::BindingContext& context
 ) {
   auto spDevice = spSession->Device().as<LearningModelDevice>();
@@ -291,24 +290,24 @@ static void GPUTensorize(
   descriptor.height = static_cast<int>(tensorDescriptor.sizes[2]);
   descriptor.luid = spDevice->GetD3DDevice()->GetAdapterLuid();  // Converted image on GPU
 
-    // Tensorize video frames one by one without extra copy.
+  // Tensorize video frames one by one without extra copy.
   for (uint32_t batchIdx = 0; batchIdx < videoFrames.Size(); ++batchIdx) {
     auto pooledConverter = _winml::PoolObjectWrapper::Create(spDevice->TensorizerStore()->Fetch(descriptor));
     {
-       // Apply tensorization
+      // Apply tensorization
       auto session = spSession.as<winml::LearningModelSession>();
       pooledConverter->Get()->Tensorizer->VideoFrameToDX12Tensor(
-        resourceOffset, batchIdx, session, videoFrames.GetAt(batchIdx), bounds[batchIdx], tensorDescriptor, d3dResource
+        batchIdx, session, videoFrames.GetAt(batchIdx), bounds[batchIdx], tensorDescriptor, d3dResource
       );
 
-        // Tensorization to a GPU tensor will run asynchronously and associated resources
-       // need to be kept alive until the gpu resources have been used in the queue.
-       //
-       // The PoolObjectWrapper needs to stay alive so that the underlying resources are
-       // not released to the cache.
-       //
-       // This object will be returned to the cache when evaluate has completed. So we cache this
-       // on the binding context.
+      // Tensorization to a GPU tensor will run asynchronously and associated resources
+      // need to be kept alive until the gpu resources have been used in the queue.
+      //
+      // The PoolObjectWrapper needs to stay alive so that the underlying resources are
+      // not released to the cache.
+      //
+      // This object will be returned to the cache when evaluate has completed. So we cache this
+      // on the binding context.
       context.converter = pooledConverter;
     }
   }
@@ -324,13 +323,13 @@ std::optional<ImageFeatureValue::ImageResourceMetadata> ImageFeatureValue::GetIn
   auto spImageDescriptor = context.descriptor.try_as<ImageFeatureDescriptor>();
   auto spTensorDescriptor = context.descriptor.try_as<TensorFeatureDescriptor>();
 
-    // Set up descriptorWidth and descriptorHeight
+  // Set up descriptorWidth and descriptorHeight
   if (spImageDescriptor) {
-     // If model expects free dimensions the descritpr will have MAXUINT32, and we use the supplied image
+    // If model expects free dimensions the descriptor will have MAXUINT32, and we use the supplied image's size
 
-      // If the width or height in model metadata is -1, which means free dimension.
-     // The the widths and heights of input data must be the same. Or the
-     // tensorDescriptor cannot describ the shape of the inputs.
+    // If the width or height in model metadata is -1, it is a free dimension.
+    // Then the widths and heights of the input data must all be the same; otherwise the
+    // tensorDescriptor cannot describe the shape of the inputs.
     if (spImageDescriptor->Width() == MAXUINT32 &&
             !(std::adjacent_find(m_widths.begin(), m_widths.end(), std::not_equal_to<uint32_t>()) == m_widths.end())) {
       THROW_HR(E_INVALIDARG);
@@ -344,7 +343,7 @@ std::optional<ImageFeatureValue::ImageResourceMetadata> ImageFeatureValue::GetIn
     descriptorHeight = (spImageDescriptor->Height() == MAXUINT32) ? m_heights[0] : spImageDescriptor->Height();
     tensorKind = spImageDescriptor->TensorKind();
   } else if (spTensorDescriptor) {
-     // If model expects a tensor, use its shape
+    // If model expects a tensor, use its shape
     auto shape = spTensorDescriptor->Shape();
 
     if (shape.Size() != 4) {
@@ -370,28 +369,28 @@ std::optional<ImageFeatureValue::ImageResourceMetadata> ImageFeatureValue::GetIn
     return {};
   }
 
-    // Set up BitmapBounds
-   // For batch of images with different sizes, like { {1, 3, 1080, 1080}, {1, 3, 720, 720} },
-   // a vector of bounds is to record the result after cropped.
+  // Set up BitmapBounds
+  // For batch of images with different sizes, like { {1, 3, 1080, 1080}, {1, 3, 720, 720} },
+  // a vector of bounds is to record the result after cropped.
   std::vector<wgi::BitmapBounds> bounds = {};
   for (uint32_t i = 0; i < m_batchSize; ++i) {
     auto tempBounds = GetBoundsFromMetadata(context.properties);
     if (!tempBounds.has_value()) {
-       // If the user has not specified bounds, we need to infer the bounds
-       // from the combination of descriptor, and input value or output value
+      // If the user has not specified bounds, we need to infer the bounds
+      // from the combination of descriptor, and input value or output value
       if (context.type == _winml::BindingType::kInput) {
-         // If unspecified output, get the crop with correct aspect ratio
+        // If given an unspecified input region, get the center crop with the correct aspect ratio
         tempBounds = CenterAndCropBounds(i, descriptorWidth, descriptorHeight);
       } else {
-         // If given an unspecified output region, write into the top left portion of the output image.
+        // If given an unspecified output region, write into the top left portion of the output image.
         tempBounds = wgi::BitmapBounds{0, 0, m_widths[i], m_heights[i]};
       }
     }
     bounds.emplace_back(tempBounds.value());
   }
-   // TODO: Validate Bounds
+  // TODO: Validate Bounds
 
-    // Set up BitmapPixelFormat
+  // Set up BitmapPixelFormat
   auto pixelFormat = std::optional<wgi::BitmapPixelFormat>{};
   pixelFormat = GetBitmapPixelFormatFromMetadata(context.properties);
   if (!pixelFormat.has_value() && spImageDescriptor) {
@@ -400,23 +399,23 @@ std::optional<ImageFeatureValue::ImageResourceMetadata> ImageFeatureValue::GetIn
     auto shape = spTensorDescriptor->Shape();
     int channelCount = static_cast<uint32_t>(shape.GetAt(1));
     if (channelCount == 1) {
-       // Assume Gray if no image descriptor is given and channelcount 1
+      // Assume Gray if no image descriptor is given and channelcount 1
       pixelFormat = wgi::BitmapPixelFormat::Gray8;
 
     } else if (channelCount == 3) {
-       // Assume Bgra8 if no image descriptor is given
+      // Assume Bgra8 if no image descriptor is given
       pixelFormat = wgi::BitmapPixelFormat::Bgra8;
     } else {
       THROW_HR(WINML_ERR_SIZE_MISMATCH);
     }
   }
 
-    // Set up LearningModelPixelRange
+  // Set up LearningModelPixelRange
   auto pixelRange = std::optional<winml::LearningModelPixelRange>{};
   pixelRange = GetBitmapPixelRangeFromMetadata(context.properties);
   if (pixelRange.has_value()) {
-     // The pixel range was set by the bind properties, skip all checks and honor
-     // the user provided normalization property. Do nothing.
+    // The pixel range was set by the bind properties, skip all checks and honor
+    // the user provided normalization property. Do nothing.
   } else if (!pixelRange.has_value() && spImageDescriptor) {
     pixelRange = spImageDescriptor->PixelRange();
   } else if (!pixelRange.has_value() && spTensorDescriptor) {
@@ -437,17 +436,17 @@ HRESULT ImageFeatureValue::GetValue(_winml::BindingContext& context, _winml::IVa
   FAIL_FAST_IF(!(std::all_of(m_widths.begin(), m_widths.end(), [](int i) { return i != 0; })));
   FAIL_FAST_IF(!(std::all_of(m_heights.begin(), m_heights.end(), [](int i) { return i != 0; })));
 
-    // Get image metadata from the binding context
+  // Get image metadata from the binding context
   auto metadata = GetInputMetadata(context);
   RETURN_HR_IF(E_INVALIDARG, !metadata);
   ImageResourceMetadata resourceMetadata = metadata.value();
 
-    // Get the session
+  // Get the session
   auto spSession = context.session.as<LearningModelSession>();
   auto spDevice = spSession->Device().as<LearningModelDevice>();
   auto engine = spSession->GetEngine();
 
-    // create the OrtValue
+  // create the OrtValue
   winrt::com_ptr<_winml::IValue> value;
   RETURN_IF_FAILED(engine->CreateTensorValue(
     resourceMetadata.TensorDescriptor.sizes,
@@ -458,21 +457,19 @@ HRESULT ImageFeatureValue::GetValue(_winml::BindingContext& context, _winml::IVa
     value.put()
   ));
 
-  auto bufferSize = std::accumulate(
-    std::begin(resourceMetadata.TensorDescriptor.sizes),
-    std::end(resourceMetadata.TensorDescriptor.sizes),
-    static_cast<int64_t>(1),
-    std::multiplies<int64_t>()
-  );
-  auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize;
-
-    // Get the tensor raw data
+  // Get the tensor raw data
   _winml::Resource void_resource;
-  uint64_t offset = 0;
-  RETURN_IF_FAILED(value->GetResource(bufferByteSize, void_resource, offset));
+  RETURN_IF_FAILED(value->GetResource(void_resource));
 
   if (context.type == _winml::BindingType::kInput) {
-     // Only tensorize inputs
+    // Only tensorize inputs
+    auto bufferSize = std::accumulate(
+      std::begin(resourceMetadata.TensorDescriptor.sizes),
+      std::end(resourceMetadata.TensorDescriptor.sizes),
+      static_cast<int64_t>(1),
+      std::multiplies<int64_t>()
+    );
+    auto bufferByteSize = GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize;
     auto singleFrameBufferSize = bufferByteSize / m_batchSize;
     if (spDevice->IsCpuDevice()) {
       auto resource = reinterpret_cast<BYTE*>(void_resource.get());
@@ -487,7 +484,7 @@ HRESULT ImageFeatureValue::GetValue(_winml::BindingContext& context, _winml::IVa
     } else {
       auto resource = reinterpret_cast<ID3D12Resource*>(void_resource.get());
       GPUTensorize(
-        m_videoFrames, resourceMetadata.Bounds, resourceMetadata.TensorDescriptor, spSession, resource, offset, context
+        m_videoFrames, resourceMetadata.Bounds, resourceMetadata.TensorDescriptor, spSession, resource, context
       );
     }
   }
@@ -504,28 +501,18 @@ HRESULT ImageFeatureValue::IsPlaceholder(bool* pIsPlaceHolder) {
 }
 
 HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& context, _winml::IValue* value) try {
-   // Get the device
+  // Get the device
   auto spSession = context.session.as<LearningModelSession>();
   auto spDevice = spSession->Device().as<LearningModelDevice>();
 
-    // Get the run context
+  // Get the output tensor raw data
+  _winml::Resource void_resource;
+  RETURN_IF_FAILED(value->GetResource(void_resource));
+
+  // Get the run context
   auto metadata = GetInputMetadata(context);
   ImageResourceMetadata resourceMetadata = metadata.value();
 
-  auto bufferSize = std::accumulate(
-    std::begin(resourceMetadata.TensorDescriptor.sizes),
-    std::end(resourceMetadata.TensorDescriptor.sizes),
-    static_cast<int64_t>(1),
-    std::multiplies<int64_t>()
-  );
-  auto bufferByteSize =
-    GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize / m_batchSize;
-
-    // Get the output tensor raw data
-  _winml::Resource void_resource;
-  uint64_t offset = 0;
-  RETURN_IF_FAILED(value->GetResource(bufferByteSize, void_resource, offset));
-
   _winml::ConverterResourceDescription descriptor = {};
   descriptor.width = static_cast<int>(resourceMetadata.TensorDescriptor.sizes[3]);
   descriptor.height = static_cast<int>(resourceMetadata.TensorDescriptor.sizes[2]);
@@ -537,9 +524,18 @@ HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& cont
 
     auto pooledConverter = _winml::PoolObjectWrapper::Create(spDevice->DetensorizerStore()->Fetch(descriptor));
 
+    auto bufferSize = std::accumulate(
+      std::begin(resourceMetadata.TensorDescriptor.sizes),
+      std::end(resourceMetadata.TensorDescriptor.sizes),
+      static_cast<int64_t>(1),
+      std::multiplies<int64_t>()
+    );
+    auto bufferByteSize =
+      GetSizeFromTensorDataType(resourceMetadata.TensorDescriptor.dataType) * bufferSize / m_batchSize;
+
     BYTE* resource = reinterpret_cast<BYTE*>(void_resource.get());
     for (uint32_t batchIdx = 0; batchIdx < m_batchSize; ++batchIdx) {
-       // Convert Software Tensor to VideoFrame one by one based on the buffer size.
+      // Convert Software Tensor to VideoFrame one by one based on the buffer size.
       auto videoFrame = m_videoFrames.GetAt(batchIdx);
       pooledConverter->Get()->Detensorizer->SoftwareTensorToVideoFrame(
         context.session, resource, resourceMetadata.TensorDescriptor, videoFrame
@@ -557,7 +553,7 @@ HRESULT ImageFeatureValue::UpdateSourceResourceData(_winml::BindingContext& cont
     for (uint32_t batchIdx = 0; batchIdx < m_batchSize; ++batchIdx) {
       auto videoFrame = m_videoFrames.GetAt(batchIdx);
       pooledConverter->Get()->Detensorizer->DX12TensorToVideoFrame(
-        offset, batchIdx, context.session, d3dResource, resourceMetadata.TensorDescriptor, videoFrame
+        batchIdx, context.session, d3dResource, resourceMetadata.TensorDescriptor, videoFrame
       );
 
       // Reset the Allocator before return to the Cache. Must Sync this background thread to that completion before we do.
diff --git a/winml/lib/Api/ImageFeatureValue.h b/winml/lib/Api/ImageFeatureValue.h
index 83a21c8679cf3..92f3cab43b432 100644
--- a/winml/lib/Api/ImageFeatureValue.h
+++ b/winml/lib/Api/ImageFeatureValue.h
@@ -1,4 +1,4 @@
-﻿// Copyright (c) Microsoft Corporation. All rights reserved.
+﻿﻿// Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
 #pragma once
diff --git a/winml/lib/Api/impl/TensorBase.h b/winml/lib/Api/impl/TensorBase.h
index b8cdf6f66a587..c9299a00ddaa2 100644
--- a/winml/lib/Api/impl/TensorBase.h
+++ b/winml/lib/Api/impl/TensorBase.h
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #pragma once
 
@@ -26,27 +26,27 @@
 
 namespace _winml {
 
-  // TensorBase
- //
- // This is the base class for all data based Tensor types. It exposes array and IVectorView
- // based getter and setters.
- //
- // Look in FeatureValue.h to see where all of them actually get created with CREATE_TENSOR()
- //
- // Supported derived classes:
- //    Float, Int8, UInt8, UInt16, Int16, Int32, Int64, Boolean, Double, UInt32, UInt64
- //
- // Unsupported types
- //    Float16 and String have different access patterns and Int8, Complex64, Complex128 are unsupported
- //
+// TensorBase
+//
+// This is the base class for all data based Tensor types. It exposes array and IVectorView
+// based getter and setters.
+//
+// Look in FeatureValue.h to see where all of them actually get created with CREATE_TENSOR()
+//
+// Supported derived classes:
+//    Float, Int8, UInt8, UInt16, Int16, Int32, Int64, Boolean, Double, UInt32, UInt64
+//
+// Unsupported types
+//    Float16 and String have different access patterns and Int8, Complex64, Complex128 are unsupported
+//
 template <typename T, typename ViewT, typename TDerived, typename TInterface, typename TBase>
 struct TensorBase : TBase {
   template <typename ElementType = T, typename ElementViewType = ViewT>
   static void ASSERT_TEMPLATE_PARAMETERS() {
-     // This adds compile time checks that ensure that the API can only be called when:
-     //   1) the first template parameter matches the internal type (T),
-     //      since the api attempts copy the tensor memory of type T into a vector of type ElementType.
-     //   2) the second template parameter matches the return type
+    // This adds compile time checks that ensure that the API can only be called when:
+    //   1) the first template parameter matches the internal type (T),
+    //      since the api attempts to copy the tensor memory of type T into a vector of type ElementType.
+    //   2) the second template parameter matches the return type
     static_assert(
       std::is_same<T, ElementType>::value,
       "This API can only be called with template parameters that match its internal data type T."
@@ -59,9 +59,9 @@ struct TensorBase : TBase {
 
   template <typename ElementType = T, typename ElementViewType = ViewT>
   static void ASSERT_TEMPLATE_PARAMETERS_EXACT() {
-     // This adds compile time checks that ensure that the API can only be called when:
-     //   1) the conditions of ASSERT_TEMPLATE_PARAMETERS() are met.
-     //   2) the ABI type (ViewT) matches the internal type (t).
+    // This adds compile time checks that ensure that the API can only be called when:
+    //   1) the conditions of ASSERT_TEMPLATE_PARAMETERS() are met.
+    //   2) the ABI type (ViewT) matches the internal type (T).
     ASSERT_TEMPLATE_PARAMETERS<ElementType, ElementViewType>();
 
     static_assert(
@@ -70,18 +70,18 @@ struct TensorBase : TBase {
     );
   }
 
-    /// On creation, tensors can either:
-   ///  1) act as a placeholder without any backing memory (output tensors, chained values). In this case we
-   ///     create the backing memory when the buffer is accessed. The buffer is allocated one of there scenarios:
-   ///         GPUTensorize during binding (used to create DML resources for chaining)
-   ///         UpdateSourceResourceData after eval (used for output placeholder tensors or unbound outputs)
-   ///         GetBuffer when accessed by users
-   ///    a) TensorBase()
-   ///  2) allocate backing cpu memory (when a shape is provided)
-   ///    a) TensorBase(std::vector<int64_t> const& shape)
-   ///    b) TensorBase(winrt::Windows::Foundation::Collections::IIterable<int64_t> const& shape)
-   ///  3) use provided backing gpu memory
-   ///    a) TensorBase(std::vector<int64_t> const& shape, ID3D12Resource* pResource)
+  /// On creation, tensors can either:
+  ///  1) act as a placeholder without any backing memory (output tensors, chained values). In this case we
+  ///     create the backing memory when the buffer is accessed. The buffer is allocated in one of these scenarios:
+  ///         GPUTensorize during binding (used to create DML resources for chaining)
+  ///         UpdateSourceResourceData after eval (used for output placeholder tensors or unbound outputs)
+  ///         GetBuffer when accessed by users
+  ///    a) TensorBase()
+  ///  2) allocate backing cpu memory (when a shape is provided)
+  ///    a) TensorBase(std::vector<int64_t> const& shape)
+  ///    b) TensorBase(winrt::Windows::Foundation::Collections::IIterable<int64_t> const& shape)
+  ///  3) use provided backing gpu memory
+  ///    a) TensorBase(std::vector<int64_t> const& shape, ID3D12Resource* pResource)
   TensorBase() : resources_(std::make_shared<TensorResources<T>>()) {}
 
   TensorBase(wfc::IIterable<int64_t> const& shape)
@@ -97,7 +97,7 @@ struct TensorBase : TBase {
   TensorBase(std::vector<int64_t> const& shape, ID3D12Resource* resource)
     : shape_(shape),
       resources_(std::make_shared<TensorResources<T>>()) {
-     // This Api is not supported for TensorString
+    // This Api is not supported for TensorString
     WINML_THROW_HR_IF_TRUE_MSG(
       E_ILLEGAL_METHOD_CALL,
       (std::is_same<T, std::string>::value),
@@ -132,7 +132,7 @@ struct TensorBase : TBase {
       return CreateTensorValueFromExternalBuffer(engine, should_sync_buffer, out);
     }
 
-      // If there is no matching cpu resource, then fallback to a gpu resource
+    // If there is no matching cpu resource, then fallback to a gpu resource
     if (GpuTensor() != nullptr) {
       return CreateGPUMLValue(GpuTensor().get(), context, out);
     }
@@ -145,18 +145,18 @@ struct TensorBase : TBase {
       return CreateGPUMLValue(GpuTensor().get(), context, out);
     }
 
-      // Get engine
+    // Get engine
     auto session = context.session.as<winmlp::LearningModelSession>();
     auto device = session->Device().as<winmlp::LearningModelDevice>();
     auto engine = session->GetEngine();
 
     auto should_sync_buffer = context.type == _winml::BindingType::kInput;
 
-      // If there is no matching gpu resource, then fallback to a cpu resource
+    // If there is no matching gpu resource, then fallback to a cpu resource
     if (CpuTensor() != nullptr) {
       auto num_backing_buffers = CpuTensor()->num_buffers();
       if (num_backing_buffers == 1) {
-         // If we have a single backing cpu buffer, there is no need to create GPU resources.
+        // If we have a single backing cpu buffer, there is no need to create GPU resources.
         // The engine will use the buffer provided, and perform the needed copies into the GPU context as needed.
         return CreateTensorValueFromExternalBuffer(engine, should_sync_buffer, out);
       } else {
@@ -374,13 +374,11 @@ struct TensorBase : TBase {
       "The tensor has been closed and its resources have been detached during evaluation!"
     );
 
+    _winml::Resource updated_resource;
+    RETURN_IF_FAILED(value->GetResource(updated_resource));
+
     // get the shape
     RETURN_IF_FAILED_MSG(value->GetTensorShape(shape_), "Failed to get the tensor shape from resource!");
-    auto buffer_size_in_bytes = static_cast<size_t>(ShapeSize(shape_)) * sizeof(T);
-
-    _winml::Resource updated_resource;
-    uint64_t offset = 0;
-    RETURN_IF_FAILED(value->GetResource(buffer_size_in_bytes, updated_resource, offset));
 
     bool is_cpu;
     bool isCpuOutput = SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu;
@@ -424,6 +422,8 @@ struct TensorBase : TBase {
         );
         RETURN_IF_FAILED(engine->CopyValueAcrossDevices(value, dest.get()));
       } else {
+        auto buffer_size_in_bytes = static_cast<size_t>(ShapeSize(shape_)) * sizeof(T);
+
         _winml::ConverterResourceDescription descriptor = {};
         descriptor.pixel_format = static_cast<DWORD>(wgdx::DirectXPixelFormat::Unknown);
         descriptor.luid = device->GetD3DDevice()->GetAdapterLuid();  // Converted image on GPU
diff --git a/winml/lib/Common/inc/iengine.h b/winml/lib/Common/inc/iengine.h
index 0a944315a1dc4..1aa857383a3b5 100644
--- a/winml/lib/Common/inc/iengine.h
+++ b/winml/lib/Common/inc/iengine.h
@@ -21,7 +21,7 @@ IValue : IUnknown {
   (bool* out) PURE;
 
   STDMETHOD(GetResource)
-  (uint64_t size_in_bytes, _winml::Resource & resource, uint64_t& offset) PURE;
+  (_winml::Resource & resource) PURE;
 
   STDMETHOD(IsTensor)
   (bool* out) PURE;
diff --git a/winml/test/common/SqueezeNetValidator.cpp b/winml/test/common/SqueezeNetValidator.cpp
index d0c43c1c9775e..2a6b3843c423c 100644
--- a/winml/test/common/SqueezeNetValidator.cpp
+++ b/winml/test/common/SqueezeNetValidator.cpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #include "SqueezeNetValidator.h"
 #include "protobufHelpers.h"
@@ -104,7 +104,7 @@ void ModelValidator::FnsCandy16(
   float dataTolerance
 ) {
   ORT_UNUSED_PARAMETER(dataTolerance);
-  // file name strings
+    // file name strings
   static wchar_t* modelFileName = L"winmlperf_coreml_FNS-Candy_prerelease_fp16.onnx";
   static wchar_t* inputDataImageFileName = L"fish_720.png";
   static wchar_t* outputDataFileName = L"output.png";
@@ -115,7 +115,7 @@ void ModelValidator::FnsCandy16(
   auto fullModelPath = modulePath + modelFileName;
   auto outputFileName = modulePath + outputDataFileName;
 
-  // WinML model creation
+    // WinML model creation
   LearningModel model = nullptr;
   model = LearningModel::LoadFromFilePath(fullModelPath);
 
@@ -126,7 +126,7 @@ void ModelValidator::FnsCandy16(
   auto fullImagePath = modulePath + inputDataImageFileName;
   BindImage(modelBinding, inputBindingName, fullImagePath.c_str(), bindInputsAsIInspectable);
 
-  // create the tensor for the actual output
+    // create the tensor for the actual output
   auto output = model.OutputFeatures().First().Current();
   if (output.Kind() != LearningModelFeatureKind::Tensor) {
     throw winrt::hresult_invalid_argument(L"Model output kind is not type Tensor");
@@ -135,16 +135,16 @@ void ModelValidator::FnsCandy16(
   auto shape = winrt::single_threaded_vector(std::vector<int64_t>{1, 1});
   auto outputTensor = BindImageOutput(outputBindingStrategy, modelBinding, outputDataBindingName);
 
-  // Evaluate the model
+    // Evaluate the model
   std::cout << "Calling EvaluateSync on instance" << instance << "\n";
   LearningModelEvaluationResult result = nullptr;
   result = modelSession.Evaluate(modelBinding, {});
 
-  // Get results
+    // Get results
   if (outputBindingStrategy == OutputBindingStrategy::Unbound) {
-    // When output binding strategy is unbound, the output tensor was not set on bind.
-    // Therefore, we need to retrieve it from the LearnignModelEvaluationResult
-    // TODO: is this right? outputTensorT is unused...
+        // When output binding strategy is unbound, the output tensor was not set on bind.
+        // Therefore, we need to retrieve it from the LearningModelEvaluationResult
+        // TODO: is this right? outputTensorT is unused...
     /*auto outputTensorT = */ result.Outputs().Lookup(outputDataBindingName).as<TensorFloat16Bit>();
   } else {
     if (result.Outputs().Lookup(outputDataBindingName) != outputTensor) {
@@ -171,7 +171,7 @@ void ModelValidator::SqueezeNet(
   OutputBindingStrategy outputBindingStrategy,
   bool bindInputsAsIInspectable
 ) {
-  // file name strings
+    // file name strings
   static wchar_t* modelFileName = L"model.onnx";
   static wchar_t* inputDataFileName = L"test_data_0_input.pb";
   static wchar_t* outputDataFileName = L"test_data_0_output.pb";
@@ -183,7 +183,7 @@ void ModelValidator::SqueezeNet(
   auto fullModelPath = modulePath + modelFileName;
   auto outputFileName = modulePath + outputDataFileName;
 
-  // WinML model creation
+        // WinML model creation
   LearningModel model = nullptr;
   model = LearningModel::LoadFromFilePath(fullModelPath);
 
@@ -201,13 +201,13 @@ void ModelValidator::SqueezeNet(
     BindTensor(modelBinding, inputBindingName, inputTensor, bindInputsAsIInspectable);
   }
 
-  // load up the expected output
+    // load up the expected output
   auto expectedResultsTensor = ProtobufHelpers::LoadTensorFromProtobufFile(outputFileName, false);
   if (expectedResultsTensor == nullptr) {
     throw winrt::hresult_invalid_argument(L"Expected Results from protobuf file are null.");
   }
 
-  // create the tensor for the actual output
+    // create the tensor for the actual output
   auto output = model.OutputFeatures().First().Current();
   if (output.Kind() != LearningModelFeatureKind::Tensor) {
     throw winrt::hresult_invalid_argument(L"Expected output feature kind of model to be Tensor");
@@ -216,15 +216,15 @@ void ModelValidator::SqueezeNet(
   auto outputTensor =
     BindOutput<TensorFloat>(outputBindingStrategy, modelBinding, outputDataBindingName, expectedResultsTensor.Shape());
 
-  // Evaluate the model
+    // Evaluate the model
   std::cout << "Calling EvaluateSync on instance " << instance << "\n";
   LearningModelEvaluationResult result = nullptr;
   result = modelSession.Evaluate(modelBinding, {});
 
-  // Get results
+    // Get results
   if (outputBindingStrategy == OutputBindingStrategy::Unbound) {
-    // When output binding strategy is unbound, the output tensor was not set on bind.
-    // Therefore, we need to retrieve it from the LearnignModelEvaluationResult
+        // When output binding strategy is unbound, the output tensor was not set on bind.
+        // Therefore, we need to retrieve it from the LearningModelEvaluationResult
     outputTensor = result.Outputs().Lookup(outputDataBindingName).as<ITensor>();
   } else {
     if (result.Outputs().Lookup(outputDataBindingName) != outputTensor) {
diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
index 3a0a91fb7e220..9b389d014c953 100644
--- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
+++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
@@ -1140,7 +1140,6 @@ static void MsftQuantizedModels() {
   // load a model
   std::wstring filePath = FileHelpers::GetModulePath() + L"coreml_Resnet50_ImageNet-dq.onnx";
   LearningModel model = LearningModel::LoadFromFilePath(filePath);
-
   LearningModelSession session(model, LearningModelDevice(LearningModelDeviceKind::DirectX));
   // create a binding set
   LearningModelBinding binding(session);

From 1a0eaa663e636ff45c518cfe90ca75efa83323e6 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Mon, 7 Aug 2023 19:33:47 -0700
Subject: [PATCH 70/76] WIP

---
 winml/lib/Api/ImageFeatureValue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/winml/lib/Api/ImageFeatureValue.h b/winml/lib/Api/ImageFeatureValue.h
index 92f3cab43b432..83a21c8679cf3 100644
--- a/winml/lib/Api/ImageFeatureValue.h
+++ b/winml/lib/Api/ImageFeatureValue.h
@@ -1,4 +1,4 @@
-﻿﻿// Copyright (c) Microsoft Corporation. All rights reserved.
+﻿// Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
 #pragma once

From f98f2af797623a12686295b17a2e82331dc16ffb Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 8 Aug 2023 01:18:23 -0700
Subject: [PATCH 71/76] WIP

---
 .../onnxruntime/core/framework/execution_provider.h    |  7 +++++++
 .../dml/DmlExecutionProvider/src/ExecutionProvider.h   | 10 ++++++++++
 winml/adapter/winml_adapter_execution_provider.cpp     |  6 ++++--
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
index ea4f52f99649d..1a7e77cddee28 100644
--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@@ -320,6 +320,13 @@ class IExecutionProvider {
     return default_device_;
   };
 
+  /**
+   * Return the appropriate OrtDevice object given OrtMemType that can be used directly by external callers.
+   */
+  virtual OrtDevice GetExternalOrtDeviceByMemType(OrtMemType mem_type) const {
+    return GetOrtDeviceByMemType(mem_type);
+  };
+
   /**
    * Create Preferred allocators for the current Execution Provider
    * This function is a stateless function which creates new instances of Allocator, without storing them in EP.
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index 74f56acb345ed..6ee1efc5df556 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -314,6 +314,16 @@ namespace Dml
             return m_impl->CreatePreferredAllocators();
         }
 
+        virtual OrtDevice GetExternalOrtDeviceByMemType(OrtMemType mem_type) const final
+        {
+            if (mem_type == OrtMemType::OrtMemTypeDefault)
+            {
+                return OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DML_EXTERNAL, 0);
+            }
+
+            return GetOrtDeviceByMemType(mem_type);
+        }
+
     private:
         ComPtr<ExecutionProviderImpl> m_impl;
     };
diff --git a/winml/adapter/winml_adapter_execution_provider.cpp b/winml/adapter/winml_adapter_execution_provider.cpp
index 52dbf9710abc7..0d3ae2f0d5ac4 100644
--- a/winml/adapter/winml_adapter_execution_provider.cpp
+++ b/winml/adapter/winml_adapter_execution_provider.cpp
@@ -51,7 +51,9 @@ ORT_API_STATUS_IMPL(
   auto inference_session = reinterpret_cast<::onnxruntime::InferenceSession*>(session);
   const auto execution_provider = reinterpret_cast<onnxruntime::IExecutionProvider*>(provider);
   OrtMemoryInfo mem_info(
-    "", OrtAllocatorType::OrtDeviceAllocator, execution_provider->GetOrtDeviceByMemType(::OrtMemType::OrtMemTypeDefault)
+    "",
+    OrtAllocatorType::OrtDeviceAllocator,
+    execution_provider->GetExternalOrtDeviceByMemType(::OrtMemType::OrtMemTypeDefault)
   );
   auto allocator_ptr = inference_session->GetAllocator(mem_info);
   *allocator = new (std::nothrow) OrtAllocatorWrapper(allocator_ptr);
@@ -66,7 +68,7 @@ ORT_API_STATUS_IMPL(winmla::GetProviderMemoryInfo, _In_ OrtExecutionProvider* pr
   API_IMPL_BEGIN
   const auto execution_provider = reinterpret_cast<onnxruntime::IExecutionProvider*>(provider);
 
-  auto device = execution_provider->GetOrtDeviceByMemType(::OrtMemType::OrtMemTypeDefault);
+  auto device = execution_provider->GetExternalOrtDeviceByMemType(::OrtMemType::OrtMemTypeDefault);
   *memory_info = new (std::nothrow) OrtMemoryInfo("", ::OrtAllocatorType::OrtDeviceAllocator, device);
   if (*memory_info == nullptr) {
     return OrtApis::CreateStatus(ORT_FAIL, "Out of memory");

From c54b29547874f3d3fd747fcc8d082472e77610b7 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 8 Aug 2023 10:53:02 -0700
Subject: [PATCH 72/76] Address PR comments

---
 .../src/ExecutionProvider.cpp                 |  32 +--
 .../src/Operators/DmlOperatorCopy.cpp         |   7 -
 .../DmlExecutionProvider/src/ReadbackHeap.cpp |  11 +-
 .../DmlExecutionProvider/src/ReadbackHeap.h   |   3 +-
 winml/lib/Common/inc/iengine.h                |   2 +-
 winml/test/adapter/AdapterDmlEpTest.cpp       | 201 ++++++++++--------
 winml/test/adapter/AdapterSessionTest.cpp     |  16 +-
 7 files changed, 139 insertions(+), 133 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 5c5f8ebf2c3d1..3fdf031cbc0c2 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -515,8 +515,7 @@ namespace Dml
         ORT_THROW_HR_IF(E_INVALIDARG, dst.size() != src.size());
 
         // Source and destination for batched GPU -> CPU copies
-        std::vector<ID3D12Resource*> srcDatas;
-        std::vector<uint64_t> srcOffsets;
+        std::vector<D3D12BufferRegion> srcBufferRegions;
         std::vector<void*> dstDatas;
         std::vector<uint32_t> dataSizesInBytes;
 
@@ -545,19 +544,12 @@ namespace Dml
             ORT_THROW_HR_IF(E_INVALIDARG, dataSizesInBytes.back() != ComputeByteSizeFromTensor(*src[i])); // Tensors must be the same size
 
             dstDatas.push_back(dst[i]->GetData());
-
-            auto srcBufferRegion = GetBufferForTensor(src[i]);
-
-            ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource();
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-
-            srcDatas.push_back(srcData);
-            srcOffsets.push_back(srcBufferRegion.Offset());
+            srcBufferRegions.push_back(GetBufferForTensor(src[i]));
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
         const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcBufferRegions, srcState);
 
         return S_OK;
         }
@@ -924,11 +916,8 @@ namespace Dml
     Status ExecutionProviderImpl::CopyTensors(const std::vector<onnxruntime::IDataTransfer::SrcDstPair>& src_dst_pairs) const
     {
         // Source and destination for batched GPU -> CPU copies
-        std::vector<ID3D12Resource*> srcDatas;
-        srcDatas.reserve(src_dst_pairs.size());
-
-        std::vector<uint64_t> srcOffsets;
-        srcOffsets.reserve(src_dst_pairs.size());
+        std::vector<D3D12BufferRegion> srcBufferRegions;
+        srcBufferRegions.reserve(src_dst_pairs.size());
 
         std::vector<void*> dstDatas;
         dstDatas.reserve(src_dst_pairs.size());
@@ -973,19 +962,12 @@ namespace Dml
             ORT_THROW_HR_IF(E_INVALIDARG, dataSizesInBytes[i] != ComputeByteSizeFromTensor(srcWrapper)); // Tensors must be the same size
 
             dstDatas.push_back(dstWrapper.GetData());
-
-            auto srcBufferRegion = GetBufferForTensor(&srcWrapper);
-
-            ID3D12Resource* srcData = srcBufferRegion.GetD3D12Resource();
-            const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-
-            srcDatas.push_back(srcData);
-            srcOffsets.push_back(srcBufferRegion.Offset());
+            srcBufferRegions.push_back(GetBufferForTensor(&srcWrapper));
         }
 
         // Performs a blocking call to synchronize and read back data from the GPU into the destination buffer
         const auto srcState = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcDatas, srcOffsets, srcState);
+        m_readbackHeap->ReadbackFromGpu(dstDatas, dataSizesInBytes, srcBufferRegions, srcState);
 
         return onnxruntime::common::Status::OK();
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
index 8fa3c74674776..96fec218ed87e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorCopy.cpp
@@ -67,13 +67,6 @@ class DmlOperatorCopy : public DmlOperator
                 inputTensor.GetInterface().Get()));
         }
     }
-
-private:
-    // Aliasing means that both the input and the output start at the same exact offset in the same buffer
-    bool m_aliasing = false;
-
-    // The choice of using Identity or a copy depends on whether the input and the input are located in the same buffer
-    bool m_inputSharesOutputBuffer = false;
 };
 
 DML_OP_DEFINE_CREATION_FUNCTION(Copy, DmlOperatorCopy);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
index 5bb04ba4d30b5..268ad9a2b7a86 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.cpp
@@ -104,12 +104,11 @@ namespace Dml
     void ReadbackHeap::ReadbackFromGpu(
         gsl::span<void*> dst,
         gsl::span<const uint32_t > dstSizes,
-        gsl::span<ID3D12Resource*> src,
-        gsl::span<uint64_t> srcOffsets,
+        gsl::span<const D3D12BufferRegion> srcBufferRegions,
         D3D12_RESOURCE_STATES srcState)
     {
-        assert(dst.size() == src.size());
-        assert(dstSizes.size() == src.size());
+        assert(dst.size() == srcBufferRegions.size());
+        assert(dstSizes.size() == srcBufferRegions.size());
 
         if (dst.empty())
         {
@@ -132,8 +131,8 @@ namespace Dml
                 m_readbackHeap.Get(),
                 offset,
                 D3D12_RESOURCE_STATE_COPY_DEST,
-                src[i],
-                srcOffsets[i],
+                srcBufferRegions[i].GetD3D12Resource(),
+                srcBufferRegions[i].Offset(),
                 srcState,
                 dstSizes[i]);
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
index 4a65ce899d791..bbc46cd0e0cb9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h
@@ -26,8 +26,7 @@ namespace Dml
         void ReadbackFromGpu(
             gsl::span<void*> dst,
             gsl::span<const uint32_t > dstSizes,
-            gsl::span<ID3D12Resource*> src,
-            gsl::span<uint64_t> srcOffsets,
+            gsl::span<const D3D12BufferRegion> srcBufferRegions,
             D3D12_RESOURCE_STATES srcState);
 
     private:
diff --git a/winml/lib/Common/inc/iengine.h b/winml/lib/Common/inc/iengine.h
index 1aa857383a3b5..4451382114905 100644
--- a/winml/lib/Common/inc/iengine.h
+++ b/winml/lib/Common/inc/iengine.h
@@ -12,7 +12,7 @@ interface IEngineFactory;
 using Resource = std::unique_ptr<void, std::function<void(void*)>>;
 
 // clang-format off
-MIDL_INTERFACE("31f39226-cfe8-4758-af38-3d01b2a33ee1")
+MIDL_INTERFACE("8ac0b6b9-4561-492b-b63d-a07bdd8292c6")
 IValue : IUnknown {
   STDMETHOD(IsEmpty)
   (bool* out) PURE;
diff --git a/winml/test/adapter/AdapterDmlEpTest.cpp b/winml/test/adapter/AdapterDmlEpTest.cpp
index 2b701d51aa73b..d8d5c708f3fb1 100644
--- a/winml/test/adapter/AdapterDmlEpTest.cpp
+++ b/winml/test/adapter/AdapterDmlEpTest.cpp
@@ -65,7 +65,7 @@ UniqueOrtSession CreateUniqueOrtSession(
   return UniqueOrtSession(session, ort_api->ReleaseSession);
 }
 
-UniqueOrtSession CreateDmlSession() {
+UniqueOrtSession CreateDmlSession(bool bfc_allocator_enabled) {
   const auto session_options = CreateUniqueOrtSessionOptions();
   THROW_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api);
 
@@ -79,9 +79,10 @@ UniqueOrtSession CreateDmlSession() {
   command_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
   WINML_EXPECT_HRESULT_SUCCEEDED(device->CreateCommandQueue(&command_queue_desc, IID_PPV_ARGS(queue.put())));
 
+  constexpr bool metacommands_enabled = false;
   THROW_IF_NOT_OK_MSG(
     winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(
-      session_options.get(), device.get(), queue.get(), false, true
+      session_options.get(), device.get(), queue.get(), metacommands_enabled, bfc_allocator_enabled
     ),
     ort_api
   );
@@ -95,26 +96,35 @@ UniqueOrtSession CreateCpuSession() {
 
 void DmlExecutionProviderSetDefaultRoundingMode() {
   GPUTEST;
-  auto session = CreateDmlSession();
-  OrtExecutionProvider* ort_provider;
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderSetDefaultRoundingMode(ort_provider, false), ort_api);
+  for (bool bfc_allocator_enabled : {false, true})
+  {
+    auto session = CreateDmlSession(bfc_allocator_enabled);
+    OrtExecutionProvider* ort_provider;
+    THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
+    THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderSetDefaultRoundingMode(ort_provider, false), ort_api);
+  }
 }
 
 void DmlExecutionProviderFlushContext() {
   GPUTEST;
-  auto session = CreateDmlSession();
-  OrtExecutionProvider* ort_provider;
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderFlushContext(ort_provider), ort_api);
+  for (bool bfc_allocator_enabled : {false, true})
+  {
+    auto session = CreateDmlSession(bfc_allocator_enabled);
+    OrtExecutionProvider* ort_provider;
+    THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
+    THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderFlushContext(ort_provider), ort_api);
+  }
 }
 
 void DmlExecutionProviderReleaseCompletedReferences() {
   GPUTEST;
-  auto session = CreateDmlSession();
-  OrtExecutionProvider* ort_provider;
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderReleaseCompletedReferences(ort_provider), ort_api);
+  for (bool bfc_allocator_enabled : {false, true})
+  {
+    auto session = CreateDmlSession(bfc_allocator_enabled);
+    OrtExecutionProvider* ort_provider;
+    THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
+    THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderReleaseCompletedReferences(ort_provider), ort_api);
+  }
 }
 
 constexpr std::array<int64_t, 4> dimensions{1, 3, 720, 720};
@@ -175,29 +185,32 @@ void DmlGetD3D12ResourceFromAllocation() {
   void* gpu_allocation;
   THROW_IF_NOT_OK_MSG(ort_dml_api->CreateGPUAllocationFromD3DResource(d3d12_resource.get(), &gpu_allocation), ort_api);
 
-  auto session = CreateDmlSession();
-
-  OrtMemoryInfo* ort_memory_info;
-  THROW_IF_NOT_OK_MSG(
-    ort_api->CreateMemoryInfo(
-      "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info
-    ),
-    ort_api
-  );
-
-  OrtAllocator* ort_allocator;
-  THROW_IF_NOT_OK_MSG(ort_api->CreateAllocator(session.get(), ort_memory_info, &ort_allocator), ort_api);
-  auto allocator = UniqueOrtAllocator(ort_allocator, ort_api->ReleaseAllocator);
-
-  winrt::com_ptr<ID3D12Resource> d3d12_resource_from_allocation;
-  THROW_IF_NOT_OK_MSG(
-    ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), gpu_allocation, d3d12_resource_from_allocation.put()),
-    ort_api
-  );
-   // Ensure resource is the same
-  WINML_EXPECT_EQUAL(d3d12_resource, d3d12_resource_from_allocation);
-
-  THROW_IF_NOT_OK_MSG(ort_dml_api->FreeGPUAllocation(gpu_allocation), ort_api);
+  for (bool bfc_allocator_enabled : {false, true})
+  {
+    auto session = CreateDmlSession(bfc_allocator_enabled);
+
+    OrtMemoryInfo* ort_memory_info;
+    THROW_IF_NOT_OK_MSG(
+      ort_api->CreateMemoryInfo(
+        "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info
+      ),
+      ort_api
+    );
+
+    OrtAllocator* ort_allocator;
+    THROW_IF_NOT_OK_MSG(ort_api->CreateAllocator(session.get(), ort_memory_info, &ort_allocator), ort_api);
+    auto allocator = UniqueOrtAllocator(ort_allocator, ort_api->ReleaseAllocator);
+
+    winrt::com_ptr<ID3D12Resource> d3d12_resource_from_allocation;
+    THROW_IF_NOT_OK_MSG(
+      ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), gpu_allocation, d3d12_resource_from_allocation.put()),
+      ort_api
+    );
+    // Ensure resource is the same
+    WINML_EXPECT_EQUAL(d3d12_resource, d3d12_resource_from_allocation);
+
+    THROW_IF_NOT_OK_MSG(ort_dml_api->FreeGPUAllocation(gpu_allocation), ort_api);
+  }
 }
 
 UniqueOrtValue CreateTensorFromMemoryInfo(const OrtMemoryInfo* memory_info) {
@@ -219,28 +232,34 @@ UniqueOrtValue CreateTensorFromMemoryInfo(const OrtMemoryInfo* memory_info) {
 
 void GetTensorMemoryInfo() {
   GPUTEST;
-  auto session = CreateDmlSession();
-
-  OrtMemoryInfo* ort_memory_info;
-  THROW_IF_NOT_OK_MSG(
-    ort_api->CreateMemoryInfo(
-      "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info
-    ),
-    ort_api
-  );
-  auto tensor = CreateTensorFromMemoryInfo(ort_memory_info);
-
-  const OrtMemoryInfo* value_memory_info;
-  THROW_IF_NOT_OK_MSG(ort_api->GetTensorMemoryInfo(tensor.get(), &value_memory_info), ort_api);
-  CreateTensorFromMemoryInfo(value_memory_info);
+  for (bool bfc_allocator_enabled : {false, true})
+  {
+    auto session = CreateDmlSession(bfc_allocator_enabled);
+
+    OrtMemoryInfo* ort_memory_info;
+    THROW_IF_NOT_OK_MSG(
+      ort_api->CreateMemoryInfo(
+        "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info
+      ),
+      ort_api
+    );
+    auto tensor = CreateTensorFromMemoryInfo(ort_memory_info);
+
+    const OrtMemoryInfo* value_memory_info;
+    THROW_IF_NOT_OK_MSG(ort_api->GetTensorMemoryInfo(tensor.get(), &value_memory_info), ort_api);
+    CreateTensorFromMemoryInfo(value_memory_info);
+  }
 }
 
 void ExecutionProviderSync() {
   GPUTEST;
-  auto session = CreateDmlSession();
-  OrtExecutionProvider* ort_provider;
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->ExecutionProviderSync(ort_provider), ort_api);
+  for (bool bfc_allocator_enabled : {false, true})
+  {
+    auto session = CreateDmlSession(bfc_allocator_enabled);
+    OrtExecutionProvider* ort_provider;
+    THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
+    THROW_IF_NOT_OK_MSG(winml_adapter_api->ExecutionProviderSync(ort_provider), ort_api);
+  }
 }
 
 void DmlCopyTensor() {
@@ -258,9 +277,11 @@ void DmlCopyTensor() {
   command_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
   WINML_EXPECT_HRESULT_SUCCEEDED(device->CreateCommandQueue(&command_queue_desc, IID_PPV_ARGS(queue.put())));
 
+  constexpr bool metacommands_enabled = false;
+  constexpr bool bfc_allocator_enabled = true;
   THROW_IF_NOT_OK_MSG(
     winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(
-      session_options.get(), device.get(), queue.get(), false, true
+      session_options.get(), device.get(), queue.get(), metacommands_enabled, bfc_allocator_enabled
     ),
     ort_api
   );
@@ -322,41 +343,47 @@ void CreateCustomRegistry() {
 
 void ValueGetDeviceId() {
   GPUTEST;
-  auto session = CreateDmlSession();
-
-  OrtMemoryInfo* ort_memory_info;
-  THROW_IF_NOT_OK_MSG(
-    ort_api->CreateMemoryInfo(
-      "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info
-    ),
-    ort_api
-  );
-  auto gpu_tensor = CreateTensorFromMemoryInfo(ort_memory_info);
-
-  int16_t device_id;
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->ValueGetDeviceId(gpu_tensor.get(), &device_id), ort_api);
-
-  OrtMemoryInfo* cpu_memory_info;
-  THROW_IF_NOT_OK_MSG(ort_api->CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpu_memory_info), ort_api);
-  auto unique_cpu_memory_info = UniqueOrtMemoryInfo(cpu_memory_info, ort_api->ReleaseMemoryInfo);
-  auto cpu_tensor = CreateTensorFromMemoryInfo(unique_cpu_memory_info.get());
-  THROW_IF_NOT_OK_MSG(winml_adapter_api->ValueGetDeviceId(cpu_tensor.get(), &device_id), ort_api);
-  WINML_EXPECT_EQUAL(0, device_id);
+  for (bool bfc_allocator_enabled : {false, true})
+  {
+    auto session = CreateDmlSession(bfc_allocator_enabled);
+
+    OrtMemoryInfo* ort_memory_info;
+    THROW_IF_NOT_OK_MSG(
+      ort_api->CreateMemoryInfo(
+        "DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault, &ort_memory_info
+      ),
+      ort_api
+    );
+    auto gpu_tensor = CreateTensorFromMemoryInfo(ort_memory_info);
+
+    int16_t device_id;
+    THROW_IF_NOT_OK_MSG(winml_adapter_api->ValueGetDeviceId(gpu_tensor.get(), &device_id), ort_api);
+
+    OrtMemoryInfo* cpu_memory_info;
+    THROW_IF_NOT_OK_MSG(ort_api->CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpu_memory_info), ort_api);
+    auto unique_cpu_memory_info = UniqueOrtMemoryInfo(cpu_memory_info, ort_api->ReleaseMemoryInfo);
+    auto cpu_tensor = CreateTensorFromMemoryInfo(unique_cpu_memory_info.get());
+    THROW_IF_NOT_OK_MSG(winml_adapter_api->ValueGetDeviceId(cpu_tensor.get(), &device_id), ort_api);
+    WINML_EXPECT_EQUAL(0, device_id);
+  }
 }
 
 void SessionGetInputRequiredDeviceId() {
   GPUTEST;
-  auto session = CreateDmlSession();
-  int16_t device_id;
-  THROW_IF_NOT_OK_MSG(
-    winml_adapter_api->SessionGetInputRequiredDeviceId(session.get(), "inputImage", &device_id), ort_api
-  );
-
-  auto cpu_session = CreateCpuSession();
-  THROW_IF_NOT_OK_MSG(
-    winml_adapter_api->SessionGetInputRequiredDeviceId(cpu_session.get(), "inputImage", &device_id), ort_api
-  );
-  WINML_EXPECT_EQUAL(0, device_id);
+  for (bool bfc_allocator_enabled : {false, true})
+  {
+    auto session = CreateDmlSession(bfc_allocator_enabled);
+    int16_t device_id;
+    THROW_IF_NOT_OK_MSG(
+      winml_adapter_api->SessionGetInputRequiredDeviceId(session.get(), "inputImage", &device_id), ort_api
+    );
+
+    auto cpu_session = CreateCpuSession();
+    THROW_IF_NOT_OK_MSG(
+      winml_adapter_api->SessionGetInputRequiredDeviceId(cpu_session.get(), "inputImage", &device_id), ort_api
+    );
+    WINML_EXPECT_EQUAL(0, device_id);
+  }
 }
 }// namespace
 
diff --git a/winml/test/adapter/AdapterSessionTest.cpp b/winml/test/adapter/AdapterSessionTest.cpp
index eb62c30fdeb8e..aaeb8a0b711d0 100644
--- a/winml/test/adapter/AdapterSessionTest.cpp
+++ b/winml/test/adapter/AdapterSessionTest.cpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #include "testPch.h"
 
@@ -103,9 +103,11 @@ void AppendExecutionProvider_DML() {
 
   const auto device = CreateD3DDevice();
   const auto queue = CreateD3DQueue(device.get());
+  constexpr bool metacommands_enabled = true;
+  constexpr bool bfc_allocator_enabled = true;
   THROW_IF_NOT_OK_MSG(
     winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(
-      session_options.get(), device.get(), queue.get(), true, true
+      session_options.get(), device.get(), queue.get(), metacommands_enabled, bfc_allocator_enabled
     ),
     ort_api
   );
@@ -130,9 +132,11 @@ void GetExecutionProvider_DML() {
   THROW_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api);
   const auto device = CreateD3DDevice();
   const auto queue = CreateD3DQueue(device.get());
+  constexpr bool metacommands_enabled = true;
+  constexpr bool bfc_allocator_enabled = true;
   THROW_IF_NOT_OK_MSG(
     winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(
-      session_options.get(), device.get(), queue.get(), true, true
+      session_options.get(), device.get(), queue.get(), metacommands_enabled, bfc_allocator_enabled
     ),
     ort_api
   );
@@ -142,7 +146,7 @@ void GetExecutionProvider_DML() {
 
   OrtExecutionProvider* ort_provider;
   THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
-   // Test if DML EP method can be called
+  // Test if DML EP method can be called
   THROW_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderFlushContext(ort_provider), ort_api);
 }
 
@@ -290,9 +294,11 @@ void CopyInputAcrossDevices_DML() {
   THROW_IF_NOT_OK_MSG(ort_api->DisableMemPattern(session_options.get()), ort_api);
   const auto device = CreateD3DDevice();
   const auto queue = CreateD3DQueue(device.get());
+  constexpr bool metacommands_enabled = true;
+  constexpr bool bfc_allocator_enabled = true;
   THROW_IF_NOT_OK_MSG(
     winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_DML(
-      session_options.get(), device.get(), queue.get(), true, true
+      session_options.get(), device.get(), queue.get(), metacommands_enabled, bfc_allocator_enabled
     ),
     ort_api
   );

From 26b4e7e81cc61c073340be0a83d4a7348d6c5911 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 8 Aug 2023 17:21:32 -0700
Subject: [PATCH 73/76] Move allocation free outside of loop

---
 winml/test/adapter/AdapterDmlEpTest.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/winml/test/adapter/AdapterDmlEpTest.cpp b/winml/test/adapter/AdapterDmlEpTest.cpp
index d8d5c708f3fb1..3b6888c3db576 100644
--- a/winml/test/adapter/AdapterDmlEpTest.cpp
+++ b/winml/test/adapter/AdapterDmlEpTest.cpp
@@ -208,9 +208,9 @@ void DmlGetD3D12ResourceFromAllocation() {
     );
     // Ensure resource is the same
     WINML_EXPECT_EQUAL(d3d12_resource, d3d12_resource_from_allocation);
-
-    THROW_IF_NOT_OK_MSG(ort_dml_api->FreeGPUAllocation(gpu_allocation), ort_api);
   }
+
+  THROW_IF_NOT_OK_MSG(ort_dml_api->FreeGPUAllocation(gpu_allocation), ort_api);
 }
 
 UniqueOrtValue CreateTensorFromMemoryInfo(const OrtMemoryInfo* memory_info) {

From 163fe5b38d100ae490851d95a5bdf55d2253adfa Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 9 Aug 2023 11:31:21 -0700
Subject: [PATCH 74/76] Fix linting errors

---
 winml/test/adapter/AdapterDmlEpTest.cpp | 36 +++++++++++--------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/winml/test/adapter/AdapterDmlEpTest.cpp b/winml/test/adapter/AdapterDmlEpTest.cpp
index 3b6888c3db576..6903e9f1eaca8 100644
--- a/winml/test/adapter/AdapterDmlEpTest.cpp
+++ b/winml/test/adapter/AdapterDmlEpTest.cpp
@@ -1,5 +1,5 @@
-// // Copyright (c) Microsoft Corporation. All rights reserved.
- // // Licensed under the MIT License.
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
 
 #include "testPch.h"
 
@@ -96,8 +96,7 @@ UniqueOrtSession CreateCpuSession() {
 
 void DmlExecutionProviderSetDefaultRoundingMode() {
   GPUTEST;
-  for (bool bfc_allocator_enabled : {false, true})
-  {
+  for (bool bfc_allocator_enabled : {false, true}) {
     auto session = CreateDmlSession(bfc_allocator_enabled);
     OrtExecutionProvider* ort_provider;
     THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
@@ -107,8 +106,7 @@ void DmlExecutionProviderSetDefaultRoundingMode() {
 
 void DmlExecutionProviderFlushContext() {
   GPUTEST;
-  for (bool bfc_allocator_enabled : {false, true})
-  {
+  for (bool bfc_allocator_enabled : {false, true}) {
     auto session = CreateDmlSession(bfc_allocator_enabled);
     OrtExecutionProvider* ort_provider;
     THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
@@ -118,8 +116,7 @@ void DmlExecutionProviderFlushContext() {
 
 void DmlExecutionProviderReleaseCompletedReferences() {
   GPUTEST;
-  for (bool bfc_allocator_enabled : {false, true})
-  {
+  for (bool bfc_allocator_enabled : {false, true}) {
     auto session = CreateDmlSession(bfc_allocator_enabled);
     OrtExecutionProvider* ort_provider;
     THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
@@ -185,8 +182,7 @@ void DmlGetD3D12ResourceFromAllocation() {
   void* gpu_allocation;
   THROW_IF_NOT_OK_MSG(ort_dml_api->CreateGPUAllocationFromD3DResource(d3d12_resource.get(), &gpu_allocation), ort_api);
 
-  for (bool bfc_allocator_enabled : {false, true})
-  {
+  for (bool bfc_allocator_enabled : {false, true}) {
     auto session = CreateDmlSession(bfc_allocator_enabled);
 
     OrtMemoryInfo* ort_memory_info;
@@ -203,7 +199,9 @@ void DmlGetD3D12ResourceFromAllocation() {
 
     winrt::com_ptr<ID3D12Resource> d3d12_resource_from_allocation;
     THROW_IF_NOT_OK_MSG(
-      ort_dml_api->GetD3D12ResourceFromAllocation(allocator.get(), gpu_allocation, d3d12_resource_from_allocation.put()),
+      ort_dml_api->GetD3D12ResourceFromAllocation(
+        allocator.get(), gpu_allocation, d3d12_resource_from_allocation.put()
+      ),
       ort_api
     );
     // Ensure resource is the same
@@ -232,8 +230,7 @@ UniqueOrtValue CreateTensorFromMemoryInfo(const OrtMemoryInfo* memory_info) {
 
 void GetTensorMemoryInfo() {
   GPUTEST;
-  for (bool bfc_allocator_enabled : {false, true})
-  {
+  for (bool bfc_allocator_enabled : {false, true}) {
     auto session = CreateDmlSession(bfc_allocator_enabled);
 
     OrtMemoryInfo* ort_memory_info;
@@ -253,8 +250,7 @@ void GetTensorMemoryInfo() {
 
 void ExecutionProviderSync() {
   GPUTEST;
-  for (bool bfc_allocator_enabled : {false, true})
-  {
+  for (bool bfc_allocator_enabled : {false, true}) {
     auto session = CreateDmlSession(bfc_allocator_enabled);
     OrtExecutionProvider* ort_provider;
     THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &ort_provider), ort_api);
@@ -290,7 +286,7 @@ void DmlCopyTensor() {
   OrtExecutionProvider* dml_provider;
   THROW_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session.get(), 0, &dml_provider), ort_api);
 
-    // CPU to CPU is not supported
+  // CPU to CPU is not supported
   OrtMemoryInfo* cpu_memory_info;
   THROW_IF_NOT_OK_MSG(ort_api->CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpu_memory_info), ort_api);
   auto cpu_tensor = CreateTensorFromMemoryInfo(cpu_memory_info);
@@ -299,7 +295,7 @@ void DmlCopyTensor() {
     nullptr, winml_adapter_api->DmlCopyTensor(dml_provider, cpu_tensor.get(), dst_cpu_tensor.get())
   );
 
-    // GPU to CPU
+  // GPU to CPU
   OrtMemoryInfo* ort_memory_info;
   THROW_IF_NOT_OK_MSG(
     ort_api->CreateMemoryInfo(
@@ -343,8 +339,7 @@ void CreateCustomRegistry() {
 
 void ValueGetDeviceId() {
   GPUTEST;
-  for (bool bfc_allocator_enabled : {false, true})
-  {
+  for (bool bfc_allocator_enabled : {false, true}) {
     auto session = CreateDmlSession(bfc_allocator_enabled);
 
     OrtMemoryInfo* ort_memory_info;
@@ -370,8 +365,7 @@ void ValueGetDeviceId() {
 
 void SessionGetInputRequiredDeviceId() {
   GPUTEST;
-  for (bool bfc_allocator_enabled : {false, true})
-  {
+  for (bool bfc_allocator_enabled : {false, true}) {
     auto session = CreateDmlSession(bfc_allocator_enabled);
     int16_t device_id;
     THROW_IF_NOT_OK_MSG(

From e6ae0587905801833d855ef855428a2ef19f2292 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Tue, 15 Aug 2023 21:43:15 -0700
Subject: [PATCH 75/76] Address PR comments

---
 onnxruntime/core/framework/bfc_arena.cc        |  3 +--
 .../src/DmlCommandRecorder.cpp                 |  1 -
 ...h => DmlReservedResourceAllocatorWrapper.h} |  4 ++--
 .../src/DmlReservedResourceSubAllocator.h      |  4 ----
 .../src/ExecutionProvider.cpp                  | 18 ++----------------
 .../src/ExecutionProvider.h                    |  3 ---
 .../src/IExecutionProvider.h                   |  2 --
 .../src/MLOperatorAuthorImpl.cpp               |  4 +++-
 .../src/MLOperatorAuthorImpl.h                 |  8 +++-----
 .../MLOperatorAuthorPrivate.h                  |  5 -----
 .../core/providers/dml/dml_provider_factory.cc |  1 -
 11 files changed, 11 insertions(+), 42 deletions(-)
 rename onnxruntime/core/providers/dml/DmlExecutionProvider/src/{DmlBfcAllocator.h => DmlReservedResourceAllocatorWrapper.h} (81%)

diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc
index 99288e6167ca7..9d58bf52de3e6 100644
--- a/onnxruntime/core/framework/bfc_arena.cc
+++ b/onnxruntime/core/framework/bfc_arena.cc
@@ -41,8 +41,7 @@ BFCArena::BFCArena(std::unique_ptr<IAllocator> resource_allocator,
   memory_limit_ = total_memory;
   stats_.bytes_limit = static_cast<int64_t>(total_memory);
 
-  arena_extend_strategy_ = arena_extend_strategy;
-  UpdateFirstAllocationShrinkageLogic();
+  SetArenaExtendStrategy(arena_extend_strategy);
 
   // Create a bunch of bins of various good sizes.
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index 862884c22b08c..0d2e5d1740bcc 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -26,7 +26,6 @@ void DmlCommandRecorder::SetAllocator(std::weak_ptr<DmlGpuAllocator> allocator)
     m_allocator = allocator;
 }
 
-
 void DmlCommandRecorder::InitializeOperator(
     IDMLCompiledOperator* op,
     const DML_BINDING_DESC& persistentResourceBinding,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceAllocatorWrapper.h
similarity index 81%
rename from onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
rename to onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceAllocatorWrapper.h
index d8631c1e9c1d0..e92740e9ce907 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlBfcAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceAllocatorWrapper.h
@@ -8,10 +8,10 @@
 
 namespace Dml
 {
-    class DmlBfcAllocator : public onnxruntime::IAllocator
+    class DmlReservedResourceAllocatorWrapper : public onnxruntime::IAllocator
     {
     public:
-        DmlBfcAllocator(std::shared_ptr<DmlReservedResourceSubAllocator> subAllocator)
+        DmlReservedResourceAllocatorWrapper(std::shared_ptr<DmlReservedResourceSubAllocator> subAllocator)
         : onnxruntime::IAllocator(
             OrtMemoryInfo(
                 onnxruntime::DML,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
index f1c71c6313dac..f705b1d3ca4b8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlReservedResourceSubAllocator.h
@@ -10,10 +10,6 @@
 
 namespace Dml
 {
-    class DmlReservedResourceSubAllocator;
-    class DmlReservedResourceSubAllocator;
-    struct TaggedPointer;
-
     // An allocator that makes logically contiguous allocations backed by D3D heaps.
     //
     // Heaps must fit entirely in either local or non-local memory. Larger heaps
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 3e701832e3ca6..f94a03add7a2f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -22,7 +22,7 @@
 #include "core/framework/bfc_arena.h"
 #include "DmlCommittedResourceWrapper.h"
 #include "DmlBufferRegion.h"
-#include "DmlBfcAllocator.h"
+#include "DmlReservedResourceAllocatorWrapper.h"
 #include "DmlGpuAllocator.h"
 #include "DmlBuffer.h"
 #include "DmlTaggedPointer.h"
@@ -127,18 +127,6 @@ namespace Dml
         return tensorWrapper->GetBufferRegion();
     }
 
-    ID3D12Resource* __stdcall ExecutionProviderImpl::DecodeResource(IMLOperatorTensor* tensor) const noexcept
-    {
-        ORT_TRY
-        {
-            return GetBufferForTensor(tensor).GetD3D12Resource();
-        }
-        ORT_CATCH_GENERIC
-        {
-            return nullptr;
-        }
-    }
-
 // ORT release pipelines agent pools do not have 19H1 SDK installed which defines D3D_FEATURE_LEVEL_1_0_CORE.
 // Once ORT/WinML github project can be built with VS2019, we can update these pools to use install the 19H1 SDK
 // using the command line installer tool with VS2019
@@ -199,10 +187,8 @@ namespace Dml
 
     static std::shared_ptr<onnxruntime::BFCArena> CreateBfcAllocator(std::shared_ptr<DmlReservedResourceSubAllocator> subAllocator)
     {
-        auto device_allocator = std::make_unique<DmlBfcAllocator>(subAllocator);
-
         auto bfcArena = std::make_unique<onnxruntime::BFCArena>(
-            std::move(device_allocator),
+            std::make_unique<DmlReservedResourceAllocatorWrapper>(subAllocator),
             onnxruntime::BFCArena::DEFAULT_MAX_MEM,
             onnxruntime::ArenaExtendStrategy::kSameAsRequested,
             onnxruntime::BFCArena::DEFAULT_INITIAL_CHUNK_SIZE_BYTES,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index ba825a9efa919..fb91a2ce44693 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -30,7 +30,6 @@ namespace Dml
     class PooledUploadHeap;
     class ReadbackHeap;
     class ExecutionContext;
-    class DmlReservedResourceSubAllocator;
     class BucketizedBufferAllocator;
     class DmlCpuAllocator;
     class ExecutionProvider;
@@ -137,8 +136,6 @@ namespace Dml
         // Allocate a resource from pools.  Releasing the returned buffer returns it to the pool.
         DmlBuffer ExecutionProviderImpl::AllocatePooledResource(size_t size, AllocatorRoundingMode roundingMode) const;
 
-        STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept final;
-
         std::shared_ptr<onnxruntime::KernelRegistry> GetKernelRegistry() const
         {
             return m_kernelRegistry;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h
index 967d0cb8e6ed6..8f44694dcf7e6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h
@@ -69,8 +69,6 @@ namespace Dml
         STDMETHOD_(D3D12_COMMAND_LIST_TYPE, GetCommandListTypeForQueue)() const noexcept = 0;
         STDMETHOD_(void, Flush)() const noexcept = 0;
 
-        STDMETHOD_(ID3D12Resource*, DecodeResource)(IMLOperatorTensor* tensor) const noexcept = 0;
-
         STDMETHOD_(bool, IsMcdmDevice)() const noexcept = 0;
         STDMETHOD_(bool, MetacommandsEnabled)() const noexcept = 0;
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 4b749acf4ae33..0ed3cf4005aa6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -557,7 +557,7 @@ namespace Windows::AI::MachineLearning::Adapter
         const AttributeMap* defaultAttributes,
         gsl::span<const uint32_t> requiredConstantCpuInputs,
         MLOperatorTensorGetter& constantInputGetter,
-        onnxruntime::OpKernelContext* kernelContext
+        const onnxruntime::OpKernelContext* kernelContext
         )
     :   OpNodeInfoWrapper(kerneInfo, inputShapeOverrides, defaultAttributes, requiredConstantCpuInputs, constantInputGetter, kernelContext),
         m_inferredOutputShapes(inferredOutputShapes),
@@ -1806,6 +1806,8 @@ namespace Windows::AI::MachineLearning::Adapter
             {
                 m_winmlProvider->GetABIExecutionInterfaceAndInvalidateState(isInternalOperator, m_abiExecutionObject.ReleaseAndGetAddressOf());
             }
+
+            TransitionResourcesForOperatorIfRequired(true);
         }
     }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
index 4f982c80c4c5c..b382a42b39c42 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h
@@ -176,7 +176,7 @@ class OpNodeInfoWrapper : public Base1_t, public Base2_t, public Closable
         const AttributeMap* defaultAttributes,
         gsl::span<const uint32_t> requiredConstantCpuInputs,
         MLOperatorTensorGetter& constantInputGetter,
-        onnxruntime::OpKernelContext* kernelContext = nullptr
+        const onnxruntime::OpKernelContext* kernelContext = nullptr
         )
     :   m_impl(impl),
         m_kernelContext(kernelContext),
@@ -245,7 +245,7 @@ class OpNodeInfoWrapper : public Base1_t, public Base2_t, public Closable
  protected:
     // Lifetime is managed by the caller and guaranteed to outlive this class
     const onnxruntime::OpNodeProtoHelper<NodeInfoImpl_t>* m_impl = nullptr;
-    mutable onnxruntime::OpKernelContext* m_kernelContext = nullptr;
+    const onnxruntime::OpKernelContext* m_kernelContext = nullptr;
 
  private:
     template <MLOperatorAttributeType T>
@@ -304,8 +304,6 @@ class TensorWrapper : public WRL::Base<IMLOperatorTensor>, public Closable
 
     void* m_tensorData = nullptr;
     bool m_isDataInterface = false;
-
-    ID3D12Resource* m_abiDataInterface;
 };
 
 class OnnxTensorWrapper : public WRL::Base<IMLOperatorTensor>, public Closable
@@ -362,7 +360,7 @@ class OpKernelInfoWrapper : public OpNodeInfoWrapper<
         const AttributeMap* defaultAttributes,
         gsl::span<const uint32_t> requiredConstantCpuInputs,
         MLOperatorTensorGetter& constantInputGetter,
-        onnxruntime::OpKernelContext* kernelContext = nullptr
+        const onnxruntime::OpKernelContext* kernelContext = nullptr
     );
 
     // HasTensorShapeDescription returns false if and only if the kernel is registered using
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
index 9b4536b6218b2..9909be1f8337f 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h
@@ -10,11 +10,6 @@ struct DML_INPUT_GRAPH_EDGE_DESC;
 struct DML_OUTPUT_GRAPH_EDGE_DESC;
 struct DML_INTERMEDIATE_GRAPH_EDGE_DESC;
 
-namespace onnxruntime
-{
-    class TensorShape;
-}
-
 // Either nodesAsOpDesc or nodesAsIDMLOperator is present.
 //  1) Operator kernels which implement operators using only a single DML operator will pass a DML_OPERATOR_DESC.
 //     These kernels pass DML_OPERATOR_DESC, because while building Dml graph (inside FusedGraphKernel.cpp) we can change the
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index 1f30d7be5cf27..e4fdbdcb858c7 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -90,7 +90,6 @@ void DmlConfigureProviderFactoryBfcAllocatorEnabled(IExecutionProviderFactory* f
   dml_provider_factory->SetBfcAllocatorEnabled(bfc_allocator_enabled);
 }
 
-
 bool IsSoftwareAdapter(IDXGIAdapter1* adapter) {
     DXGI_ADAPTER_DESC1 desc;
     adapter->GetDesc1(&desc);

From 01d9bd27dd7b368edc17012de4fe2262b9f08127 Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Wed, 16 Aug 2023 07:02:39 -0700
Subject: [PATCH 76/76] Fix lint issues

---
 winml/adapter/winml_adapter_dml.cpp           |  2 +-
 .../Api.Ort/OnnxruntimeDmlSessionBuilder.cpp  |  4 +--
 .../Api.Ort/OnnxruntimeDmlSessionBuilder.h    |  2 +-
 .../lib/Api.Ort/OnnxruntimeEngineBuilder.cpp  |  2 +-
 winml/lib/Api/LearningModelSession.cpp        | 28 +++++++++----------
 5 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/winml/adapter/winml_adapter_dml.cpp b/winml/adapter/winml_adapter_dml.cpp
index 18efff94c60c8..0c4c451f4ed39 100644
--- a/winml/adapter/winml_adapter_dml.cpp
+++ b/winml/adapter/winml_adapter_dml.cpp
@@ -70,7 +70,7 @@ Microsoft::WRL::ComPtr<IDMLDevice> CreateDmlDevice(ID3D12Device* d3d12Device) {
 namespace onnxruntime {
 void DmlConfigureProviderFactoryMetacommandsEnabled(IExecutionProviderFactory* factory, bool metacommandsEnabled);
 void DmlConfigureProviderFactoryBfcAllocatorEnabled(IExecutionProviderFactory* factory, bool bfc_allocator_enabled);
-} // namespace onnxruntime
+}  // namespace onnxruntime
 
 #endif  // USE_DML
 
diff --git a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp
index d9f8880349755..9de5585e4ba78 100644
--- a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp
+++ b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.cpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #include "lib/Api.Ort/pch.h"
 
@@ -64,7 +64,7 @@ OnnxruntimeDmlSessionBuilder::CreateSessionOptions(OrtSessionOptions** options)
     winml_adapter_api->OrtSessionOptionsAppendExecutionProvider_CPU(session_options.get(), use_arena), ort_api
   );
 
-    // call release() so the underlying OrtSessionOptions object isn't freed
+  // call release() so the underlying OrtSessionOptions object isn't freed
   *options = session_options.release();
 
   return S_OK;
diff --git a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h
index 659f936cbcfff..3b1ade796d80f 100644
--- a/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h
+++ b/winml/lib/Api.Ort/OnnxruntimeDmlSessionBuilder.h
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #pragma once
 
diff --git a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp
index e21f6836e7c4a..a055b1b02ef64 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp
+++ b/winml/lib/Api.Ort/OnnxruntimeEngineBuilder.cpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #include "lib/Api.Ort/pch.h"
 
diff --git a/winml/lib/Api/LearningModelSession.cpp b/winml/lib/Api/LearningModelSession.cpp
index f362dbcded26f..922420d997f6e 100644
--- a/winml/lib/Api/LearningModelSession.cpp
+++ b/winml/lib/Api/LearningModelSession.cpp
@@ -1,5 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
- // Licensed under the MIT License.
+// Licensed under the MIT License.
 
 #include "lib/Api/pch/pch.h"
 
@@ -20,8 +20,8 @@
 static const auto c_enable_debug_output = L"EnableDebugOutput";
 
 namespace guid_details {
- // This GUID is to be used for delimiting ML-related categories of capturable work.
- // {D113B493-BBA2-4993-8608-D706A73B91CE}
+// This GUID is to be used for delimiting ML-related categories of capturable work.
+// {D113B493-BBA2-4993-8608-D706A73B91CE}
 struct __declspec(uuid("D113B493-BBA2-4993-8608-D706A73B91CE")) __declspec(novtable
 ) WINML_PIX_EVAL_CAPTURABLE_WORK_GUID {};
 }  // namespace guid_details
@@ -61,7 +61,7 @@ LearningModelSession::LearningModelSession(
 WINML_CATCH_ALL
 
 _winml::IModel* LearningModelSession::GetOptimizedModel() {
-   // Get the model proto
+  // Get the model proto
 
   auto should_close_model = session_options_ != nullptr && session_options_.CloseModelOnSessionCreation();
 
@@ -72,18 +72,18 @@ _winml::IModel* LearningModelSession::GetOptimizedModel(bool should_close_model)
   com_ptr<_winml::IModel> model;
 
   {
-     // Lock the model detach/copy since multiple threads can access concurrently
+    // Lock the model detach/copy since multiple threads can access concurrently
     CWinMLAutoLock lock(&session_creation_lock_);
 
-      // Throw if the model has been disposed and is not capable of creating
-     // new sessions.
+    // Throw if the model has been disposed and is not capable of creating
+    // new sessions.
     auto model_impl = model_.as<winmlp::LearningModel>();
     WINML_THROW_HR_IF_TRUE_MSG(E_INVALIDARG, model_impl->IsDisposed(), "The model has been disposed.");
 
     model.attach(should_close_model ? model_impl->DetachModel() : model_impl->CloneModel());
   }
 
-    // Ensure that the model is runnable on the device
+  // Ensure that the model is runnable on the device
   auto isFloat16Supported = device_.as<winmlp::LearningModelDevice>()->GetD3DDeviceCache()->IsFloat16Supported();
   if (!isFloat16Supported) {
     WINML_THROW_IF_FAILED(model->ModelEnsureNoFloat16());
@@ -92,13 +92,13 @@ _winml::IModel* LearningModelSession::GetOptimizedModel(bool should_close_model)
 }
 
 void LearningModelSession::Initialize() {
-   // Begin recording session creation telemetry
+  // Begin recording session creation telemetry
   _winmlt::TelemetryEvent session_creation_event(_winmlt::EventCategory::kSessionCreation);
-   // Get the optimized model proto from the learning model
+  // Get the optimized model proto from the learning model
   com_ptr<_winml::IModel> model;
   model.attach(GetOptimizedModel());
 
-    // Create the session builder
+  // Create the session builder
   auto device_impl = device_.as<winmlp::LearningModelDevice>();
   auto model_impl = model_.as<winmlp::LearningModel>();
 
@@ -121,7 +121,7 @@ void LearningModelSession::Initialize() {
 
   auto num_intra_op_threads = device_impl->NumberOfIntraOpThreads();
   auto allow_spinning = device_impl->AllowSpinning();
-   // Make onnxruntime apply the batch size override, if any
+  // Make onnxruntime apply the batch size override, if any
   if (session_options_) {
     if (session_options_.BatchSizeOverride() != 0) {
       WINML_THROW_IF_FAILED(engine_builder->SetBatchSizeOverride(session_options_.BatchSizeOverride()));
@@ -130,7 +130,7 @@ void LearningModelSession::Initialize() {
     com_ptr<winmlp::LearningModelSessionOptions> session_options_impl =
       session_options_.as<winmlp::LearningModelSessionOptions>();
 
-      // Make onnxruntime apply named dimension overrides, if any
+    // Make onnxruntime apply named dimension overrides, if any
     if (session_options_impl && session_options_impl->NamedDimensionOverrides().Size() > 0) {
       WINML_THROW_IF_FAILED(engine_builder->SetNamedDimensionOverrides(session_options_impl->NamedDimensionOverrides())
       );
@@ -164,7 +164,7 @@ void LearningModelSession::Initialize() {
   com_ptr<_winml::IEngine> engine;
   WINML_THROW_IF_FAILED(engine_builder->CreateEngine(engine.put()));
 
-    // Register the custom operator registry
+  // Register the custom operator registry
   operator_registry_ =
     MLOperatorRegistry(model_impl->GetOperatorRegistry(), [](auto registry) { registry->Release(); });
   WINML_THROW_IF_FAILED(engine->RegisterCustomRegistry(operator_registry_.get()));