diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 68b9b3fe5706f..334a40b979bda 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -182,7 +182,7 @@ namespace Dml
         }
         else
         {
-            if (!m_closed)
+            if (!m_context->IsClosed())
             {
                 // Free the underlying allocation once queued work has completed.
 #ifdef _GAMING_XBOX
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
index 65bc9b7f69316..16283d5b19c9c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -46,11 +46,6 @@ namespace Dml
         void SetDefaultRoundingMode(AllocatorRoundingMode roundingMode);

-        void Close()
-        {
-            m_closed = true;
-        }
-
     public: // onnxruntime::IAllocator
         void* Alloc(size_t size, AllocatorRoundingMode roundingMode);
         void* Alloc(size_t size) final;
@@ -88,7 +83,6 @@ namespace Dml
         std::vector m_pool;
         size_t m_currentAllocationId = 0;
         uint64_t m_currentResourceId = 0;
-        bool m_closed = false;

         // Unless specifically requested, allocation sizes are not rounded to enable pooling
         // until SetDefaultRoundingMode is called. This should be done at completion of session
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
index 67faf333d21e1..988324bab1174 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
@@ -55,7 +55,7 @@ namespace Dml
         // for example, an allocation from BucketizedBufferAllocator attempts to queue a reference
         // to its underlying D3D resource when freed. Furthermore, these references are unnecessary
         // since Close() already blocks for scheduled GPU work before clearing m_queuedReferences.
-        if (!m_clearingQueue)
+        if (!m_closing)
         {
             QueuedReference queuedReference = {GetLastFenceValue(), object};

@@ -70,15 +70,15 @@ namespace Dml
         }
     }

-    void CommandQueue::WaitForSignalAndClearQueue()
+    void CommandQueue::Close()
     {
         // Wait for flushed work:
-        assert(!m_clearingQueue);
-        m_clearingQueue = true;
+        assert(!m_closing);
+        m_closing = true;
         GpuEvent event = GetCurrentCompletionEvent();
         event.WaitForSignal(m_cpuSyncSpinningEnabled);
         m_queuedReferences.clear();
-        m_clearingQueue = false;
+        m_closing = false;
     }

     void CommandQueue::ReleaseCompletedReferences()
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h
index 9a4728d5845d4..71d5eb173cfec 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h
@@ -44,7 +44,7 @@ namespace Dml
         }
 #endif

-        void WaitForSignalAndClearQueue();
+        void Close();
         void ReleaseCompletedReferences();

     private:
@@ -61,7 +61,7 @@ namespace Dml
         ComPtr m_fence;
         uint64_t m_lastFenceValue = 0;

-        bool m_clearingQueue = false;
+        bool m_closing = false;
         bool m_cpuSyncSpinningEnabled = false;
     };
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
index ececf13fc8cdf..5dc1213bd76f0 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -11,10 +11,13 @@ namespace Dml
         ID3D12Device* d3d12Device,
         IDMLDevice* dmlDevice,
         ID3D12CommandQueue* queue,
-        bool cpuSyncSpinningEnabled)
+        bool cpuSyncSpinningEnabled,
+        bool keepOpen
+        )
         : m_queue(std::make_shared(queue, cpuSyncSpinningEnabled))
         , m_dmlRecorder(d3d12Device, dmlDevice, m_queue)
         , m_cpuSyncSpinningEnabled(cpuSyncSpinningEnabled)
+        , m_keepOpen(keepOpen)
     {
         ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf())));
     }
@@ -33,6 +36,8 @@
         D3D12_RESOURCE_STATES srcState,
         uint64_t byteCount)
     {
+        assert(!m_closed);
+
         SetCommandRecorder(&m_dmlRecorder);

         std::vector barriers;
@@ -79,6 +84,8 @@
         _Out_ uint64_t* completionValue
         )
     {
+        assert(!m_closed);
+
         SetCommandRecorder(&m_dmlRecorder);
         m_dmlRecorder.ExecuteCommandList(commandList, fence, completionValue);
     }
@@ -88,6 +95,7 @@
         const DML_BINDING_DESC& persistentResourceBinding,
         const DML_BINDING_DESC& inputArrayBinding)
     {
+        assert(!m_closed);
         SetCommandRecorder(&m_dmlRecorder);

         m_dmlRecorder.InitializeOperator(op, persistentResourceBinding, inputArrayBinding);
@@ -99,6 +107,7 @@
         gsl::span inputBindings,
         gsl::span outputBindings)
     {
+        assert(!m_closed);
         SetCommandRecorder(&m_dmlRecorder);

         m_dmlRecorder.ExecuteOperator(op, persistentResourceBinding, inputBindings, outputBindings);
@@ -106,6 +115,7 @@
     void ExecutionContext::AddUAVBarrier()
     {
+        assert(!m_closed);
         SetCommandRecorder(&m_dmlRecorder);

         m_dmlRecorder.AddUAVBarrier();
@@ -113,6 +123,7 @@
     void ExecutionContext::ResourceBarrier(gsl::span barriers)
     {
+        assert(!m_closed);
         SetCommandRecorder(&m_dmlRecorder);

         m_dmlRecorder.ResourceBarrier(barriers);
@@ -120,6 +131,7 @@
     void ExecutionContext::GetCommandListForRecordingAndInvalidateState(ID3D12GraphicsCommandList** commandList)
     {
+        assert(!m_closed);
         SetCommandRecorder(&m_dmlRecorder);

         // Ensure the descriptor heap is reset to D3D as something external may change it before recording
@@ -130,6 +142,8 @@ namespace Dml
     void ExecutionContext::SetCommandRecorder(ICommandRecorder* newRecorder)
     {
+        assert(!m_closed);
+
         // If changing which recorder is the current one, we need to flush the old one first. This is to ensure correct
         // ordering of operations on the command queue.
         if (m_currentRecorder != newRecorder)
@@ -146,6 +160,8 @@ namespace Dml
     void ExecutionContext::Flush()
     {
+        assert(!m_closed);
+
         if (!m_currentRecorder || !m_currentRecorder->HasUnsubmittedWork())
         {
             // Nothing to flush
@@ -164,21 +180,34 @@ namespace Dml
     void ExecutionContext::QueueReference(IUnknown* object)
     {
+        assert(!m_closed);
         // If something has been recorded into a command list but not submitted yet, it means that the *next* fence
         // value is the one to signal completion.
         bool waitForUnsubmittedWork = (m_currentRecorder != nullptr);
         m_queue->QueueReference(object, waitForUnsubmittedWork);
     }

-    void ExecutionContext::WaitForSignalAndClearQueue()
+    void ExecutionContext::Close()
     {
+        assert(!m_closed);
+
         // Discard unflushed work and clear queued references. This prevents the circular reference:
         // Kernel --> ProviderImpl --> Context --> QueuedRefs --> Kernel
-        m_queue->WaitForSignalAndClearQueue();
+        m_queue->Close();
+
+        // Keep the execution context open when requested, e.g. when used through the python API where there's a single context
+        // and a single command queue.
+        if (!m_keepOpen)
+        {
+            m_currentRecorder = nullptr;
+            m_closed = true;
+        }
     }

     GpuEvent ExecutionContext::GetCurrentCompletionEvent()
     {
+        assert(!m_closed);
+
         GpuEvent event = m_queue->GetCurrentCompletionEvent();

         // If something has been recorded into a command list but not submitted yet, it means that the *next* fence
@@ -194,11 +223,13 @@
     void ExecutionContext::ReleaseCompletedReferences()
     {
+        assert(!m_closed);
         m_queue->ReleaseCompletedReferences();
     }

     D3D12_COMMAND_LIST_TYPE ExecutionContext::GetCommandListTypeForQueue() const
     {
+        assert(!m_closed);
         return m_queue->GetType();
     }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
index 71aa26f4a0148..e7a6fa3d07296 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h
@@ -23,13 +23,14 @@ namespace Dml
             ID3D12Device* d3d12Device,
             IDMLDevice* dmlDevice,
             ID3D12CommandQueue* queue,
-            bool cpuSyncSpinningEnabled);
+            bool cpuSyncSpinningEnabled,
+            bool keepOpen);

         void SetAllocator(std::weak_ptr allocator);

         // Waits for flushed work, discards unflushed work, and discards associated references to
-        // prevent circular references.
-        void WaitForSignalAndClearQueue();
+        // prevent circular references. Must be the last call on the object before destruction.
+        void Close();

         // Queues a CopyBufferRegion (see ID3D12GraphicsCommandList::CopyBufferRegion) for execution. Transition
         // barriers are automatically inserted to transition the source and destination resources to COPY_SOURCE and
@@ -86,6 +87,7 @@ namespace Dml
         D3D12_COMMAND_LIST_TYPE GetCommandListTypeForQueue() const;

         bool CpuSyncSpinningEnabled() const { return m_cpuSyncSpinningEnabled; }
+        bool IsClosed() const { return m_closed; }

     private:
         Microsoft::WRL::ComPtr m_d3dDevice;
@@ -101,6 +103,10 @@ namespace Dml

         bool m_closed = false;
         bool m_cpuSyncSpinningEnabled = false;
+
+        // The python API has a global state used for I/O binding where the execution context is shared between sessions,
+        // so we don't want to close the context when one of the sessions is destroyed.
+        bool m_keepOpen = false;
     };

 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index ae9be4ea91c28..228dfeb123175 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -106,26 +106,7 @@ namespace Dml
         // Release the cached command list references before closing the context
         m_capturedGraphs.clear();

-        // Close the allocator before clearing the command queue to stop it from
-        // appending resources to it in an attempt to keep them alive.
-        if (m_allocator)
-        {
-            m_allocator->Close();
-        }
-
-        // Destroy the allocators. We are closing the execution provider, so from now on the
-        // only thing it will be used for is doing copies via the DataTransfer, which doesn't
-        // require allocating any memory.
-        // TODO: Move the copy functions over to ExecutionContext so that we are able to cleanly
-        // destroy ExecutionProviderImpl, and instead have the DataTransfer keep the context alive.
-        m_allocator = nullptr;
-        m_cpuInputAllocator = nullptr;
-
-        // Wait for all pending commands to be done executing and empty the command queue. This will
-        // Force all kernels and resources in flight to get destroyed and, from this point forward,
-        // ExecutionProviderImpl will only be used to execute transfer between resources that are
-        // already existing via the DataTransfer;
-        m_context->WaitForSignalAndClearQueue();
+        m_context->Close();
     }

     void ExecutionProviderImpl::WaitForOutstandingWork()
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index 89decfef6fef6..e8fe235fc1d46 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -86,11 +86,11 @@ std::unique_ptr DMLProviderFactory::CreateProvider() {
     // First, check if an I/O binding API that was used before this session or another session has already created a queue
     if (FAILED(d3d12_device->GetPrivateData(dml_execution_context_guid, &execution_context_ptr_size, execution_context.GetAddressOf()))) {
-      execution_context = wil::MakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), true);
+      execution_context = wil::MakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), true, true);
       ORT_THROW_IF_FAILED(d3d12_device->SetPrivateDataInterface(dml_execution_context_guid, execution_context.Get()));
     }
   } else {
-    execution_context = wil::MakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), cpu_sync_spinning_enabled_);
+    execution_context = wil::MakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), cpu_sync_spinning_enabled_, false);
   }

   auto provider = Dml::CreateExecutionProvider(dml_device_.Get(), execution_context.Get(), metacommands_enabled_, graph_capture_enabled_, cpu_sync_spinning_enabled_, disable_memory_arena_);
diff --git a/onnxruntime/core/session/lora_adapters.cc b/onnxruntime/core/session/lora_adapters.cc
index 599c41f79a537..466edce187a56 100644
--- a/onnxruntime/core/session/lora_adapters.cc
+++ b/onnxruntime/core/session/lora_adapters.cc
@@ -4,9 +4,10 @@
 #include "core/session/lora_adapters.h"

 #include "lora/adapter_format_utils.h"
+#include
+
 #include "core/framework/data_transfer.h"
 #include "core/framework/error_code_helper.h"
-#include "core/framework/execution_provider.h"
 #include "core/session/onnxruntime_c_api.h"
 #include "core/session/allocator_adapters.h"
 #include "core/session/ort_apis.h"
@@ -15,15 +16,6 @@
 #include "core/providers/cuda/cuda_provider_factory.h"
 #endif

-#ifdef USE_DML
-#include "core/session/abi_session_options_impl.h"
-#include "core/providers/dml/dml_provider_factory_creator.h"
-#include "core/providers/dml/dml_provider_factory.h"
-#endif
-
-#include
-#include
-
 namespace onnxruntime {
@@ -58,58 +50,28 @@ void LoraAdapter::MemoryMap(const std::filesystem::path& file_path) {
   InitializeParamsValues();
 }

-namespace {
-struct DataTransfer {
-  std::unique_ptr ep;
+static std::unique_ptr GetDataTransfer(const OrtMemoryInfo& mem_info) {
   std::unique_ptr data_transfer;
-  bool is_dml = false;
-  Status CopyTensor(const Tensor& src, Tensor& dst) const {
-    return data_transfer->CopyTensor(src, dst);
-  }
-  Status Sync() const {
-    if (is_dml) {
-      return ep->Sync();
-    } else {
-      return Status::OK();
-    }
-  }
-};
-}  // namespace
-
-static Status GetDataTransfer(const OrtMemoryInfo& mem_info, [[maybe_unused]] DataTransfer& dt) {
-  ORT_RETURN_IF(strcmp(mem_info.name, onnxruntime::CPU) == 0, "Expecting on device allocator for LoraAdapter");
+  if (strcmp(mem_info.name, onnxruntime::CPU) == 0) {
+    return data_transfer;
+  }

-  Status status;
   if (strcmp(mem_info.name, onnxruntime::CUDA) == 0) {
 #ifdef USE_CUDA
     auto* cuda_provider_info = TryGetProviderInfo_CUDA();
     if (cuda_provider_info != nullptr) {
-      dt.data_transfer = cuda_provider_info->CreateGPUDataTransfer();
-    } else {
-      status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA provider could not be loaded");
+      data_transfer = cuda_provider_info->CreateGPUDataTransfer();
     }
-#else
-    status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA provider is not enabled in this build");
-#endif
-  } else if (strcmp(mem_info.name, onnxruntime::DML) == 0) {
-#ifdef USE_DML
-    auto ep_factory = onnxruntime::DMLProviderFactoryCreator::Create(ConfigOptions{}, 0, false, false, false);
-    dt.ep = ep_factory->CreateProvider();
-    dt.is_dml = true;
-    dt.data_transfer = dt.ep->GetDataTransfer();
-#else
-    status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "DML provider is not enabled in this build");
 #endif
-  } else {
-    status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported device allocator");
   }

-  return status;
+  return data_transfer;
 }

 static Status CreateOrtValueOnDevice(const OrtValue& ort_value_mapped,
                                      const AllocatorPtr& device_allocator,
-                                     const DataTransfer& data_transfer,
+                                     const IDataTransfer& data_transfer,
                                      OrtValue& out) {
   OrtValue result;
   const auto& src = ort_value_mapped.Get();
@@ -125,9 +87,12 @@ void LoraAdapter::InitializeParamsValues() {
     ORT_THROW("Adapter is not loaded yet.");
   }

-  DataTransfer data_transfer;
+  std::unique_ptr data_transfer;
   if (device_allocator_) {
-    ORT_THROW_IF_ERROR(GetDataTransfer(device_allocator_->Info(), data_transfer));
+    data_transfer = GetDataTransfer(device_allocator_->Info());
+    if (data_transfer == nullptr) {
+      ORT_THROW("Data transfer is not available for the specified device allocator, it also must not be a CPU allocator");
+    }
   }

   const auto* params = adapter_->parameters();
@@ -135,12 +100,12 @@ void LoraAdapter::InitializeParamsValues() {
   std::unordered_map params_values;
   params_values.reserve(params->size());
   // Re-work in two separate loops due to compiler issues
-  if (device_allocator_) {
+  if (data_transfer) {
     for (const auto* param : *params) {
       auto [name, ort_value] = adapters::utils::CreateOrtValueOverLoraParameter(*param);
       OrtValue ort_value_ondevice;
       ORT_THROW_IF_ERROR(CreateOrtValueOnDevice(ort_value, device_allocator_,
-                                                data_transfer, ort_value_ondevice));
+                                                *data_transfer, ort_value_ondevice));
       Param lora_param(std::move(ort_value), std::move(ort_value_ondevice));
       params_values.emplace(std::move(name), std::move(lora_param));
     }
@@ -152,10 +117,6 @@ void LoraAdapter::InitializeParamsValues() {
     }
   }

-  if (device_allocator_) {
-    ORT_THROW_IF_ERROR(data_transfer.Sync());
-  }
-
   params_values_.swap(params_values);
 }
diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
index 74bd20461efea..92396bb09bd4c 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -226,7 +226,7 @@ AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id) {
     auto dml_device = onnxruntime::DMLProviderFactoryCreator::CreateDMLDevice(d3d12_device.Get());
     ORT_THROW_IF_FAILED(d3d12_device->SetPrivateDataInterface(dml_device_guid, dml_device.Get()));

-    context = wil::MakeOrThrow(d3d12_device.Get(), dml_device.Get(), cmd_queue.Get(), true);
+    context = wil::MakeOrThrow(d3d12_device.Get(), dml_device.Get(), cmd_queue.Get(), true, true);
     ORT_THROW_IF_FAILED(d3d12_device->SetPrivateDataInterface(dml_execution_context_guid, context.Get()));
   }
diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc
index 8338c7d547a09..e8291a36447ca 100644
--- a/onnxruntime/test/lora/lora_test.cc
+++ b/onnxruntime/test/lora/lora_test.cc
@@ -200,19 +200,13 @@ TEST(LoraAdapterTest, Load) {
 }

 #ifdef USE_CUDA
-TEST(LoraAdapterTest, VerifyCudaDeviceCopy) {
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    GTEST_SKIP() << "Skip This Test Due to this EP is null";
-  }
-#ifdef USE_DML
-  if (DefaultDmlExecutionProvider() != nullptr) {
-    GTEST_FAIL() << "It should not run with DML EP";
-  }
-#endif
+TEST(LoraAdapterTest, VerifyDeviceCopy) {
   auto cpu_ep = DefaultCpuExecutionProvider();
   auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
-  auto cuda_allocator = DefaultCudaExecutionProvider()->CreatePreferredAllocators()[0];
-  auto cuda_transfer = DefaultCudaExecutionProvider()->GetDataTransfer();
+  auto cuda_ep = DefaultCudaExecutionProvider();
+  auto cuda_allocator = cuda_ep->CreatePreferredAllocators()[0];
+
+  auto gpu_transfer = cuda_ep->GetDataTransfer();

   auto test_params = GenerateTestParameters()();
   lora::LoraAdapter adapter(std::move(cuda_allocator));
@@ -228,54 +222,9 @@ TEST(LoraAdapterTest, VerifyCudaDeviceCopy) {
       ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());

       Tensor copy(tensor_cpu.DataType(), tensor_cpu.Shape(), cpu_allocator);
-      ASSERT_TRUE(cuda_transfer->CanCopy(tensor_device.Location().device,
-                                         copy.Location().device));
-      ASSERT_STATUS_OK(cuda_transfer->CopyTensor(tensor_device, copy));
-
-      auto expected_span = tensor_cpu.DataAsSpan();
-      auto copy_span = copy.DataAsSpan();
-
-      ASSERT_EQ(expected_span, copy_span);
-    }
-  }
-#endif
-
-#ifdef USE_DML
-TEST(LoraAdapterTest, VerifyDmlDeviceCopy) {
-  // NO_DML_TEST is set, DML test is skipped
-  if (DefaultDmlExecutionProvider() == nullptr) {
-    GTEST_SKIP() << "Skip This Test Due to this EP is null";
-  }
-
-#ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() != nullptr) {
-    GTEST_FAIL() << "It should not run with CUDA EP";
-  }
-#endif
-
-  auto cpu_ep = DefaultCpuExecutionProvider();
-  auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
-
-  auto dml_allocator = DefaultDmlExecutionProvider()->CreatePreferredAllocators()[0];
-  auto dml_transfer = DefaultDmlExecutionProvider()->GetDataTransfer();
-
-  auto test_params = GenerateTestParameters()();
-  lora::LoraAdapter adapter(std::move(dml_allocator));
-  adapter.Load(std::move(test_params));
-
-  auto [begin, end] = adapter.GetParamIterators();
-  for (; begin != end; ++begin) {
-    const auto& [_, param] = *begin;
-    const auto& tensor_device = param.GetDeviceOrMapped().Get();
-    ASSERT_EQ(0, strcmp(tensor_device.Location().name, onnxruntime::DML));
-
-    const auto& tensor_cpu = param.GetMapped().Get();
-    ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());
-
-    Tensor copy(tensor_cpu.DataType(), tensor_cpu.Shape(), cpu_allocator);
-    ASSERT_TRUE(dml_transfer->CanCopy(tensor_device.Location().device,
+      ASSERT_TRUE(gpu_transfer->CanCopy(tensor_device.Location().device,
                                         copy.Location().device));
-    ASSERT_STATUS_OK(dml_transfer->CopyTensor(tensor_device, copy));
+      ASSERT_STATUS_OK(gpu_transfer->CopyTensor(tensor_device, copy));

       auto expected_span = tensor_cpu.DataAsSpan();
       auto copy_span = copy.DataAsSpan();
@@ -284,6 +233,5 @@ TEST(LoraAdapterTest, VerifyDmlDeviceCopy) {
     }
   }
 #endif
-
 }  // namespace test
 }  // namespace onnxruntime
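
Reviewer note (not part of the patch): a minimal standalone sketch of the lifetime pattern this change introduces. The `Context` and `Allocator` classes and the `keepOpen` flag below are simplified, illustrative stand-ins for `Dml::ExecutionContext` and `BucketizedBufferAllocator`, not the actual ORT types; the point is that the "closed" state now lives on the context, the allocator queries it via `IsClosed()`, and `Close()` becomes a partial no-op for a shared context created with `keepOpen = true` (the python I/O-binding case).

```cpp
#include <cassert>
#include <memory>

// Illustrative stand-in for the execution context: owns the "closed" state
// that the allocator previously tracked with its own m_closed member.
class Context {
 public:
  explicit Context(bool keepOpen) : m_keepOpen(keepOpen) {}

  void Close() {
    assert(!m_closed);
    // Real code waits for queued GPU work and clears queued references here.
    if (!m_keepOpen) {
      m_closed = true;  // per-session context: actually closes
    }
    // keepOpen == true (shared/global context): stays usable for the next session
  }

  bool IsClosed() const { return m_closed; }

 private:
  bool m_keepOpen = false;
  bool m_closed = false;
};

// Illustrative stand-in for the allocator: no local m_closed any more,
// it defers to the context when deciding whether to queue a reference on free.
class Allocator {
 public:
  explicit Allocator(std::shared_ptr<Context> context) : m_context(std::move(context)) {}

  void Free() {
    if (!m_context->IsClosed()) {
      // Queue the underlying resource so it is released only after queued work completes.
    }
  }

 private:
  std::shared_ptr<Context> m_context;
};

int main() {
  auto shared = std::make_shared<Context>(/*keepOpen*/ true);  // python I/O-binding style
  Allocator alloc(shared);
  shared->Close();  // clears queued work but leaves the context open
  alloc.Free();     // still allowed: IsClosed() is false

  auto per_session = std::make_shared<Context>(/*keepOpen*/ false);
  per_session->Close();  // really closes; further use of this context would assert
  return 0;
}
```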