From 38f889e77de6050818e10cd1681149ba87e44f6f Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Mon, 30 Jun 2025 12:55:01 +0000
Subject: [PATCH 01/74] Add L0 support for gpu

---
 .github/workflows/android_arm64.yml           |   1 +
 .github/workflows/linux_riscv.yml             |   1 +
 cmake/features.cmake                          |   9 +
 .../runtime/intel_gpu/remote_properties.hpp   |   5 +
 src/plugins/intel_gpu/CMakeLists.txt          |   3 +
 src/plugins/intel_gpu/cmake/utils.cmake       |  15 +
 .../intel_gpu/plugin/remote_context.hpp       |  25 ++
 .../include/intel_gpu/runtime/device.hpp      |  24 +
 .../include/intel_gpu/runtime/device_info.hpp |  47 +-
 .../runtime/engine_configuration.hpp          |   8 +-
 .../include/intel_gpu/runtime/kernel.hpp      |   7 +-
 .../intel_gpu/src/graph/CMakeLists.txt        |   1 +
 .../graph_optimizer/add_required_reorders.cpp |   1 +
 .../graph/graph_optimizer/handle_reshape.cpp  |   1 +
 .../mark_runtime_skippable_nodes.cpp          |   2 +-
 .../graph_optimizer/post_optimize_weights.cpp |   1 +
 .../graph_optimizer/prepare_buffer_fusing.cpp |   1 +
 .../prepare_primitive_fusing.cpp              |   1 +
 .../prepare_primitive_fusing_through.cpp      |   1 +
 .../remove_redundant_reorders.cpp             |   1 +
 .../src/graph/impls/ocl/custom_primitive.cpp  |   4 +-
 .../src/graph/impls/ocl/kernels_cache.cpp     |  15 +-
 .../src/graph/impls/ocl/kernels_cache.hpp     |   2 +-
 .../graph/impls/ocl/multi_stage_primitive.hpp |   7 +-
 .../src/graph/impls/ocl/primitive_base.hpp    |  18 +-
 .../intel_gpu/src/graph/impls/ocl/reorder.hpp |   2 +
 .../graph/impls/ocl_v2/primitive_ocl_base.hpp |   4 +-
 .../src/graph/include/primitive_inst.h        |   2 +-
 src/plugins/intel_gpu/src/graph/program.cpp   |   2 +-
 .../convolution/convolution_kernel_ref.cpp    |   2 +-
 src/plugins/intel_gpu/src/plugin/plugin.cpp   |   8 +-
 .../intel_gpu/src/plugin/remote_context.cpp   |  17 +-
 .../intel_gpu/src/runtime/CMakeLists.txt      |  19 +-
 .../intel_gpu/src/runtime/device_query.cpp    |  21 +-
 src/plugins/intel_gpu/src/runtime/engine.cpp  |   6 +
 src/plugins/intel_gpu/src/runtime/memory.cpp  |   2 +
 .../intel_gpu/src/runtime/ocl/ocl_device.cpp  |  38 +-
 .../src/runtime/ocl/ocl_device_detector.cpp   |  23 -
 .../src/runtime/ocl/ocl_device_detector.hpp   |   2 -
 .../intel_gpu/src/runtime/ocl/ocl_kernel.cpp  |  42 ++
 .../intel_gpu/src/runtime/ocl/ocl_kernel.hpp  |   2 +
 .../src/runtime/ze/ze_base_event.hpp          |  25 ++
 .../intel_gpu/src/runtime/ze/ze_common.cpp    |  39 ++
 .../intel_gpu/src/runtime/ze/ze_common.hpp    |  32 ++
 .../intel_gpu/src/runtime/ze/ze_device.cpp    | 273 ++++++++++++
 .../intel_gpu/src/runtime/ze/ze_device.hpp    |  43 ++
 .../src/runtime/ze/ze_device_detector.cpp     | 118 +++++
 .../src/runtime/ze/ze_device_detector.hpp     |  32 ++
 .../intel_gpu/src/runtime/ze/ze_engine.cpp    | 266 +++++++++++
 .../intel_gpu/src/runtime/ze/ze_engine.hpp    |  64 +++
 .../src/runtime/ze/ze_engine_factory.hpp      |  18 +
 .../intel_gpu/src/runtime/ze/ze_event.cpp     | 216 +++++++++
 .../intel_gpu/src/runtime/ze/ze_event.hpp     |  94 ++++
 .../src/runtime/ze/ze_event_pool.cpp          |  78 ++++
 .../src/runtime/ze/ze_event_pool.hpp          |  46 ++
 .../intel_gpu/src/runtime/ze/ze_kernel.hpp    |  62 +++
 .../intel_gpu/src/runtime/ze/ze_memory.cpp    | 236 ++++++++++
 .../intel_gpu/src/runtime/ze/ze_memory.hpp    | 153 +++++++
 .../intel_gpu/src/runtime/ze/ze_stream.cpp    | 348 +++++++++++++++
 .../intel_gpu/src/runtime/ze/ze_stream.hpp    |  69 +++
 .../intel_gpu/tests/functional/CMakeLists.txt |   2 +
 ..._tests.cpp => ocl_remote_tensor_tests.cpp} |   3 +
 .../ze_remote_tensor_tests.cpp                |  20 +
 .../intel_gpu/tests/unit/CMakeLists.txt       |   1 +
 .../tests/unit/module_tests/device_test.cpp   |   6 +-
 .../tests/unit/test_utils/test_utils.cpp      |   2 +-
 thirdparty/dependencies.cmake                 |   2 +-
 thirdparty/level_zero/CMakeLists.txt          |   2 +-
 thirdparty/level_zero/ze_intel_gpu.h          | 412 ++++++++++++++++++
 thirdparty/level_zero/ze_stypes.h             |  43 ++
 70 files changed, 2962 insertions(+), 136 deletions(-)
 create mode 100644 src/plugins/intel_gpu/cmake/utils.cmake
 create mode 100644 src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
 rename src/plugins/intel_gpu/tests/functional/remote_tensor_tests/{gpu_remote_tensor_tests.cpp => ocl_remote_tensor_tests.cpp} (99%)
 create mode 100644 src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp
 create mode 100644 thirdparty/level_zero/ze_intel_gpu.h
 create mode 100644 thirdparty/level_zero/ze_stypes.h

diff --git a/.github/workflows/android_arm64.yml b/.github/workflows/android_arm64.yml
index 77711cacce1a09..e62ceca6f7a26b 100644
--- a/.github/workflows/android_arm64.yml
+++ b/.github/workflows/android_arm64.yml
@@ -128,6 +128,7 @@ jobs:
             git submodule update --init -- ${OPENVINO_REPO}/thirdparty/json
             git submodule update --init -- ${OPENVINO_REPO}/thirdparty/gtest
             git submodule update --init -- ${OPENVINO_REPO}/thirdparty/gflags
+            git submodule update --init -- ${OPENVINO_REPO}/thirdparty/level_zero
           popd
 
       - name: Clone vcpkg
diff --git a/.github/workflows/linux_riscv.yml b/.github/workflows/linux_riscv.yml
index 2f57ace4bd7ef2..6b34b834497295 100644
--- a/.github/workflows/linux_riscv.yml
+++ b/.github/workflows/linux_riscv.yml
@@ -150,6 +150,7 @@ jobs:
             git submodule update --init -- ${OPENVINO_REPO}/thirdparty/telemetry
             git submodule update --init -- ${OPENVINO_REPO}/src/plugins/intel_cpu
             git submodule update --init -- ${OPENVINO_REPO}/thirdparty/flatbuffers/flatbuffers
+            git submodule update --init -- ${OPENVINO_REPO}/thirdparty/level_zero
           popd
 
       #
diff --git a/cmake/features.cmake b/cmake/features.cmake
index dc8ebeeb9371ad..0d4cb16fd241dd 100644
--- a/cmake/features.cmake
+++ b/cmake/features.cmake
@@ -41,6 +41,15 @@ else()
     set(ENABLE_ONEDNN_FOR_GPU_DEFAULT ON)
 endif()
 
+set(OV_GPU_DEFAULT_RT "L0")
+if (ENABLE_INTEL_GPU)
+    ov_option_enum (GPU_RT_TYPE "Type of GPU runtime. Supported value: OCL and L0" ${OV_GPU_DEFAULT_RT} ALLOWED_VALUES L0 OCL)
+    if (GPU_RT_TYPE STREQUAL "L0")
+        # There's no interop with native L0 in onednn API. Temporary disable onednn when L0 runtime is selected
+        set(ENABLE_ONEDNN_FOR_GPU_DEFAULT OFF)
+    endif()
+endif()
+
 ov_dependent_option (ENABLE_ONEDNN_FOR_GPU "Enable oneDNN with GPU support" ${ENABLE_ONEDNN_FOR_GPU_DEFAULT} "ENABLE_INTEL_GPU" OFF)
 
 ov_dependent_option (ENABLE_INTEL_NPU "NPU plugin for OpenVINO runtime" ON "X86_64;WIN32 OR LINUX" OFF)
diff --git a/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp b/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp
index 53c8de921a747d..cad287b56059bb 100644
--- a/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp
@@ -24,6 +24,7 @@ using gpu_handle_param = void*;
 enum class ContextType {
     OCL = 0,        //!< Pure OpenCL context
     VA_SHARED = 1,  //!< Context shared with a video decoding device
+    ZE = 2,         //!< Pure Level0 context
 };
 
 /** @cond INTERNAL */
@@ -33,6 +34,8 @@ inline std::ostream& operator<<(std::ostream& os, const ContextType& context_typ
         return os << "OCL";
     case ContextType::VA_SHARED:
         return os << "VA_SHARED";
+    case ContextType::ZE:
+        return os << "ZE";
     default:
         OPENVINO_THROW("Unsupported context type");
     }
@@ -43,6 +46,8 @@ inline std::istream& operator>>(std::istream& is, ContextType& context_type) {
     is >> str;
     if (str == "OCL") {
         context_type = ContextType::OCL;
+    } else if (str == "ZE") {
+        context_type = ContextType::ZE;
     } else if (str == "VA_SHARED") {
         context_type = ContextType::VA_SHARED;
     } else {
diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt
index 9556ab5873c616..58f5e74330a4d4 100644
--- a/src/plugins/intel_gpu/CMakeLists.txt
+++ b/src/plugins/intel_gpu/CMakeLists.txt
@@ -8,6 +8,8 @@ endif()
 
 set (TARGET_NAME "openvino_intel_gpu_plugin")
 
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
+
 if(OV_COMPILER_IS_INTEL_LLVM)
     # For windows we need to disable warning as error option to make FindSYCL.cmake work
     if (WIN32)
@@ -77,6 +79,7 @@ target_include_directories(${TARGET_NAME} PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/include/)
 
 ov_set_threading_interface_for(${TARGET_NAME})
+ov_gpu_set_runtime_interface_for(${TARGET_NAME})
 
 set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})
 
diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake
new file mode 100644
index 00000000000000..1dc0edebb5fe2d
--- /dev/null
+++ b/src/plugins/intel_gpu/cmake/utils.cmake
@@ -0,0 +1,15 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+function(ov_gpu_set_runtime_interface_for TARGET_NAME)
+    if(GPU_RT_TYPE STREQUAL "L0")
+        target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_ZE_RT=1)
+        target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero)
+    elseif(GPU_RT_TYPE STREQUAL "OCL")
+        target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1)
+        target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::NewHeaders OpenCL::OpenCL)
+    else()
+        message(FATAL_ERROR "Invalid GPU runtime type: `${GPU_RT_TYPE}` Only `L0` and `OCL` are supported")
+    endif()
+endfunction()
diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
index 7425f701f88710..3606e95e5d9521 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
@@ -8,6 +8,7 @@
 # define NOMINMAX
 #endif
 
+#include "intel_gpu/runtime/engine_configuration.hpp"
 #include "openvino/runtime/intel_gpu/remote_properties.hpp"
 #include "openvino/runtime/iremote_context.hpp"
 
@@ -22,6 +23,26 @@
 
 namespace ov::intel_gpu {
 
+inline std::pair<cldnn::engine_types, cldnn::runtime_types> get_device_query_params() {
+    #ifdef OV_GPU_WITH_ZE_RT
+        auto runtime_type = cldnn::runtime_types::ze;
+        #ifdef OV_GPU_WITH_SYCL
+            auto engine_type = cldnn::engine_types::sycl;
+        #else
+            auto engine_type = cldnn::engine_types::ze;
+        #endif
+    #else
+        auto runtime_type = cldnn::runtime_types::ocl;
+        #ifdef OV_GPU_WITH_SYCL
+            auto engine_type = cldnn::engine_types::sycl;
+        #else
+            auto engine_type = cldnn::engine_types::ocl;
+        #endif
+    #endif
+
+    return {engine_type, runtime_type};
+}
+
 class RemoteContextImpl : public ov::IRemoteContext {
 public:
     using Ptr = std::shared_ptr<RemoteContextImpl>;
@@ -93,7 +114,11 @@ class RemoteContextImpl : public ov::IRemoteContext {
     ov::intel_gpu::gpu_handle_param m_va_display = nullptr;
     ov::intel_gpu::gpu_handle_param m_external_queue = nullptr;
 
+#ifdef OV_GPU_WITH_ZE_RT
+    ContextType m_type = ContextType::ZE;
+#else
     ContextType m_type = ContextType::OCL;
+#endif
     std::string m_device_name = "";
     static const size_t cache_capacity = 100;
     cldnn::LruCache<size_t, cldnn::memory::ptr> m_memory_cache = cldnn::LruCache<size_t, cldnn::memory::ptr>(cache_capacity);
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp
index 7c567e877d7552..ef885414c6f1c0 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp
@@ -33,4 +33,28 @@ struct device {
     virtual ~device() = default;
 };
 
+// The priority return by this function impacts the order of devices reported by GPU plugin and devices enumeration
+// Lower priority value means lower device ID
+// Current behavior is: Intel iGPU < Intel dGPU < any other GPU
+// Order of Intel dGPUs is undefined and depends on the OCL impl
+// Order of other vendor GPUs is undefined and depends on the OCL impl
+inline size_t get_device_priority(const cldnn::device_info& info) {
+    if (info.vendor_id == cldnn::INTEL_VENDOR_ID && info.dev_type == cldnn::device_type::integrated_gpu) {
+        return 0;
+    } else if (info.vendor_id == cldnn::INTEL_VENDOR_ID) {
+        return 1;
+    } else {
+        return std::numeric_limits<size_t>::max();
+    }
+}
+
+inline std::vector<device::ptr> sort_devices(const std::vector<device::ptr>& devices_list) {
+    std::vector<device::ptr> sorted_list = devices_list;
+    std::stable_sort(sorted_list.begin(), sorted_list.end(), [](device::ptr d1,  device::ptr d2) {
+        return get_device_priority(d1->get_info()) < get_device_priority(d2->get_info());
+    });
+
+    return sorted_list;
+}
+
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
index 319ca366a9dea2..738515a67b9a1b 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
@@ -46,13 +46,13 @@ struct gfx_version {
                < std::tie(r.major, r.minor, r.revision); // same order
     }
 
-    bool operator==(const gfx_version& other) {
+    bool operator==(const gfx_version& other) const {
         return major == other.major &&
                minor == other.minor &&
                revision == other.revision;
     }
 
-    bool operator!=(const gfx_version& other) {
+    bool operator!=(const gfx_version& other) const {
         return !(*this == other);
     }
 };
@@ -63,14 +63,14 @@ struct pci_bus_info {
     uint32_t pci_device = 0;
     uint32_t pci_function = 0;
 
-    bool operator==(const pci_bus_info& other) {
+    bool operator==(const pci_bus_info& other) const {
         return pci_domain == other.pci_domain &&
                pci_bus == other.pci_bus &&
                pci_device == other.pci_device &&
                pci_function == other.pci_function;
     }
 
-    bool operator!=(const pci_bus_info& other) {
+    bool operator!=(const pci_bus_info& other) const {
         return !(*this == other);
     }
 };
@@ -105,6 +105,8 @@ struct device_info {
     bool supports_imad;                         ///< Does engine support int8 mad.
     bool supports_immad;                        ///< Does engine support int8 multi mad.
 
+    bool supports_mutable_command_list;         ///< Does the target runtime/device support mutable command list feature
+
     bool supports_usm;                          ///< Does engine support unified shared memory.
     bool has_separate_cache;                    ///< Does the target hardware has separate cache for usm_device and usm_host
 
@@ -129,8 +131,45 @@ struct device_info {
 
     pci_bus_info pci_info;                      ///< PCI bus information for the device
 
+    uint64_t timer_resolution;                  ///< Resolution of device timer used for profiling in cycles/sec
+    uint32_t kernel_timestamp_valid_bits;       ///< Number of valid bits in the kernel timestamp values
+    uint32_t compute_queue_group_ordinal;       ///< Ordinal of the command queue group with compute support
+    uint32_t device_memory_ordinal;             ///< Ordinal of the selected global device memory
+
     ov::device::UUID uuid;                      ///< UUID of the gpu device
     ov::device::LUID luid;                      ///< LUID of the gpu device
+
+    inline bool is_same_device(const device_info &other) const {
+        // Relying solely on the UUID is not reliable in all the cases (particularly on legacy platforms),
+        // where the UUID may be missing or incorrectly generated
+        // Therefore, we also validate other attributes
+        if (uuid.uuid != other.uuid.uuid)
+            return false;
+
+        if (pci_info != other.pci_info)
+            return false;
+
+        if (sub_device_idx != other.sub_device_idx)
+            return false;
+
+        if (vendor_id != other.vendor_id ||
+            dev_name != other.dev_name ||
+            driver_version != other.driver_version)
+            return false;
+
+        if (dev_type != other.dev_type ||
+            gfx_ver != other.gfx_ver ||
+            arch != other.arch)
+            return false;
+
+        if (ip_version != other.ip_version || device_id != other.device_id)
+            return false;
+
+        if (execution_units_count != other.execution_units_count || max_global_mem_size != other.max_global_mem_size)
+            return false;
+
+        return true;
+    }
 };
 
 /// @}
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp
index abe01b0cc5da22..8eea9df0169ab2 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp
@@ -4,20 +4,22 @@
 
 #pragma once
 
-#include <string>
+#include <ostream>
 
 namespace cldnn {
 
 /// @brief Defines available engine types
 enum class engine_types : int32_t {
     ocl,
-    sycl
+    sycl,
+    ze
 };
 
 inline std::ostream& operator<<(std::ostream& os, const engine_types& type) {
     switch (type) {
     case engine_types::ocl: os << "ocl"; break;
     case engine_types::sycl: os << "sycl"; break;
+    case engine_types::ze: os << "ze"; break;
     default: os << "unknown"; break;
     }
 
@@ -27,11 +29,13 @@ inline std::ostream& operator<<(std::ostream& os, const engine_types& type) {
 /// @brief Defines available runtime types
 enum class runtime_types : int32_t {
     ocl,
+    ze,
 };
 
 inline std::ostream& operator<<(std::ostream& os, const runtime_types& type) {
     switch (type) {
     case runtime_types::ocl: os << "ocl"; break;
+    case runtime_types::ze: os << "ze"; break;
     default: os << "unknown"; break;
     }
 
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
index 8dcd9d07d0f246..6c55df8507c812 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
@@ -4,9 +4,6 @@
 
 #pragma once
 
-#include "kernel_args.hpp"
-#include "event.hpp"
-
 #include <memory>
 #include <vector>
 
@@ -19,7 +16,9 @@ class kernel {
     using ptr = std::shared_ptr<kernel>;
     virtual std::shared_ptr<kernel> clone(bool reuse_kernel_handle = false) const = 0;
     virtual ~kernel() = default;
-    virtual std::string get_id() const { return ""; }
+
+    virtual std::string get_id() const = 0;
+    virtual std::vector<uint8_t> get_binary() const = 0;
 };
 
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/CMakeLists.txt b/src/plugins/intel_gpu/src/graph/CMakeLists.txt
index b3ee51e242d81f..4611d731429ce4 100644
--- a/src/plugins/intel_gpu/src/graph/CMakeLists.txt
+++ b/src/plugins/intel_gpu/src/graph/CMakeLists.txt
@@ -137,6 +137,7 @@ ov_build_target_faster(${TARGET_NAME} PCH)
 target_compile_options(${TARGET_NAME} PRIVATE ${COMMON_COMPILE_OPTIONS})
 
 ov_set_threading_interface_for(${TARGET_NAME})
+ov_gpu_set_runtime_interface_for(${TARGET_NAME})
 
 set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})
 
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp
index e2523ab8c870f8..74e6165a685d60 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp
@@ -9,6 +9,7 @@
 #include "fully_connected_inst.h"
 #include "assign_inst.h"
 #include "mvn_inst.h"
+#include "reorder_inst.h"
 
 #include <algorithm>
 #include <memory>
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp
index 3f07b505388de4..b7abdd649f71bf 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp
@@ -5,6 +5,7 @@
 #include "pass_manager.h"
 #include "program_helpers.h"
 #include "reshape_inst.h"
+#include "reorder_inst.h"
 #include <vector>
 #include <memory>
 
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
index 0eb670dd067767..9935c39f02e902 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
@@ -12,7 +12,7 @@
 #include "shape_of_inst.h"
 #include "broadcast_inst.h"
 #include "non_zero_inst.h"
-#include "non_max_suppression_inst.h"
+#include "reorder_inst.h"
 #include "unique_inst.hpp"
 #include "scatter_elements_update_inst.h"
 #include "scatter_update_inst.h"
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
index c7296ea35b8940..56239fb0263165 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
@@ -9,6 +9,7 @@
 #include "convolution_inst.h"
 #include "deconvolution_inst.h"
 #include "fully_connected_inst.h"
+#include "reorder_inst.h"
 #include "lstm_seq_inst.h"
 #include "gru_seq_inst.h"
 #include "intel_gpu/runtime/format.hpp"
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index 124d61db911415..1b53f49c673fa1 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -11,6 +11,7 @@
 #include "crop_inst.h"
 #include "eltwise_inst.h"
 #include "gemm_inst.h"
+#include "reorder_inst.h"
 #include "assign_inst.h"
 #include "read_value_inst.h"
 #include "reshape_inst.h"
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp
index 8f0addbd605652..a1dec0ca256e5f 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp
@@ -12,6 +12,7 @@
 #include "activation_inst.h"
 #include "batch_to_space_inst.h"
 #include "crop_inst.h"
+#include "reorder_inst.h"
 #include "eltwise_inst.h"
 #include "gemm_inst.h"
 #include "lrn_inst.h"
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp
index 8021237c68179e..4282585885e9d3 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp
@@ -10,6 +10,7 @@
 #include "data_inst.h"
 #include "eltwise_inst.h"
 #include "mutable_data_inst.h"
+#include "reorder_inst.h"
 #include <vector>
 #include <memory>
 
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
index 5a661a90989186..9dd5fbbc6a1374 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
@@ -22,6 +22,7 @@
 #include "fully_connected_inst.h"
 #include "group_normalization_inst.h"
 #include "mvn_inst.h"
+#include "reorder_inst.h"
 
 #include <vector>
 #include <list>
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp
index 4aeadc7a297da3..816efd32e37293 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp
@@ -62,8 +62,8 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
 
-    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
-        _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0]));
+    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const engine& e) override {
+        _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0], e));
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
 
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
index 9bd724e03e80a9..2d448e99ce3264 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
@@ -493,11 +493,11 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co
     }
 }
 
-kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id) const {
+kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id, const engine& e) const {
     auto res = _cached_kernels.find(id);
     OPENVINO_ASSERT(_cached_kernels.end() != res, "[GPU] Kernel " + id + " not found in the cached kernel cache!");
 
-    return res->second->clone(_reuse_kernels);
+    return e.prepare_kernel(res->second->clone(_reuse_kernels));
 }
 
 std::vector<kernel::ptr> kernels_cache::get_kernels(const kernel_impl_params& params) const {
@@ -642,15 +642,12 @@ void kernels_cache::add_kernels_source(const kernel_impl_params& params,
 }
 
 std::string kernels_cache::get_cached_kernel_id(kernel::ptr kernel) const {
-    auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
-    const auto& entry_point = ocl_kernel->get_handle().getInfo<CL_KERNEL_FUNCTION_NAME>();
-    auto program = ocl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>();
-    cl::vector<unsigned char> program_binaries = getProgramBinaries(std::move(program));
+    auto program_binaries = kernel->get_binary();
 
     auto iter = _cached_binaries.find(program_binaries);
     OPENVINO_ASSERT(iter != _cached_binaries.end(), "[GPU] Not found cached kernel binaries");
 
-    return entry_point + "@" + std::to_string(iter->second);
+    return kernel->get_id() + "@" + std::to_string(iter->second);
 }
 
 std::vector<std::string> kernels_cache::get_cached_kernel_ids(const std::vector<kernel::ptr>& kernels) const {
@@ -668,9 +665,7 @@ void kernels_cache::add_to_cached_kernels(const std::vector<kernel::ptr>& kernel
     static std::atomic<uint32_t> id_gen{0};
 
     for (auto& kernel : kernels) {
-        auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
-        auto program = ocl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>();
-        cl::vector<unsigned char> program_binaries = getProgramBinaries(std::move(program));
+        auto program_binaries = kernel->get_binary();
 
         std::lock_guard<std::mutex> lock(_mutex);
         auto iter = _cached_binaries.find(program_binaries);
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp
index dc77442fedef47..e623c547bf691d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp
@@ -121,7 +121,7 @@ class kernels_cache {
                            uint32_t prog_id,
                            std::shared_ptr<ov::threading::ITaskExecutor> task_executor = nullptr,
                            const std::map<std::string, std::string>& batch_headers = {});
-    kernel::ptr get_kernel_from_cached_kernels(std::string id) const;
+    kernel::ptr get_kernel_from_cached_kernels(std::string id, const engine& e) const;
     std::vector<kernel::ptr> get_kernels(const kernel_impl_params& params) const;
 
     void set_kernels_reuse(bool reuse_kernels) { _reuse_kernels = reuse_kernels; }
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
index 71e8571467d322..482184340171c5 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
@@ -126,12 +126,12 @@ struct multi_stage_primitive : public typed_primitive_impl<PType> {
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
 
-    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
+    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const engine& e) override {
         _kernels.clear();
 
         _kernels.reserve(cached_kernel_ids.size());
         for (size_t k = 0; k < cached_kernel_ids.size(); ++k) {
-            _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k]));
+            _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k], e));
         }
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
@@ -221,11 +221,12 @@ struct multi_stage_primitive : public typed_primitive_impl<PType> {
     void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
         OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed.");
         auto& kernel_vec = kernels.begin()->second;
+        auto& engine = kernels.begin()->first.get_program().get_engine();
         _kernels.clear();
         _kernels.resize(kernel_vec.size());
         for (auto& k : kernel_vec) {
             auto sub_kernel_idx = k.second;
-            _kernels[sub_kernel_idx] = k.first;
+            _kernels[sub_kernel_idx] = engine.prepare_kernel(k.first);
         }
     }
 
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
index 62300417735674..e19baa6b923f66 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -136,10 +136,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     }
 
     void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
-        if (is_cpu()) {
-            return;
-        }
-
         _kernels.clear();
         if (!_kernel_data.kernels.empty()) {
             auto compiled_kernels = kernels_cache.get_kernels(params);
@@ -153,15 +149,12 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
 
-    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
-        if (is_cpu()) {
-            return;
-        }
+    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const engine& e) override {
         _kernels.clear();
 
         _kernels.reserve(cached_kernel_ids.size());
         for (size_t k = 0; k < cached_kernel_ids.size(); ++k) {
-            _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k]));
+            _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k], e));
         }
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
@@ -198,7 +191,7 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     }
 
     void set_arguments_impl(typed_primitive_inst<PType>& instance) override {
-        if (instance.can_be_optimized() || is_cpu()) {
+        if (instance.can_be_optimized()) {
             return;
         }
 
@@ -301,15 +294,14 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     }
 
     void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
-        if (is_cpu())
-            return;
         OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed.");
         auto& kernel_vec = kernels.begin()->second;
+        auto& engine = kernels.begin()->first.get_program().get_engine();
         _kernels.clear();
         _kernels.resize(kernel_vec.size());
         for (auto& k : kernel_vec) {
             auto sub_kernel_idx = k.second;
-            _kernels[sub_kernel_idx] = k.first;
+            _kernels[sub_kernel_idx] = engine.prepare_kernel(k.first);
         }
     }
 
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp
index ce5701f2739cf2..08d254b65d7d62 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp
@@ -2,6 +2,8 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+
+#include "reorder_inst.h"
 #include "registry/implementation_manager.hpp"
 #include "intel_gpu/primitives/reorder.hpp"
 #include "program_node.h"
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp
index 1f49d6ac4545da..3e789b54ab9e7a 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp
@@ -132,10 +132,10 @@ struct PrimitiveImplOCL : public cldnn::primitive_impl {
         }
     }
 
-    void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
+    void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const engine& e) override {
         OPENVINO_ASSERT(cached_kernel_ids.size() == _order.size());
         for (size_t i = 0; i < cached_kernel_ids.size(); ++i) {
-            _stages[_order[i]]->kernel = kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[i]);
+            _stages[_order[i]]->kernel = kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[i], e);
         }
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
index 25e17dd000975c..3da50ef643911b 100644
--- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -78,7 +78,7 @@ struct primitive_impl {
     virtual bool is_cpu() const { return true; }
     virtual bool is_onednn() const { return false; }
     virtual void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) = 0;
-    virtual void init_by_cached_kernels(const kernels_cache&, std::vector<std::string>& cached_kernel_ids) {}
+    virtual void init_by_cached_kernels(const kernels_cache&, std::vector<std::string>& cached_kernel_ids, const engine& e) {}
     virtual std::vector<std::string> get_cached_kernel_ids(const kernels_cache&) { return {}; }
     virtual std::unique_ptr<primitive_impl> clone() const = 0;
     virtual std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() { return {}; }
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index 34dd4b1b392d5a..6466cc14aaff9a 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -1975,7 +1975,7 @@ void program::load(cldnn::BinaryInputBuffer& ib, std::shared_ptr<const ov::Model
 
             std::vector<std::string> cached_kernel_ids;
             ib >> cached_kernel_ids;
-            p_node.selected_impl->init_by_cached_kernels(get_kernels_cache(), cached_kernel_ids);
+            p_node.selected_impl->init_by_cached_kernels(get_kernels_cache(), cached_kernel_ids, _engine);
         }
     }
 
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp
index add6f7de92c313..373577e617f4c3 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp
@@ -109,7 +109,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_Ref::SetDefault(const conv
 }
 
 KernelsPriority ConvolutionKernel_Ref::GetKernelsPriority(const Params& /*params*/) const {
-    return DONT_USE_IF_HAVE_SOMETHING_ELSE;
+    return FORCE_PRIORITY_1;
 }
 
 bool ConvolutionKernel_Ref::Validate(const Params& params) const {
diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp
index 92f1a0a0d23410..c14e18aaf90e35 100644
--- a/src/plugins/intel_gpu/src/plugin/plugin.cpp
+++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp
@@ -160,12 +160,8 @@ Plugin::Plugin() {
     set_device_name("GPU");
     register_primitives();
 
-    // Set OCL runtime which should be always available
-#ifdef OV_GPU_WITH_SYCL
-    cldnn::device_query device_query(cldnn::engine_types::sycl, cldnn::runtime_types::ocl);
-#else
-    cldnn::device_query device_query(cldnn::engine_types::ocl, cldnn::runtime_types::ocl);
-#endif
+    auto rt_params = get_device_query_params();
+    cldnn::device_query device_query(rt_params.first, rt_params.second);
     m_device_map = device_query.get_available_devices();
 
     // Set default configs for each device
diff --git a/src/plugins/intel_gpu/src/plugin/remote_context.cpp b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
index 885ceb9bda4720..ac68496feb2d97 100644
--- a/src/plugins/intel_gpu/src/plugin/remote_context.cpp
+++ b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
@@ -65,12 +65,11 @@ RemoteContextImpl::RemoteContextImpl(const std::map<std::string, RemoteContextIm
         }
     }
 
-    const auto engine_type = cldnn::engine_types::ocl;
-    const auto runtime_type = cldnn::runtime_types::ocl;
+    auto rt_params = get_device_query_params();
     const auto initialize_devices = true;
 
     // Use actual runtime and engine types
-    cldnn::device_query device_query(engine_type, runtime_type, context_id, m_va_display, ctx_device_id, target_tile_id, initialize_devices);
+    cldnn::device_query device_query(rt_params.first, rt_params.second, context_id, m_va_display, ctx_device_id, target_tile_id, initialize_devices);
     auto device_map = device_query.get_available_devices();
 
     OPENVINO_ASSERT(device_map.size() == 1, "[GPU] Exactly one device expected in case of context sharing, but ", device_map.size(), " found");
@@ -103,6 +102,9 @@ void RemoteContextImpl::init_properties() {
         properties.insert(ov::intel_gpu::context_type(ov::intel_gpu::ContextType::VA_SHARED));
         properties.insert(ov::intel_gpu::va_device(m_va_display));
         break;
+    case ContextType::ZE:
+        properties.insert(ov::intel_gpu::context_type(ov::intel_gpu::ContextType::ZE));
+        break;
     default:
         OPENVINO_THROW("[GPU] Unsupported shared context type ", m_type);
     }
@@ -243,15 +245,10 @@ void RemoteContextImpl::initialize() {
     std::call_once(m_initialize_flag, [this]() {
         GPU_DEBUG_INFO << "Initialize RemoteContext for " << m_device_name << " (" << m_device->get_info().dev_name << ")" << std::endl;
 
-#ifdef OV_GPU_WITH_SYCL
-        const auto engine_type = cldnn::engine_types::sycl;
-#else
-        const auto engine_type = cldnn::engine_types::ocl;
-#endif
-        const auto runtime_type = cldnn::runtime_types::ocl;
+        auto rt_params = get_device_query_params();
 
         m_device->initialize();  // Initialize associated device before use
-        m_engine = cldnn::engine::create(engine_type, runtime_type, m_device);
+        m_engine = cldnn::engine::create(rt_params.first, rt_params.second, m_device);
 
         init_properties();
 
diff --git a/src/plugins/intel_gpu/src/runtime/CMakeLists.txt b/src/plugins/intel_gpu/src/runtime/CMakeLists.txt
index 85dfec05c41195..384ee9c0bef7a5 100644
--- a/src/plugins/intel_gpu/src/runtime/CMakeLists.txt
+++ b/src/plugins/intel_gpu/src/runtime/CMakeLists.txt
@@ -16,15 +16,21 @@ file(GLOB LIBRARY_SOURCES_OCL
     "${CMAKE_CURRENT_SOURCE_DIR}/ocl/*.cpp"
 )
 
-set(LIBRARY_SOURCES_ALL
-    ${LIBRARY_SOURCES_MAIN}
-    ${LIBRARY_SOURCES_OCL}
-  )
-
+file(GLOB LIBRARY_SOURCES_ZE
+    "${CMAKE_CURRENT_SOURCE_DIR}/ze/*.h"
+    "${CMAKE_CURRENT_SOURCE_DIR}/ze/*.hpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/ze/*.cpp"
+)
 file(GLOB_RECURSE SYCL_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/ocl/sycl_*.cpp")
 
 if(NOT OV_COMPILER_IS_INTEL_LLVM)
-    list(REMOVE_ITEM LIBRARY_SOURCES_ALL ${SYCL_SOURCES})
+    list(REMOVE_ITEM LIBRARY_SOURCES_OCL ${SYCL_SOURCES})
+endif()
+
+set(LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_MAIN} ${LIBRARY_SOURCES_OCL})
+
+if(GPU_RT_TYPE STREQUAL "L0")
+    list(APPEND LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_ZE})
 endif()
 
 add_library(${TARGET_NAME} STATIC ${LIBRARY_SOURCES_ALL})
@@ -60,6 +66,7 @@ if(ENABLE_ONEDNN_FOR_GPU)
 endif()
 
 ov_set_threading_interface_for(${TARGET_NAME})
+ov_gpu_set_runtime_interface_for(${TARGET_NAME})
 
 target_link_libraries(${TARGET_NAME} PRIVATE
     OpenCL::OpenCL
diff --git a/src/plugins/intel_gpu/src/runtime/device_query.cpp b/src/plugins/intel_gpu/src/runtime/device_query.cpp
index 6b6ce3787cf73b..8a0ee71944483c 100644
--- a/src/plugins/intel_gpu/src/runtime/device_query.cpp
+++ b/src/plugins/intel_gpu/src/runtime/device_query.cpp
@@ -4,9 +4,9 @@
 
 #include "intel_gpu/runtime/device_query.hpp"
 #include "ocl/ocl_device_detector.hpp"
+#include "ze/ze_device_detector.hpp"
 
 #include <map>
-#include <string>
 
 namespace cldnn {
 int device_query::device_id = -1;
@@ -17,17 +17,22 @@ device_query::device_query(engine_types engine_type,
                            int ctx_device_id,
                            int target_tile_id,
                            bool initialize_devices) {
-    switch (engine_type) {
-    case engine_types::sycl:
-    case engine_types::ocl: {
-        if (runtime_type != runtime_types::ocl)
-            throw std::runtime_error("Unsupported runtime type for ocl engine");
-
+    switch (runtime_type) {
+    case runtime_types::ocl: {
+        OPENVINO_ASSERT(engine_type == engine_types::ocl || engine_type == engine_types::sycl);
         ocl::ocl_device_detector ocl_detector;
         _available_devices = ocl_detector.get_available_devices(user_context, user_device, ctx_device_id, target_tile_id, initialize_devices);
         break;
     }
-    default: throw std::runtime_error("Unsupported engine type in device_query");
+#ifdef OV_GPU_WITH_ZE_RT
+    case runtime_types::ze: {
+        OPENVINO_ASSERT(engine_type == engine_types::ze);
+        ze::ze_device_detector ze_detector;
+        _available_devices = ze_detector.get_available_devices(user_context, user_device, ctx_device_id, target_tile_id, initialize_devices);
+        break;
+    }
+#endif
+    default: OPENVINO_THROW("[GPU] Unsupported engine/runtime types in device_query");
     }
 }
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp
index 3af715d2cccf69..3311a265de1a89 100644
--- a/src/plugins/intel_gpu/src/runtime/engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/engine.cpp
@@ -10,6 +10,7 @@
 #include "intel_gpu/runtime/debug_configuration.hpp"
 
 #include "ocl/ocl_engine_factory.hpp"
+#include "ze/ze_engine_factory.hpp"
 
 #include <string>
 #include <vector>
@@ -257,6 +258,11 @@ std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type, runtime_
     case engine_types::ocl:
         ret = ocl::create_ocl_engine(device, runtime_type);
         break;
+#ifdef OV_GPU_WITH_ZE_RT
+    case engine_types::ze:
+        ret = ze::create_ze_engine(device, runtime_type);
+        break;
+#endif
     default:
         throw std::runtime_error("Invalid engine type");
     }
diff --git a/src/plugins/intel_gpu/src/runtime/memory.cpp b/src/plugins/intel_gpu/src/runtime/memory.cpp
index f69a3124da7d6d..a720ee7ec4ea31 100644
--- a/src/plugins/intel_gpu/src/runtime/memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/memory.cpp
@@ -50,6 +50,8 @@ std::unique_ptr<surfaces_lock> surfaces_lock::create(engine_types engine_type, s
     case engine_types::sycl:
     case engine_types::ocl:
         return std::unique_ptr<ocl::ocl_surfaces_lock>(new ocl::ocl_surfaces_lock(mem, stream));
+    case engine_types::ze:
+        return nullptr; // TODO: implement once we have support for surface sharing
     default: throw std::runtime_error("Unsupported engine type in surfaces_lock::create");
     }
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
index 11424c180d4122..e17db33c52e77c 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
@@ -341,6 +341,13 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex
         info.num_ccs = std::max<uint32_t>(num_queues, info.num_ccs);
     }
 
+    info.supports_mutable_command_list = false;
+
+    // Not supported
+    info.timer_resolution = 0;
+    info.kernel_timestamp_valid_bits = 0;
+    info.compute_queue_group_ordinal = 0;
+    info.device_memory_ordinal = 0;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     using namespace dnnl::impl::gpu::intel::jit;
@@ -423,36 +430,7 @@ bool ocl_device::is_same(const device::ptr other) {
     // Short path if cl_device is the same
     if (_platform == casted->_platform && _device.get() && casted->_device.get() && _device == casted->_device)
         return true;
-
-    // Relying solely on the UUID is not reliable in all the cases (particularly on legacy platforms),
-    // where the UUID may be missing or incorrectly generated
-    // Therefore, we also validate other attributes
-    if (_info.uuid.uuid != casted->_info.uuid.uuid)
-        return false;
-
-    if (_info.pci_info != casted->_info.pci_info)
-        return false;
-
-    if (_info.sub_device_idx != casted->_info.sub_device_idx)
-        return false;
-
-    if (_info.vendor_id != casted->_info.vendor_id ||
-        _info.dev_name != casted->_info.dev_name ||
-        _info.driver_version != casted->_info.driver_version)
-        return false;
-
-    if (_info.dev_type != casted->_info.dev_type ||
-        _info.gfx_ver != casted->_info.gfx_ver ||
-        _info.arch != casted->_info.arch)
-        return false;
-
-    if (_info.ip_version != casted->_info.ip_version || _info.device_id != casted->_info.device_id)
-        return false;
-
-    if (_info.execution_units_count != casted->_info.execution_units_count || _info.max_global_mem_size != casted->_info.max_global_mem_size)
-        return false;
-
-    return true;
+    return _info.is_same_device(casted->_info);
 }
 
 void ocl_device::set_mem_caps(const memory_capabilities& memory_capabilities) {
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp
index 83826e0a5d5e6f..c102beb8092647 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp
@@ -69,20 +69,6 @@ bool does_device_match_config(const cl::Device& device) {
     return true;
 }
 
-// The priority return by this function impacts the order of devices reported by GPU plugin and devices enumeration
-// Lower priority value means lower device ID
-// Current behavior is: Intel iGPU < Intel dGPU < any other GPU
-// Order of Intel dGPUs is undefined and depends on the OCL impl
-// Order of other vendor GPUs is undefined and depends on the OCL impl
-size_t get_device_priority(const cldnn::device_info& info) {
-    if (info.vendor_id == cldnn::INTEL_VENDOR_ID && info.dev_type == cldnn::device_type::integrated_gpu) {
-        return 0;
-    } else if (info.vendor_id == cldnn::INTEL_VENDOR_ID) {
-        return 1;
-    } else {
-        return std::numeric_limits<size_t>::max();
-    }
-}
 }  // namespace
 
 namespace cldnn {
@@ -133,15 +119,6 @@ static std::vector<cl::Device> getSubDevices(cl::Device& rootDevice) {
     return subDevices;
 }
 
-std::vector<device::ptr> ocl_device_detector::sort_devices(const std::vector<device::ptr>& devices_list) {
-    std::vector<device::ptr> sorted_list = devices_list;
-    std::stable_sort(sorted_list.begin(), sorted_list.end(), [](device::ptr d1,  device::ptr d2) {
-        return get_device_priority(d1->get_info()) < get_device_priority(d2->get_info());
-    });
-
-    return sorted_list;
-}
-
 std::map<std::string, device::ptr> ocl_device_detector::get_available_devices(void* user_context,
                                                                               void* user_device,
                                                                               int ctx_device_id,
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp
index 121fc025098b08..0992d734bf512b 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp
@@ -25,8 +25,6 @@ class ocl_device_detector {
                                                              int target_tile_id = -1,
                                                              bool initialize_devices = false) const;
 
-    static std::vector<device::ptr> sort_devices(const std::vector<device::ptr>& devices_list);
-
 private:
     std::vector<device::ptr> create_device_list() const;
     std::vector<device::ptr> create_device_list_from_user_context(void* user_context, int ctx_device_id = 0) const;
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp
new file mode 100644
index 00000000000000..417a0286df7252
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp
@@ -0,0 +1,42 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ocl_kernel.hpp"
+#include <vector>
+
+namespace cldnn {
+namespace ocl {
+
+std::vector<uint8_t> ocl_kernel::get_binary() const {
+    // Get the corresponding program object for the kernel
+    cl_program program;
+    cl_int error = clGetKernelInfo(_compiled_kernel.get(), CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr);
+    if (error) {
+        throw std::runtime_error("Failed to retrieve CL_KERNEL_PROGRAM: " + std::to_string(error));
+    }
+
+    // Get the size of the program binary in bytes.
+    size_t binary_size = 0;
+    error = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, nullptr);
+    if (error) {
+        throw std::runtime_error("Failed to retrieve CL_PROGRAM_BINARY_SIZES: " + std::to_string(error));
+    }
+
+    // Binary is not available for the device.
+    if (binary_size == 0)
+        throw std::runtime_error("get_binary: Binary size is zero");
+
+    // Get program binary.
+    std::vector<uint8_t> binary(binary_size);
+    uint8_t* binary_buffer = binary.data();
+    error = clGetProgramInfo(program, CL_PROGRAM_BINARIES, binary_size, &binary_buffer, nullptr);
+    if (error) {
+        throw std::runtime_error("Failed to retrieve CL_PROGRAM_BINARIES: " + std::to_string(error));
+    }
+
+    return binary;
+}
+
+}  // namespace ocl
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp
index 206db55057cf5e..035182f664df4d 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp
@@ -33,6 +33,8 @@ class ocl_kernel : public kernel {
 
         return std::make_shared<ocl_kernel>(get_handle().clone(), _kernel_id);
     }
+
+    std::vector<uint8_t> get_binary() const override;
 };
 
 }  // namespace ocl
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp
new file mode 100644
index 00000000000000..51c69202678fba
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp
@@ -0,0 +1,25 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "intel_gpu/runtime/event.hpp"
+#include <ze_api.h>
+
+namespace cldnn {
+namespace ze {
+
+struct ze_base_event : public event {
+public:
+    explicit ze_base_event(uint64_t queue_stamp = 0) : event(), _queue_stamp(queue_stamp) { }
+    uint64_t get_queue_stamp() const { return _queue_stamp; }
+    void set_queue_stamp(uint64_t val) { _queue_stamp = val; }
+    virtual ze_event_handle_t get() = 0;
+
+protected:
+    uint64_t _queue_stamp = 0;
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp
new file mode 100644
index 00000000000000..fddcda042d7fa0
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_common.hpp"
+#include "openvino/core/except.hpp"
+
+#if defined(__linux__)
+#include <dlfcn.h>
+#elif defined(_WIN32)
+#include "windows.h"
+#else
+#error "Level Zero is supported on Linux and Windows only"
+#endif
+
+namespace cldnn {
+namespace ze {
+
+void *find_ze_symbol(const char *symbol) {
+#if defined(__linux__)
+    void *handle = dlopen("libze_loader.so.1", RTLD_NOW | RTLD_LOCAL);
+#elif defined(_WIN32)
+    HMODULE handle = LoadLibraryA("ze_loader.dll");
+#endif
+    if (!handle) {
+        return nullptr;
+    }
+
+#if defined(__linux__)
+    void *f = dlsym(handle, symbol);
+#elif defined(_WIN32)
+    void *f = GetProcAddress(handle, symbol);
+#endif
+    OPENVINO_ASSERT(f != nullptr);
+    return f;
+}
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
new file mode 100644
index 00000000000000..562167458288f3
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#pragma once
+
+#include <ze_api.h>
+
+#include <limits>
+
+#define ZE_CHECK(f) \
+    do { \
+        ze_result_t res_ = (f); \
+        if (res_ != ZE_RESULT_SUCCESS) { \
+            throw std::runtime_error(#f " command failed with code " + std::to_string(res_)); \
+        } \
+    } while (false)
+
+
+namespace cldnn {
+namespace ze {
+
+static constexpr uint64_t default_timeout = std::numeric_limits<uint64_t>::max();
+
+void* find_ze_symbol(const char *symbol);
+
+template <typename F>
+F find_ze_symbol(const char *symbol) {
+    return (F)find_ze_symbol(symbol);
+}
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
new file mode 100644
index 00000000000000..c4fa62668675fe
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -0,0 +1,273 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_device.hpp"
+#include "ze_common.hpp"
+
+#include <ze_api.h>
+#include <vector>
+#include <algorithm>
+#include <cassert>
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <SetupAPI.h>
+#include <devguid.h>
+#include <cstring>
+#else
+#include <unistd.h>
+#include <limits.h>
+#include <link.h>
+#include <dlfcn.h>
+#endif
+
+namespace cldnn {
+namespace ze {
+
+namespace {
+
+bool supports_extension(const std::vector<ze_driver_extension_properties_t>& extensions, const std::string& ext_name, uint32_t ext_ver) {
+    return std::find_if(extensions.begin(), extensions.end(), [&ext_name, &ext_ver](const ze_driver_extension_properties_t& ep) {
+        return std::string(ep.name) == ext_name && ep.version == ext_ver;
+    }) != extensions.end();
+}
+
+device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t device) {
+    device_info info;
+
+    uint32_t num_ext = 0;
+    ZE_CHECK(zeDriverGetExtensionProperties(driver, &num_ext, nullptr));
+
+    std::vector<ze_driver_extension_properties_t> extensions(num_ext);
+    ZE_CHECK(zeDriverGetExtensionProperties(driver, &num_ext, &extensions[0]));
+
+    ze_driver_properties_t driver_properties{ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES};
+    ZE_CHECK(zeDriverGetProperties(driver, &driver_properties));
+
+    bool supports_luid = supports_extension(extensions, ZE_DEVICE_LUID_EXT_NAME, ZE_DEVICE_LUID_EXT_VERSION_1_0);
+    bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0);
+    bool supports_mutable_list = supports_extension(extensions, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, ZE_MUTABLE_COMMAND_LIST_EXP_VERSION_1_0);
+    bool supports_pci_properties = supports_extension(extensions, ZE_PCI_PROPERTIES_EXT_NAME, ZE_PCI_PROPERTIES_EXT_VERSION_1_0);
+
+    ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, nullptr, 0};
+    ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, supports_ip_version ? &ip_version_properties : nullptr};
+    ZE_CHECK(zeDeviceGetProperties(device, &device_properties));
+
+    ze_device_compute_properties_t device_compute_properties{ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES};
+    ZE_CHECK(zeDeviceGetComputeProperties(device, &device_compute_properties));
+
+    uint32_t queue_properties_count = 0;
+    ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, nullptr));
+
+    std::vector<ze_command_queue_group_properties_t> queue_properties(queue_properties_count);
+    for (auto& mp : queue_properties) {
+        mp.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
+    }
+
+    ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, &queue_properties[0]));
+
+    auto compute_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) {
+        return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == true;
+    });
+
+    OPENVINO_ASSERT(compute_queue_props != queue_properties.end());
+
+    uint32_t memory_properties_count = 0;
+    ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr));
+
+    std::vector<ze_device_memory_properties_t> device_memory_properties(memory_properties_count);
+    for (auto& mp : device_memory_properties) {
+        mp.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
+    }
+    ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, &device_memory_properties[0]));
+
+    ze_device_memory_access_properties_t device_memory_access_properties{ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES};
+    ZE_CHECK(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties));
+
+    auto mem_properties = std::find_if(device_memory_properties.begin(), device_memory_properties.end(), [](const ze_device_memory_properties_t& p) {
+        auto name = std::string(p.name);
+        return name == "DDR" || name == "HBM";
+    });
+
+    ze_device_module_properties_t device_module_properties{ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES};
+    ZE_CHECK(zeDeviceGetModuleProperties(device, &device_module_properties));
+
+    ze_device_image_properties_t device_image_properties{ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES};
+    ZE_CHECK(zeDeviceGetImageProperties(device, &device_image_properties));
+
+    info.vendor_id = device_properties.vendorId;
+    info.dev_name = device_properties.name;
+    info.driver_version = std::to_string(driver_properties.driverVersion);
+    info.dev_type = (device_properties.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) ? device_type::integrated_gpu : device_type::discrete_gpu;
+
+    info.execution_units_count = device_properties.numEUsPerSubslice * device_properties.numSubslicesPerSlice * device_properties.numSlices;
+
+    info.gpu_frequency = device_properties.coreClockRate;
+
+    info.supported_simd_sizes = {};
+    info.has_separate_cache = true;
+
+    info.max_work_group_size = device_compute_properties.maxTotalGroupSize;
+    info.max_local_mem_size = device_compute_properties.maxSharedLocalMemory;
+
+    if (mem_properties != device_memory_properties.end()) {
+        info.max_global_mem_size = mem_properties->totalSize;
+        info.device_memory_ordinal = std::distance(device_memory_properties.begin(), mem_properties);
+    } else {
+        info.max_global_mem_size = 0;
+        info.device_memory_ordinal = 0;
+    }
+
+    info.max_alloc_mem_size = device_properties.maxMemAllocSize;
+
+    info.supports_image = device_image_properties.maxSamplers > 0;
+    info.supports_intel_planar_yuv = false;
+    info.max_image2d_width = device_image_properties.maxImageDims2D;
+    info.max_image2d_height = device_image_properties.maxImageDims2D;
+
+    info.supports_fp16 = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_FP16) != 0;
+    info.supports_fp64 = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_FP64) != 0;
+    info.supports_fp16_denorms = info.supports_fp16 && (device_module_properties.fp16flags & ZE_DEVICE_FP_FLAG_DENORM) != 0;
+
+    info.supports_khr_subgroups = true;
+    info.supports_intel_subgroups = true;
+    info.supports_intel_subgroups_short = true;
+    info.supports_intel_subgroups_char = true;
+    info.supports_intel_required_subgroup_size = true;
+
+    info.supports_imad = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_DP4A) != 0;
+    info.supports_immad = false; // FIXME
+
+    info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities;
+
+    info.gfx_ver = {0, 0, 0}; // could find how to retrieve this from L0 so far
+    info.arch = gpu_arch::unknown;
+    info.ip_version = ip_version_properties.ipVersion;
+    info.sub_device_idx = (std::numeric_limits<uint32_t>::max)();
+
+    info.device_id = device_properties.deviceId;
+    info.num_slices = device_properties.numSlices;
+    info.num_sub_slices_per_slice = device_properties.numSubslicesPerSlice;
+    info.num_eus_per_sub_slice = device_properties.numEUsPerSubslice;
+    info.num_threads_per_eu = device_properties.numThreadsPerEU;
+
+    info.num_ccs = compute_queue_props->numQueues;
+    info.supports_queue_families = true;
+
+    info.kernel_timestamp_valid_bits  = device_properties.kernelTimestampValidBits;
+    info.timer_resolution  = device_properties.timerResolution;
+    info.compute_queue_group_ordinal = std::distance(queue_properties.begin(), compute_queue_props);
+
+    static_assert(ZE_MAX_DEVICE_UUID_SIZE == ov::device::UUID::MAX_UUID_SIZE, "");
+    static_assert(ZE_MAX_DEVICE_LUID_SIZE_EXT == ov::device::LUID::MAX_LUID_SIZE, "");
+    std::copy_n(&device_properties.uuid.id[0], ZE_MAX_DEVICE_UUID_SIZE, info.uuid.uuid.begin());
+
+    if (supports_luid) {
+        ze_device_luid_ext_properties_t luid_props{ZE_STRUCTURE_TYPE_DEVICE_LUID_EXT_PROPERTIES, nullptr};
+        ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, &luid_props};
+        if (zeDeviceGetProperties(device, &device_properties) == ZE_RESULT_SUCCESS)
+            std::copy_n(&luid_props.luid.id[0], ZE_MAX_DEVICE_LUID_SIZE_EXT, info.luid.luid.begin());
+    }
+
+    info.supports_mutable_command_list = false;
+
+    if (supports_mutable_list) {
+        ze_mutable_command_list_exp_properties_t mutable_list_props = { ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_LIST_EXP_PROPERTIES,  nullptr, 0, 0 };
+        ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, &mutable_list_props};
+        if (zeDeviceGetProperties(device, &device_properties) == ZE_RESULT_SUCCESS) {
+            ze_mutable_command_exp_flags_t required_features = ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION |
+                                                               ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS |
+                                                               ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT |
+                                                               ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE |
+                                                               ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET |
+                                                               ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT |
+                                                               ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS;
+
+            info.supports_mutable_command_list = (mutable_list_props.mutableCommandFlags & required_features) == required_features;
+        }
+    }
+    if (supports_pci_properties) {
+        ze_pci_ext_properties_t pci_properties{ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES, nullptr};
+        if (zeDevicePciGetPropertiesExt(device, &pci_properties) == ZE_RESULT_SUCCESS) {
+            info.pci_info.pci_bus = pci_properties.address.bus;
+            info.pci_info.pci_device = pci_properties.address.device;
+            info.pci_info.pci_domain = pci_properties.address.domain;
+            info.pci_info.pci_function = pci_properties.address.function;
+        }
+    }
+
+    return info;
+}
+
+memory_capabilities init_memory_caps(ze_device_handle_t device, const device_info& info) {
+    std::vector<allocation_type> memory_caps;
+
+    ze_device_memory_access_properties_t device_memory_access_properties{ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES};
+    ZE_CHECK(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties));
+
+    if (info.supports_usm) {
+        if (device_memory_access_properties.hostAllocCapabilities) {
+            memory_caps.push_back(allocation_type::usm_host);
+        }
+        if (device_memory_access_properties.sharedSingleDeviceAllocCapabilities) {
+            memory_caps.push_back(allocation_type::usm_shared);
+        }
+        if (device_memory_access_properties.deviceAllocCapabilities) {
+            memory_caps.push_back(allocation_type::usm_device);
+        }
+    }
+
+    return memory_capabilities(memory_caps);
+}
+
+}  // namespace
+
+
+ze_device::ze_device(ze_driver_handle_t driver, ze_device_handle_t device, bool initialize)
+: _driver(driver)
+, _device(device)
+, _info(init_device_info(driver, device))
+, _mem_caps(init_memory_caps(device, _info)) {
+    if (initialize) {
+        this->initialize();
+    }
+}
+
+void ze_device::initialize() {
+    if (_is_initialized)
+        return;
+
+    ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 };
+    ZE_CHECK(zeContextCreate(_driver, &context_desc, &_context));
+    _is_initialized = true;
+}
+
+bool ze_device::is_initialized() const {
+    return _is_initialized;
+}
+
+bool ze_device::is_same(const device::ptr other) {
+    auto casted = downcast<ze_device>(other.get());
+    if (!casted)
+        return false;
+
+    if (is_initialized() && casted->is_initialized()) {
+        // Do not compare contexts as one driver can have many different contexts
+        return _device == casted->get_device() && _driver == casted->get_driver();
+    }
+    return _info.is_same_device(casted->_info);
+}
+
+void ze_device::set_mem_caps(const memory_capabilities& memory_capabilities) {
+    _mem_caps = memory_capabilities;
+}
+
+ze_device::~ze_device() {
+    if (_is_initialized)
+        zeContextDestroy(_context);
+}
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp
new file mode 100644
index 00000000000000..1a20685ed2cc77
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "intel_gpu/runtime/device.hpp"
+#include <ze_api.h>
+
+namespace cldnn {
+namespace ze {
+
+struct ze_device : public device {
+public:
+    ze_device(ze_driver_handle_t driver, ze_device_handle_t device, bool initialize = true);
+
+    const device_info& get_info() const override { return _info; }
+    memory_capabilities get_mem_caps() const override { return _mem_caps; }
+
+    void initialize() override;
+    bool is_initialized() const override;
+
+    const ze_driver_handle_t get_driver() const { return _driver; }
+    const ze_device_handle_t get_device() const { return _device; }
+    const ze_context_handle_t get_context() const { return _context; }
+
+    bool is_same(const device::ptr other) override;
+    void set_mem_caps(const memory_capabilities& memory_capabilities) override;
+
+    ~ze_device();
+
+private:
+    ze_driver_handle_t _driver = nullptr;
+    ze_device_handle_t _device = nullptr;
+    ze_context_handle_t _context = nullptr;
+    bool _is_initialized = false;
+
+    device_info _info;
+    memory_capabilities _mem_caps;
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp
new file mode 100644
index 00000000000000..4fede32a322a73
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp
@@ -0,0 +1,118 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_device_detector.hpp"
+#include "ze_device.hpp"
+#include "ze_common.hpp"
+#include <ze_api.h>
+#include "intel_gpu/runtime/debug_configuration.hpp"
+#include "openvino/core/except.hpp"
+
+#include <vector>
+
+namespace cldnn {
+namespace ze {
+
+static std::vector<ze_device_handle_t> get_sub_devices(ze_device_handle_t root_device) {
+    uint32_t n_subdevices = 0;
+    ZE_CHECK(zeDeviceGetSubDevices(root_device, &n_subdevices, nullptr));
+    if (n_subdevices == 0)
+        return {};
+
+    std::vector<ze_device_handle_t> subdevices(n_subdevices);
+
+    ZE_CHECK(zeDeviceGetSubDevices(root_device, &n_subdevices, &subdevices[0]));
+
+    return subdevices;
+}
+
+std::map<std::string, device::ptr> ze_device_detector::get_available_devices(void* user_context,
+                                                                             void* user_device,
+                                                                             int ctx_device_id,
+                                                                             int target_tile_id,
+                                                                             bool initialize_devices) const {
+    std::vector<device::ptr> devices_list;
+    if (user_context != nullptr) {
+        devices_list = create_device_list_from_user_context(user_context, ctx_device_id);
+    } else if (user_device != nullptr) {
+        devices_list = create_device_list_from_user_device(user_device);
+    } else {
+        devices_list = create_device_list(initialize_devices);
+    }
+
+    devices_list = sort_devices(devices_list);
+
+    std::map<std::string, device::ptr> ret;
+    uint32_t idx = 0;
+    for (auto& dptr : devices_list) {
+        auto map_id = std::to_string(idx++);
+        ret[map_id] = dptr;
+
+        auto root_device = std::dynamic_pointer_cast<ze_device>(dptr);
+        OPENVINO_ASSERT(root_device != nullptr, "[GPU] Invalid device type created in ocl_device_detector");
+
+        auto sub_devices = get_sub_devices(root_device->get_device());
+        if (!sub_devices.empty()) {
+            uint32_t sub_idx = 0;
+            for (auto& sub_device : sub_devices) {
+                if (target_tile_id != -1 && static_cast<int>(sub_idx) != target_tile_id) {
+                    sub_idx++;
+                    continue;
+                }
+                auto sub_device_ptr = std::make_shared<ze_device>(root_device->get_driver(), sub_device, initialize_devices);
+                ret[map_id + "." + std::to_string(sub_idx++)] = sub_device_ptr;
+            }
+        }
+    }
+
+    return ret;
+}
+
+std::vector<device::ptr> ze_device_detector::create_device_list(bool initialize_devices) const {
+    std::vector<device::ptr> ret;
+
+    ZE_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY));
+
+    uint32_t driver_count = 0;
+    ZE_CHECK(zeDriverGet(&driver_count, nullptr));
+
+    std::vector<ze_driver_handle_t> all_drivers(driver_count);
+    ZE_CHECK(zeDriverGet(&driver_count, &all_drivers[0]));
+
+    for (uint32_t i = 0; i < driver_count; ++i) {
+        uint32_t device_count = 0;
+        ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, nullptr));
+
+        std::vector<ze_device_handle_t> all_devices(device_count);
+        ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, &all_devices[0]));
+
+        for (uint32_t d = 0; d < device_count; ++d) {
+            try {
+                ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
+                ZE_CHECK(zeDeviceGetProperties(all_devices[d], &device_properties));
+
+                if (ZE_DEVICE_TYPE_GPU == device_properties.type) {
+                    ret.emplace_back(std::make_shared<ze_device>(all_drivers[i], all_devices[d], initialize_devices));
+                }
+            } catch (std::exception& ex) {
+                GPU_DEBUG_LOG << "Devices query/creation failed for driver " << i << ex.what() << std::endl;
+                GPU_DEBUG_LOG << "Platform is skipped" << std::endl;
+                continue;
+            }
+        }
+    }
+
+    return ret;
+}
+
+std::vector<device::ptr> ze_device_detector::create_device_list_from_user_context(void* user_context, int ctx_device_id) const {
+    OPENVINO_NOT_IMPLEMENTED;
+}
+
+std::vector<device::ptr> ze_device_detector::create_device_list_from_user_device(void* user_device) const {
+    OPENVINO_NOT_IMPLEMENTED;
+}
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp
new file mode 100644
index 00000000000000..deeefb36234a59
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "intel_gpu/runtime/device.hpp"
+
+#include <string>
+#include <vector>
+#include <map>
+
+namespace cldnn {
+namespace ze {
+
+class ze_device_detector {
+public:
+    ze_device_detector() = default;
+
+    std::map<std::string, device::ptr> get_available_devices(void* user_context,
+                                                             void* user_device,
+                                                             int ctx_device_id,
+                                                             int target_tile_id,
+                                                             bool initialize_devices = false) const;
+private:
+    std::vector<device::ptr> create_device_list(bool initialize_devices) const;
+    std::vector<device::ptr> create_device_list_from_user_context(void* user_context, int ctx_device_id = 0) const;
+    std::vector<device::ptr> create_device_list_from_user_device(void* user_device) const;
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
new file mode 100644
index 00000000000000..a493ac23005ab0
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -0,0 +1,266 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_engine.hpp"
+#include "intel_gpu/runtime/utils.hpp"
+#include "openvino/core/except.hpp"
+#include "ze/ze_kernel.hpp"
+#include "ze_api.h"
+#include "ze_engine_factory.hpp"
+#include "ze_common.hpp"
+#include "ze_memory.hpp"
+#include "ze_stream.hpp"
+#include "ze_device.hpp"
+#include <exception>
+#include <vector>
+#include <memory>
+#include <stdexcept>
+
+
+namespace cldnn {
+namespace ze {
+
+namespace {
+
+void func_zeModuleCreate(ze_context_handle_t hContext,
+                         ze_device_handle_t hDevice, const ze_module_desc_t *desc,
+                         ze_module_handle_t *phModule,
+                         ze_module_build_log_handle_t *phBuildLog) {
+    static auto f = find_ze_symbol<decltype(&zeModuleCreate)>("zeModuleCreate");
+
+    if (!f)
+        throw std::runtime_error("zeModuleCreate was not found");
+    ZE_CHECK(f(hContext, hDevice, desc, phModule, phBuildLog));
+}
+
+ze_module_handle_t ze_create_module_with_level_zero(const cldnn::ze::ze_engine& engine, std::vector<uint8_t> binary) {
+    auto desc = ze_module_desc_t();
+    desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
+    desc.format = ZE_MODULE_FORMAT_NATIVE;
+    desc.inputSize = binary.size();
+    desc.pInputModule = binary.data();
+    desc.pBuildFlags = "";
+    desc.pConstants = nullptr;
+
+    ze_module_handle_t ze_module;
+
+    auto ze_device = engine.get_device();
+    auto ze_ctx = engine.get_context();
+    func_zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr);
+    return ze_module;
+}
+
+}  // namespace
+
+ze_engine::ze_engine(const device::ptr dev, runtime_types runtime_type)
+    : engine(dev) {
+    OPENVINO_ASSERT(runtime_type == runtime_types::ze, "[GPU] Invalid runtime type specified for ZE engine. Only ZE runtime is supported");
+
+    auto casted = dynamic_cast<ze_device*>(dev.get());
+    OPENVINO_ASSERT(casted, "[GPU] Invalid device type passed to ze engine");
+
+    _service_stream.reset(new ze_stream(*this, ExecutionConfig()));
+}
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+void ze_engine::create_onednn_engine(const ExecutionConfig& config) {
+    OPENVINO_NOT_IMPLEMENTED;
+}
+
+dnnl::engine& ze_engine::get_onednn_engine() const {
+    OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. Please check that create_onednn_engine() was called");
+    return *_onednn_engine;
+}
+#endif
+
+const ze_driver_handle_t ze_engine::get_driver() const {
+    auto casted = std::dynamic_pointer_cast<ze_device>(_device);
+    OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine");
+    return casted->get_driver();
+}
+
+const ze_context_handle_t ze_engine::get_context() const {
+    auto casted = std::dynamic_pointer_cast<ze_device>(_device);
+    OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine");
+    return casted->get_context();
+}
+
+const ze_device_handle_t ze_engine::get_device() const {
+    auto casted = std::dynamic_pointer_cast<ze_device>(_device);
+    OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine");
+    return casted->get_device();
+}
+
+allocation_type ze_engine::detect_usm_allocation_type(const void* memory) const {
+    return ze::gpu_usm::detect_allocation_type(this, memory);
+}
+
+bool ze_engine::check_allocatable(const layout& layout, allocation_type type) {
+    OPENVINO_ASSERT(supports_allocation(type), "[GPU] Unsupported allocation type: ", type);
+
+    bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size);
+
+    // When dynamic shape upper bound makes bigger buffer, then return false.
+    if (exceed_allocatable_mem_size && layout.is_dynamic()) {
+        OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
+        return false;
+    }
+
+    OPENVINO_ASSERT(!exceed_allocatable_mem_size,
+                    "[GPU] Exceeded max size of memory object allocation: ",
+                    "requested ", layout.bytes_count(), " bytes, "
+                    "but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.",
+                    "Please try to reduce batch size or use lower precision.");
+
+    auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host);
+    auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size());
+
+    // When dynamic shape upper bound makes bigger buffer, then return false.
+    if (exceed_available_mem_size && layout.is_dynamic()) {
+        OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
+        return false;
+    }
+
+#ifdef __unix__
+    // Prevent from being killed by Ooo Killer of Linux
+    OPENVINO_ASSERT(!exceed_available_mem_size,
+                    "[GPU] Exceeded max size of memory allocation: ",
+                    "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ",
+                    "but available memory size is ", get_max_memory_size(), " bytes");
+#else
+    if (exceed_available_mem_size) {
+        GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : "
+                       << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl;
+        GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl;
+        return false;
+    }
+#endif
+
+    return true;
+}
+
+memory::ptr ze_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) {
+    OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout");
+
+    check_allocatable(layout, type);
+
+    try {
+        memory::ptr res = std::make_shared<ze::gpu_usm>(this, layout, type);
+
+        if (reset || res->is_memory_reset_needed(layout)) {
+            auto ev = res->fill(get_service_stream());
+            if (ev) {
+                get_service_stream().wait_for_events({ev});
+            }
+        }
+
+        return res;
+    } catch (const std::exception& e) {
+        OPENVINO_THROW("[GPU] Failed to allocate memory: ", e.what());
+    }
+}
+
+memory::ptr ze_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) {
+    OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine");
+    OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(),
+                    "[GPU] trying to reinterpret between image and non-image layouts. Current: ",
+                    memory.get_layout().format.to_string(), " Target: ", new_layout.format.to_string());
+
+    if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
+            return std::make_shared<ze::gpu_usm>(this,
+                                     new_layout,
+                                     reinterpret_cast<const ze::gpu_usm&>(memory).get_buffer(),
+                                     memory.get_allocation_type(),
+                                     memory.get_mem_tracker());
+    }
+
+    return nullptr;
+}
+
+memory::ptr ze_engine::reinterpret_handle(const layout& new_layout, shared_mem_params params) {
+    if (params.mem_type == shared_mem_type::shared_mem_usm) {
+        ze::UsmMemory usm_buffer(get_context(), get_device(), params.mem);
+        size_t actual_mem_size = 0;
+        zeMemGetAddressRange(get_context(), params.mem, nullptr, &actual_mem_size);
+        auto requested_mem_size = new_layout.bytes_count();
+        OPENVINO_ASSERT(actual_mem_size >= requested_mem_size,
+                            "[GPU] shared USM buffer has smaller size (", actual_mem_size,
+                            ") than specified layout (", requested_mem_size, ")");
+        return std::make_shared<ze::gpu_usm>(this, new_layout, usm_buffer, nullptr);
+    } else {
+        return nullptr;
+    }
+}
+
+memory_ptr ze_engine::create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) {
+    OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] Trying to create a subbuffer from a buffer allocated by a different engine");
+    if (new_layout.format.is_image_2d()) {
+        OPENVINO_NOT_IMPLEMENTED;
+    }
+    if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
+        auto& new_buf = reinterpret_cast<const ze::gpu_usm&>(memory);
+        auto ptr = new_buf.get_buffer().get();
+        auto sub_buffer = ze::UsmMemory(get_context(), get_device(), ptr, byte_offset);
+        return std::make_shared<ze::gpu_usm>(this,
+                                 new_layout,
+                                 sub_buffer,
+                                 memory.get_allocation_type(),
+                                 memory.get_mem_tracker());
+    } else {
+        OPENVINO_THROW("[GPU] Trying to create subbuffer for non usm memory");
+    }
+}
+
+bool ze_engine::is_the_same_buffer(const memory& mem1, const memory& mem2) {
+    if (mem1.get_engine() != this || mem2.get_engine() != this)
+        return false;
+    if (mem1.get_allocation_type() != mem2.get_allocation_type())
+        return false;
+    if (&mem1 == &mem2)
+        return true;
+
+    return (reinterpret_cast<const ze::gpu_usm&>(mem1).get_buffer().get() == reinterpret_cast<const ze::gpu_usm&>(mem2).get_buffer().get());
+}
+
+kernel::ptr ze_engine::prepare_kernel(const kernel::ptr kernel) const {
+    if (std::dynamic_pointer_cast<const ze_kernel>(kernel)) {
+        return kernel;
+    } else {
+        auto binary = kernel->get_binary();
+        ze_module_handle_t ze_module = ze_create_module_with_level_zero(*this, binary);
+        ze_kernel_handle_t ze_kernel;
+        auto entry_point = kernel->get_id();
+        ze_kernel_desc_t desc = {ZE_STRUCTURE_TYPE_KERNEL_DESC , nullptr, 0, entry_point.c_str()};
+        zeKernelCreate(ze_module, &desc, &ze_kernel);
+        return std::make_shared<cldnn::ze::ze_kernel>(ze_kernel, ze_module, entry_point);
+    }
+}
+
+void* ze_engine::get_user_context() const {
+    auto& casted = downcast<ze_device>(*_device);
+    return static_cast<void*>(casted.get_driver());
+}
+
+stream::ptr ze_engine::create_stream(const ExecutionConfig& config) const {
+    return std::make_shared<ze_stream>(*this, config);
+}
+
+stream::ptr ze_engine::create_stream(const ExecutionConfig& config, void* handle) const {
+    OPENVINO_NOT_IMPLEMENTED;
+}
+
+stream& ze_engine::get_service_stream() const {
+    return *_service_stream;
+}
+
+std::shared_ptr<cldnn::engine> ze_engine::create(const device::ptr device, runtime_types runtime_type) {
+    return std::make_shared<ze::ze_engine>(device, runtime_type);
+}
+
+std::shared_ptr<cldnn::engine> create_ze_engine(const device::ptr device, runtime_types runtime_type) {
+    return ze_engine::create(device, runtime_type);
+}
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
new file mode 100644
index 00000000000000..b75d2ae0ca67eb
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
@@ -0,0 +1,64 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ze_api.h>
+#include "intel_gpu/runtime/memory.hpp"
+#include "intel_gpu/runtime/engine.hpp"
+#include "intel_gpu/runtime/stream.hpp"
+#include "intel_gpu/runtime/device.hpp"
+
+#include <memory>
+
+namespace cldnn {
+namespace ze {
+
+class ze_engine : public engine {
+public:
+    ze_engine(const device::ptr dev, runtime_types runtime_type);
+    engine_types type() const override { return engine_types::ze; };
+    runtime_types runtime_type() const override { return runtime_types::ze; };
+
+    memory_ptr allocate_memory(const layout& layout, allocation_type type, bool reset = true) override;
+    memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override;
+    memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) override;
+    memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override;
+    bool is_the_same_buffer(const memory& mem1, const memory& mem2) override;
+    bool check_allocatable(const layout& layout, allocation_type type) override;
+
+    void* get_user_context() const override;
+
+    allocation_type get_default_allocation_type() const override { return allocation_type::usm_device; }
+    allocation_type detect_usm_allocation_type(const void* memory) const override;
+
+    const ze_context_handle_t get_context() const;
+    const ze_driver_handle_t get_driver() const;
+    const ze_device_handle_t get_device() const;
+
+    stream_ptr create_stream(const ExecutionConfig& config) const override;
+    stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override;
+    stream& get_service_stream() const override;
+
+    kernel::ptr prepare_kernel(const kernel::ptr kernel) const override;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    void create_onednn_engine(const ExecutionConfig& config) override;
+    // Returns onednn engine object which shares device and context with current engine
+    dnnl::engine& get_onednn_engine() const override;
+#endif
+
+    static std::shared_ptr<cldnn::engine> create(const device::ptr device, runtime_types runtime_type);
+
+private:
+    std::unique_ptr<stream> _service_stream;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    std::mutex onednn_mutex;
+    std::shared_ptr<dnnl::engine> _onednn_engine;
+#endif
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp
new file mode 100644
index 00000000000000..40c944a1ca7512
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp
@@ -0,0 +1,18 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "intel_gpu/runtime/device.hpp"
+#include "intel_gpu/runtime/engine.hpp"
+
+namespace cldnn {
+namespace ze {
+
+// Factory for ze_engine creation. It's moved outside of ze_engine class to avoid possible CL includes conflict
+// between different engines in engine.cpp file
+std::shared_ptr<cldnn::engine> create_ze_engine(const device::ptr device, runtime_types runtime_type);
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
new file mode 100644
index 00000000000000..c5e39be435798f
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
@@ -0,0 +1,216 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_event.hpp"
+#include "ze/ze_common.hpp"
+
+#include <cassert>
+#include <chrono>
+#include <list>
+
+using namespace cldnn;
+using namespace ze;
+
+namespace {
+
+std::chrono::nanoseconds timestamp_to_duration(const device_info& device_info, const ze_kernel_timestamp_data_t& timestamp) {
+    constexpr double NS_IN_SEC = 1000000000.0;
+    const double timestamp_freq = NS_IN_SEC / device_info.timer_resolution;
+    const uint64_t timestamp_max_value = ~(-1L << device_info.kernel_timestamp_valid_bits);
+
+    auto d = (timestamp.kernelEnd >= timestamp.kernelStart) ? (timestamp.kernelEnd - timestamp.kernelStart) * timestamp_freq
+                                                            : ((timestamp_max_value - timestamp.kernelStart) + timestamp.kernelEnd + 1) * timestamp_freq;
+
+    return std::chrono::nanoseconds(static_cast<uint64_t>(d));
+}
+
+}  // namespace
+
+void ze_event::wait_impl() {
+    if (m_event != nullptr) {
+        ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout));
+    }
+}
+
+void ze_event::set_impl() {
+    if (m_event != nullptr) {
+        ZE_CHECK(zeEventHostSignal(m_event));
+    }
+}
+
+bool ze_event::is_set_impl() {
+    if (m_event != nullptr) {
+        return zeEventQueryStatus(m_event) == ZE_RESULT_SUCCESS;
+    }
+    return true;
+}
+
+bool ze_event::is_profiled() const {
+    if (m_event != nullptr) {
+        ze_event_pool_flags_t event_pool_flags;
+        auto ev_pool = m_event_pool.get()->m_handle;
+        ZE_CHECK(zeEventPoolGetFlags(ev_pool, &event_pool_flags));
+        return (event_pool_flags & ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP) != 0;
+    }
+    return false;
+}
+
+bool ze_event::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
+    if (!is_profiled()) {
+        return true;
+    }
+
+    const auto& engine = m_event_pool->m_engine;
+    auto device_info = engine.get_device_info();
+
+    ze_kernel_timestamp_result_t timestamp{};
+    ZE_CHECK(zeEventQueryKernelTimestamp(m_event, &timestamp));
+
+    auto wallclock_time = timestamp_to_duration(device_info, timestamp.global);
+    auto exec_time = timestamp_to_duration(device_info, timestamp.context);
+
+    auto period_exec = std::make_shared<instrumentation::profiling_period_basic>(timestamp_to_duration(device_info, timestamp.context));
+    auto period_submit = std::make_shared<instrumentation::profiling_period_basic>(wallclock_time - exec_time);
+
+    info.push_back({ instrumentation::profiling_stage::executing, period_exec });
+    info.push_back({ instrumentation::profiling_stage::submission, period_submit });
+
+    return true;
+}
+
+void ze_events::wait_impl() {
+    if (_last_ze_event != nullptr) {
+        ZE_CHECK(zeEventHostSynchronize(_last_ze_event, UINT32_MAX));
+    }
+}
+
+void ze_events::set_impl() {
+    wait_impl();
+}
+
+bool ze_events::is_set_impl() {
+    if (_last_ze_event != nullptr) {
+        return zeEventQueryStatus(_last_ze_event) == ZE_RESULT_SUCCESS;
+    }
+    return true;
+}
+
+bool ze_events::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
+    // The goal is to sum up all disjoint durations of its projection on the time axis
+    std::vector<ze_kernel_timestamp_data_t> all_global_timestamps;
+    std::vector<ze_kernel_timestamp_data_t> all_context_timestamps;
+
+    auto add_or_merge = [](std::vector<ze_kernel_timestamp_data_t>& all_timestamps, const ze_kernel_timestamp_data_t& ts) {
+        auto it = all_timestamps.begin();
+        bool merged = false;
+        auto target_timestamp = ts;
+        while (it != all_timestamps.end()) {
+            auto& timestamp = *it;
+            bool disjoint = timestamp.kernelEnd < target_timestamp.kernelStart || timestamp.kernelStart > target_timestamp.kernelEnd;
+            bool equal = timestamp.kernelEnd == target_timestamp.kernelEnd && timestamp.kernelStart == target_timestamp.kernelStart;
+            if (!disjoint) {
+                if (equal) {
+                    if (!merged) {
+                        merged = true;
+                        break;
+                    } else {
+                        it = all_timestamps.erase(it);
+                    }
+                } else {
+                    if (!merged) {
+                        timestamp.kernelStart = std::min(timestamp.kernelStart, target_timestamp.kernelStart);
+                        timestamp.kernelEnd = std::max(timestamp.kernelEnd, target_timestamp.kernelEnd);
+                        target_timestamp = timestamp;
+                        merged = true;
+                        it++;
+                    } else {
+                        if (timestamp.kernelEnd > target_timestamp.kernelEnd) {
+                            it--;
+                            it->kernelEnd = target_timestamp.kernelEnd;
+                            it++;
+                        }
+                        it = all_timestamps.erase(it);
+                    }
+                }
+            } else {
+                it++;
+            }
+        }
+
+        if (!merged) {
+            all_timestamps.push_back(target_timestamp);
+        }
+    };
+
+    if (_events.empty())
+        return false;
+
+    const auto& engine = downcast<ze_event>(_events.front().get())->m_event_pool->m_engine;
+    auto device_info = engine.get_device_info();
+
+    auto get_total_exec_time = [&device_info](std::vector<ze_kernel_timestamp_data_t>& all_timestamps) {
+        std::chrono::nanoseconds total_time{0};
+        for (const auto& ts : all_timestamps) {
+            total_time += timestamp_to_duration(device_info, ts);
+        }
+
+        return total_time;
+    };
+
+    // Submission time is calculated as difference between merged context and wallclock intervals
+    // May probably be more accurate if we sum all sub-intervals of wallclock timestamps not covered by execution intervals
+    using intervals_t = std::vector<ze_kernel_timestamp_data_t>;
+    auto get_submission_time = [&device_info](const intervals_t& s_timestamps,
+                                              const intervals_t& e_timestamps) {
+        auto get_minmax = [](const intervals_t& timestamps) {
+            uint64_t min_val = std::min(timestamps.begin(), timestamps.end(),
+                [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) {
+                    return lhs->kernelStart < rhs->kernelStart;
+            })->kernelStart;
+            uint64_t max_val = std::max(timestamps.begin(), timestamps.end(),
+                [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) {
+                    return lhs->kernelEnd < rhs->kernelEnd;
+            })->kernelEnd;
+
+            return ze_kernel_timestamp_data_t{min_val, max_val};
+        };
+
+        auto submission_interval = get_minmax(s_timestamps);
+        auto exec_interval = get_minmax(e_timestamps);
+
+        auto wallclock_time = timestamp_to_duration(device_info, submission_interval);
+        auto exec_time = timestamp_to_duration(device_info, exec_interval);
+
+        return wallclock_time - exec_time;
+    };
+
+    for (size_t i = 0; i < _events.size(); i++) {
+        auto be = downcast<ze_event>(_events[i].get());
+        if (!be->is_profiled()) {
+            continue;
+        }
+        ze_kernel_timestamp_result_t timestamp{};
+        ZE_CHECK(zeEventQueryKernelTimestamp(be->get(), &timestamp));
+
+        add_or_merge(all_global_timestamps, timestamp.global);
+        add_or_merge(all_context_timestamps, timestamp.context);
+    }
+
+    auto submit_time = get_submission_time(all_global_timestamps, all_context_timestamps);
+    auto exec_time = get_total_exec_time(all_context_timestamps);
+
+    auto period_exec = std::make_shared<instrumentation::profiling_period_basic>(exec_time);
+    auto period_submit = std::make_shared<instrumentation::profiling_period_basic>(submit_time);
+
+    info.push_back({ instrumentation::profiling_stage::executing, period_exec });
+    info.push_back({ instrumentation::profiling_stage::submission, period_submit });
+
+    return true;
+}
+
+ze_event::~ze_event() {
+    if (m_event != nullptr) {
+        zeEventDestroy(m_event);
+    }
+}
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
new file mode 100644
index 00000000000000..5120cf9120ec29
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
@@ -0,0 +1,94 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_base_event.hpp"
+#include "ze_event_pool.hpp"
+
+#include <vector>
+#include <list>
+
+namespace cldnn {
+namespace ze {
+
+struct ze_event : public ze_base_event {
+public:
+    ze_event(ze_event_pool::ptr ev_pool, ze_event_handle_t ev, uint64_t queue_stamp = 0)
+        : ze_base_event(queue_stamp)
+        , m_event_pool(ev_pool)
+        , m_event(ev) {}
+
+    ze_event_handle_t get() override { return m_event; }
+    bool is_profiled() const;
+
+    ~ze_event();
+
+private:
+    void wait_impl() override;
+    void set_impl() override;
+    bool is_set_impl() override;
+    bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
+
+    friend struct ze_events;
+
+protected:
+    ze_event_pool::ptr m_event_pool;
+    ze_event_handle_t m_event;
+};
+
+struct ze_events : public ze_base_event {
+public:
+    ze_events(std::vector<event::ptr> const& ev)
+        : ze_base_event(0) {
+        process_events(ev);
+    }
+
+    ze_event_handle_t get() override { return _last_ze_event; }
+
+    void reset() override {
+        event::reset();
+        _events.clear();
+    }
+
+private:
+    void wait_impl() override;
+    void set_impl() override;
+    bool is_set_impl() override;
+
+    void process_events(const std::vector<event::ptr>& ev) {
+        for (size_t i = 0; i < ev.size(); i++) {
+            auto multiple_events = dynamic_cast<ze_events*>(ev[i].get());
+            if (multiple_events) {
+                for (size_t j = 0; j < multiple_events->_events.size(); j++) {
+                    if (auto base_ev = dynamic_cast<ze_event*>(multiple_events->_events[j].get())) {
+                        auto current_ev_queue_stamp = base_ev->get_queue_stamp();
+                        if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) {
+                            _queue_stamp = current_ev_queue_stamp;
+                            _last_ze_event = base_ev->get();
+                        }
+                    }
+                    _events.push_back(multiple_events->_events[j]);
+                }
+            } else {
+                if (auto base_ev = dynamic_cast<ze_event*>(ev[i].get())) {
+                    auto current_ev_queue_stamp = base_ev->get_queue_stamp();
+                    if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) {
+                        _queue_stamp = current_ev_queue_stamp;
+                        _last_ze_event = base_ev->get();
+                    }
+                }
+                _events.push_back(ev[i]);
+            }
+        }
+    }
+
+    bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
+
+    ze_event_handle_t _last_ze_event;
+    std::vector<event::ptr> _events;
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp
new file mode 100644
index 00000000000000..1ae696e0d10824
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp
@@ -0,0 +1,78 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_event_pool.hpp"
+#include "ze_event.hpp"
+#include "ze_common.hpp"
+
+namespace cldnn {
+namespace ze {
+
+ze_event_pool::ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_event_pool_flags_t flags)
+    : m_engine(engine) {
+    ze_event_pool_desc_t event_pool_desc = {
+        ZE_STRUCTURE_TYPE_EVENT_POOL_DESC,
+        nullptr,
+        flags,
+        capacity
+    };
+    auto device = engine.get_device();
+    ZE_CHECK(zeEventPoolCreate(engine.get_context(), &event_pool_desc, 1, &device, &m_handle));
+}
+
+ze_event_pool::~ze_event_pool() {
+    zeEventPoolDestroy(m_handle);
+}
+
+ze_events_pool::ze_events_pool(const ze_engine& engine, bool enable_profiling)
+    : m_engine(engine)
+    , m_enable_profiling(enable_profiling) { }
+
+std::shared_ptr<ze_event> ze_events_pool::create_event(uint64_t queue_stamp) {
+    if (m_num_used >= m_capacity || !m_current_pool) {
+        m_num_used = 0;
+        ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0;
+        flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+        m_current_pool = std::make_shared<ze_event_pool>(m_engine, m_capacity, flags);
+    }
+
+    ze_event_handle_t event;
+    // set signal scope to host to allow wait on the host
+    // TODO: avoid setting scope when event is not used for wait on the host
+    ze_event_desc_t event_desc = {
+        ZE_STRUCTURE_TYPE_EVENT_DESC,
+        nullptr,
+        m_num_used++,
+        ZE_EVENT_SCOPE_FLAG_HOST,
+        0
+    };
+    ZE_CHECK(zeEventCreate(m_current_pool->m_handle, &event_desc, &event));
+
+    return std::make_shared<ze_event>(m_current_pool, event, queue_stamp);
+}
+
+std::shared_ptr<ze_event> ze_events_pool::create_user_event() {
+    if (m_num_used_user >= m_capacity || !m_current_user_pool) {
+        m_num_used_user = 0;
+        ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0;
+        flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+        m_current_user_pool = std::make_shared<ze_event_pool>(m_engine, m_capacity, flags);
+    }
+    // set signal scope to host to allow wait on the host
+    // TODO: avoid setting scope when event is not used for wait on the host
+    ze_event_handle_t event;
+    ze_event_desc_t event_desc = {
+        ZE_STRUCTURE_TYPE_EVENT_DESC,
+        nullptr,
+        m_num_used_user++,
+        ZE_EVENT_SCOPE_FLAG_HOST,
+        0
+    };
+    ZE_CHECK(zeEventCreate(m_current_user_pool->m_handle, &event_desc, &event));
+
+    return std::make_shared<ze_event>(m_current_user_pool, event);
+}
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp
new file mode 100644
index 00000000000000..197e41b862a422
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_engine.hpp"
+
+namespace cldnn {
+namespace ze {
+
+struct ze_event;
+
+// Wrapper for ze events pool which is needed to track lifetime of the pool.
+// I.e. the object is destoyed if no ze_events alive which refer to this pool
+// and ze_events_pool doesn't refer to it as well
+struct ze_event_pool {
+    ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_event_pool_flags_t flags);
+    ~ze_event_pool();
+    using ptr = std::shared_ptr<ze_event_pool>;
+
+    ze_event_pool_handle_t m_handle;
+    const ze_engine& m_engine;
+};
+
+// Helper for events pool management
+// Can hold multiple ze_event_pool objects and track their capacity with realloc when it's needed
+struct ze_events_pool {
+public:
+    ze_events_pool(const ze_engine& engine, bool enable_profiling);
+
+    std::shared_ptr<ze_event> create_event(uint64_t queue_stamp = 0);
+    std::shared_ptr<ze_event> create_user_event();
+
+protected:
+    const ze_engine& m_engine;
+    std::shared_ptr<ze_event_pool> m_current_user_pool = nullptr;
+    std::shared_ptr<ze_event_pool> m_current_pool = nullptr;
+    const uint32_t m_capacity = 100;
+    uint32_t m_num_used = 0;
+    uint32_t m_num_used_user = 0;
+    const bool m_enable_profiling;
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
new file mode 100644
index 00000000000000..1d9118c4d12d7d
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
@@ -0,0 +1,62 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "intel_gpu/runtime/kernel.hpp"
+#include "openvino/core/except.hpp"
+#include "ze_common.hpp"
+
+#include <memory>
+
+namespace cldnn {
+namespace ze {
+
+class ze_kernel : public kernel {
+    ze_kernel_handle_t _compiled_kernel;
+    ze_module_handle_t _module;
+    std::string _kernel_id;
+
+public:
+    ze_kernel(ze_kernel_handle_t compiled_kernel, ze_module_handle_t module, const std::string& kernel_id)
+        : _compiled_kernel(compiled_kernel)
+        , _module(module)
+        , _kernel_id(kernel_id) { }
+
+    ~ze_kernel() {
+        zeKernelDestroy(_compiled_kernel);
+    }
+
+    const ze_kernel_handle_t& get_handle() const { return _compiled_kernel; }
+    ze_kernel_handle_t& get_handle() { return _compiled_kernel; }
+    std::shared_ptr<kernel> clone(bool reuse_kernel_handle = false) const override {
+        if (reuse_kernel_handle) {
+            return std::make_shared<ze_kernel>(_compiled_kernel, _module, _kernel_id);
+        } else {
+            ze_kernel_handle_t cloned_handle;
+            ze_kernel_desc_t descriptor;
+            descriptor.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC;
+            descriptor.pNext = nullptr;
+            descriptor.flags = 0;
+            descriptor.pKernelName = _kernel_id.c_str();
+            ZE_CHECK(zeKernelCreate(_module, &descriptor, &cloned_handle));
+            return std::make_shared<ze_kernel>(cloned_handle, _module, _kernel_id);
+        }
+    }
+
+    std::string get_id() const override { return _kernel_id; }
+
+    std::vector<uint8_t> get_binary() const override {
+        size_t binary_size = 0;
+        ZE_CHECK(zeModuleGetNativeBinary(_module, &binary_size, nullptr));
+
+        std::vector<uint8_t> binary(binary_size);
+        ZE_CHECK(zeModuleGetNativeBinary(_module, &binary_size, &binary[0]));
+
+        return binary;
+    }
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
new file mode 100644
index 00000000000000..3b65bf73e3c2a7
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
@@ -0,0 +1,236 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "intel_gpu/runtime/utils.hpp"
+#include "ze_memory.hpp"
+#include "ze/ze_common.hpp"
+#include "ze_engine.hpp"
+#include "ze_stream.hpp"
+#include "ze_event.hpp"
+#include <stdexcept>
+#include <vector>
+
+namespace cldnn {
+namespace ze {
+namespace {
+static inline cldnn::event::ptr create_event(stream& stream, size_t bytes_count) {
+    if (bytes_count == 0) {
+        GPU_DEBUG_TRACE_DETAIL << "Skip memory operation for 0 size tensor" << std::endl;
+        return stream.create_user_event(true);
+    }
+
+    return stream.create_base_event();
+}
+
+}  // namespace
+
+allocation_type gpu_usm::detect_allocation_type(const ze_engine* engine, const void* mem_ptr) {
+    ze_memory_allocation_properties_t props{ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES};
+    ze_device_handle_t device = nullptr;
+    ZE_CHECK(zeMemGetAllocProperties(engine->get_context(), mem_ptr, &props, &device));
+
+    switch (props.type) {
+        case ZE_MEMORY_TYPE_DEVICE: return allocation_type::usm_device;
+        case ZE_MEMORY_TYPE_HOST: return allocation_type::usm_host;
+        case ZE_MEMORY_TYPE_SHARED: return allocation_type::usm_shared;
+        default: return allocation_type::unknown;
+    }
+
+    return allocation_type::unknown;
+}
+
+allocation_type gpu_usm::detect_allocation_type(const ze_engine* engine, const ze::UsmMemory& buffer) {
+    auto alloc_type = detect_allocation_type(engine, buffer.get());
+    OPENVINO_ASSERT(alloc_type == allocation_type::usm_device ||
+                    alloc_type == allocation_type::usm_host ||
+                    alloc_type == allocation_type::usm_shared, "[GPU] Unsupported USM alloc type: " + to_string(alloc_type));
+    return alloc_type;
+}
+
+gpu_usm::gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& buffer, allocation_type type, std::shared_ptr<MemoryTracker> mem_tracker)
+    : lockable_gpu_mem()
+    , memory(engine, new_layout, type, mem_tracker)
+    , _buffer(buffer)
+    , _host_buffer(engine->get_context(), engine->get_device()) {
+}
+
+gpu_usm::gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& buffer, std::shared_ptr<MemoryTracker> mem_tracker)
+    : lockable_gpu_mem()
+    , memory(engine, new_layout, detect_allocation_type(engine, buffer), mem_tracker)
+    , _buffer(buffer)
+    , _host_buffer(engine->get_context(), engine->get_device()) {
+}
+
+gpu_usm::gpu_usm(ze_engine* engine, const layout& layout, allocation_type type)
+    : lockable_gpu_mem()
+    , memory(engine, layout, type, nullptr)
+    , _buffer(engine->get_context(), engine->get_device())
+    , _host_buffer(engine->get_context(), engine->get_device()) {
+    auto mem_ordinal = engine->get_device_info().device_memory_ordinal;
+    switch (get_allocation_type()) {
+    case allocation_type::usm_host:
+        _buffer.allocateHost(_bytes_count);
+        break;
+    case allocation_type::usm_shared:
+        _buffer.allocateShared(_bytes_count, mem_ordinal);
+        break;
+    case allocation_type::usm_device:
+        _buffer.allocateDevice(_bytes_count, mem_ordinal);
+        break;
+    default:
+        OPENVINO_THROW("[GPU] Unknown unified shared memory type!");
+    }
+
+    m_mem_tracker = std::make_shared<MemoryTracker>(engine, _buffer.get(), layout.bytes_count(), type);
+}
+
+void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) {
+    std::lock_guard<std::mutex> locker(_mutex);
+    if (0 == _lock_count) {
+        auto& _ze_stream = downcast<const ze_stream>(stream);
+        if (get_allocation_type() == allocation_type::usm_device) {
+            if (type != mem_lock_type::read) {
+                throw std::runtime_error("Unable to lock allocation_type::usm_device with write lock_type.");
+            }
+            GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl;
+            _host_buffer.allocateHost(_bytes_count);
+            ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(),
+                                    _host_buffer.get(),
+                                    _buffer.get(),
+                                    _bytes_count,
+                                    nullptr,
+                                    0,
+                                    nullptr));
+            ZE_CHECK(zeCommandListHostSynchronize(_ze_stream.get_queue(), default_timeout));
+            _mapped_ptr = _host_buffer.get();
+        } else {
+            _mapped_ptr = _buffer.get();
+        }
+    }
+    _lock_count++;
+    return _mapped_ptr;
+}
+
+void gpu_usm::unlock(const stream& /* stream */) {
+    std::lock_guard<std::mutex> locker(_mutex);
+    _lock_count--;
+    if (0 == _lock_count) {
+        if (get_allocation_type() == allocation_type::usm_device) {
+            _host_buffer.freeMem();
+        }
+        _mapped_ptr = nullptr;
+    }
+}
+
+event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, bool blocking) {
+    auto& _ze_stream = downcast<ze_stream>(stream);
+    auto ev = _ze_stream.create_base_event();
+    auto ev_ze = downcast<ze::ze_base_event>(ev.get())->get();
+    std::vector<unsigned char> temp_buffer(_bytes_count, pattern);
+    ZE_CHECK(zeCommandListAppendMemoryFill(_ze_stream.get_queue(), _buffer.get(), temp_buffer.data(), 1, _bytes_count, ev_ze, 0, nullptr));
+
+    if (blocking) {
+        ev->wait();
+    }
+    return ev;
+}
+
+event::ptr gpu_usm::fill(stream& stream, bool blocking) {
+    return fill(stream, 0, blocking);
+}
+
+event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) {
+    auto result_event = create_event(stream, size);
+    if (size == 0)
+        return result_event;
+
+    auto _ze_stream = downcast<ze_stream>(&stream);
+    auto _ze_event = downcast<ze_base_event>(result_event.get())->get();
+    auto src_ptr = reinterpret_cast<const char*>(data_ptr) + src_offset;
+    auto dst_ptr = reinterpret_cast<char*>(buffer_ptr()) + dst_offset;
+
+    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
+                                           dst_ptr,
+                                           src_ptr,
+                                           _bytes_count,
+                                           _ze_event,
+                                           0,
+                                           nullptr));
+
+    if (blocking) {
+        result_event->wait();
+    }
+
+    return result_event;
+}
+
+event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_offset, size_t dst_offset, size_t size, bool blocking) {
+    auto result_event = create_event(stream, size);
+    if (size == 0)
+        return result_event;
+
+    auto _ze_stream = downcast<ze_stream>(&stream);
+    auto _ze_event = downcast<ze_base_event>(result_event.get())->get();
+    OPENVINO_ASSERT(memory_capabilities::is_usm_type(src_mem.get_allocation_type()));
+
+    auto usm_mem = downcast<const gpu_usm>(&src_mem);
+    auto src_ptr = reinterpret_cast<const char*>(usm_mem->buffer_ptr()) + src_offset;
+    auto dst_ptr = reinterpret_cast<char*>(buffer_ptr()) + dst_offset;
+
+    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
+                                           dst_ptr,
+                                           src_ptr,
+                                           _bytes_count,
+                                           _ze_event,
+                                           0,
+                                           nullptr));
+    if (blocking) {
+        result_event->wait();
+    }
+
+    return result_event;
+}
+
+event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) const {
+    auto result_event = create_event(stream, size);
+    if (size == 0)
+        return result_event;
+
+    auto _ze_stream = downcast<ze_stream>(&stream);
+    auto _ze_event = downcast<ze_base_event>(result_event.get())->get();
+    auto src_ptr = reinterpret_cast<const char*>(buffer_ptr()) + src_offset;
+    auto dst_ptr = reinterpret_cast<char*>(data_ptr) + dst_offset;
+
+    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
+                                           dst_ptr,
+                                           src_ptr,
+                                           _bytes_count,
+                                           _ze_event,
+                                           0,
+                                           nullptr));
+    if (blocking) {
+        result_event->wait();
+    }
+
+    return result_event;
+}
+
+shared_mem_params gpu_usm::get_internal_params() const {
+    auto casted = downcast<ze_engine>(_engine);
+    return {
+        shared_mem_type::shared_mem_usm,  // shared_mem_type
+        static_cast<shared_handle>(casted->get_context()),  // context handle
+        static_cast<shared_handle>(casted->get_device()),  // user_device handle
+        static_cast<shared_handle>(_buffer.get()),  // mem handle
+#ifdef _WIN32
+        nullptr,  // surface handle
+#else
+        0,  // surface handle
+#endif
+        0  // plane
+    };
+}
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
new file mode 100644
index 00000000000000..b1914de00bec49
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
@@ -0,0 +1,153 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_common.hpp"
+#include "ze_engine.hpp"
+#include "intel_gpu/runtime/memory.hpp"
+
+#include <cassert>
+#include <mutex>
+#include <memory>
+
+namespace cldnn {
+namespace ze {
+struct lockable_gpu_mem {
+    lockable_gpu_mem() :
+        _lock_count(0),
+        _mapped_ptr(nullptr) {}
+
+    std::mutex _mutex;
+    unsigned _lock_count;
+    void* _mapped_ptr;
+};
+
+class UsmHolder {
+public:
+    UsmHolder(ze_context_handle_t context, void* ptr, bool shared_memory = false) : _context(context), _ptr(ptr), _shared_memory(shared_memory) { }
+    void* ptr() { return _ptr; }
+    void memFree() {
+        try {
+            if (!_shared_memory)
+                zeMemFree(_context, _ptr);
+        } catch (...) {
+            // Exception may happen only when clMemFreeINTEL function is unavailable, thus can't free memory properly
+        }
+        _ptr = nullptr;
+    }
+
+    ~UsmHolder() {
+        memFree();
+    }
+private:
+    ze_context_handle_t _context;
+    void* _ptr;
+    bool _shared_memory = false;
+};
+
+class UsmMemory {
+public:
+    explicit UsmMemory(ze_context_handle_t context, ze_device_handle_t device)
+        : _context(context)
+        , _device(device) {}
+
+    UsmMemory(ze_context_handle_t context, ze_device_handle_t device, void* usm_ptr, size_t offset = 0)
+        : _context(context)
+        , _device(device)
+        , _usm_pointer(std::make_shared<UsmHolder>(_context, reinterpret_cast<uint8_t*>(usm_ptr) + offset, true)) {}
+
+    // Get methods returns original pointer allocated by openCL.
+    void* get() const { return _usm_pointer->ptr(); }
+
+    void allocateHost(size_t size) {
+        ze_host_mem_alloc_desc_t host_desc = {};
+        host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
+        host_desc.flags = 0;
+        host_desc.pNext = nullptr;
+
+        void* memory = nullptr;
+        ZE_CHECK(zeMemAllocHost(_context, &host_desc, size, 1, &memory));
+        _allocate(memory);
+    }
+
+    void allocateShared(size_t size, uint32_t ordinal) {
+        ze_device_mem_alloc_desc_t device_desc = {};
+        device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
+        device_desc.flags = 0;
+        device_desc.ordinal = ordinal;
+        device_desc.pNext = nullptr;
+
+        ze_host_mem_alloc_desc_t host_desc = {};
+        host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
+        host_desc.flags = 0;
+        host_desc.pNext = nullptr;
+
+        void* memory = nullptr;
+        ZE_CHECK(zeMemAllocShared(_context, &device_desc, &host_desc, size, 1, _device, &memory));
+        _allocate(memory);
+    }
+
+    void allocateDevice(size_t size, uint32_t ordinal) {
+        ze_device_mem_alloc_desc_t device_desc = {};
+        device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
+        device_desc.flags = 0;
+        device_desc.ordinal = ordinal;
+        device_desc.pNext = nullptr;
+
+        void* memory = nullptr;
+        ZE_CHECK(zeMemAllocDevice(_context, &device_desc, size, 4096, _device, &memory));
+        _allocate(memory);
+    }
+
+    void freeMem() {
+        if (!_usm_pointer)
+            throw std::runtime_error("[CL ext] Can not free memory of empty UsmHolder");
+        _usm_pointer->memFree();
+    }
+
+    virtual ~UsmMemory() = default;
+
+protected:
+    ze_context_handle_t _context;
+    ze_device_handle_t _device;
+    std::shared_ptr<UsmHolder> _usm_pointer = nullptr;
+
+private:
+    void _allocate(void* ptr) {
+        if (!ptr)
+            throw std::runtime_error("[CL ext] Can not allocate nullptr for USM type.");
+        _usm_pointer = std::make_shared<UsmHolder>(_context, ptr);
+    }
+};
+
+struct gpu_usm : public lockable_gpu_mem, public memory {
+    gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& usm_buffer, allocation_type type, std::shared_ptr<MemoryTracker> mem_tracker);
+    gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& usm_buffer, std::shared_ptr<MemoryTracker> mem_tracker);
+    gpu_usm(ze_engine* engine, const layout& layout, allocation_type type);
+
+    void* lock(const stream& stream, mem_lock_type type) override;
+    void unlock(const stream& stream) override;
+    const ze::UsmMemory& get_buffer() const { return _buffer; }
+    ze::UsmMemory& get_buffer() { return _buffer; }
+
+    event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) override;
+    event::ptr fill(stream& stream, bool blocking = true) override;
+    shared_mem_params get_internal_params() const override;
+    void* buffer_ptr() const override { return _buffer.get(); }
+
+    event::ptr copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override;
+    event::ptr copy_from(stream& stream, const memory& src_mem, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override;
+    event::ptr copy_to(stream& stream, void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) const override;
+
+    static allocation_type detect_allocation_type(const ze_engine* engine, const void* mem_ptr);
+    static allocation_type detect_allocation_type(const ze_engine* engine, const ze::UsmMemory& buffer);
+
+protected:
+    ze::UsmMemory _buffer;
+    ze::UsmMemory _host_buffer;
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
new file mode 100644
index 00000000000000..c7341c9d30b5a5
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -0,0 +1,348 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_stream.hpp"
+#include "intel_gpu/runtime/memory_caps.hpp"
+#include "intel_gpu/runtime/utils.hpp"
+#include "openvino/core/except.hpp"
+#include "openvino/core/type/element_type.hpp"
+#include "openvino/runtime/properties.hpp"
+#include "ze_event_pool.hpp"
+#include "ze_event.hpp"
+#include "ze_kernel.hpp"
+#include "ze_memory.hpp"
+#include "ze_common.hpp"
+
+#include <ze_api.h>
+#include <ze_intel_gpu.h>
+#include <ze_stypes.h>
+
+#include <cassert>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace cldnn {
+namespace ze {
+
+namespace {
+inline ze_group_count_t to_group_count(const std::vector<size_t>& v) {
+     switch (v.size()) {
+        case 1:
+            return {uint32_t(v[0]), uint32_t(1), uint32_t(1)};
+        case 2:
+            return {uint32_t(v[0]), uint32_t(v[1]), uint32_t(1)};
+        case 3:
+            return {uint32_t(v[0]), uint32_t(v[1]), uint32_t(v[2])};
+        default:
+            return {uint32_t(1), uint32_t(1), uint32_t(1)};
+    }
+}
+
+template<typename T>
+ze_result_t set_kernel_arg_scalar(ze_kernel_handle_t& kernel, uint32_t idx, const T& val) {
+    GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel << " set scalar " << idx << " (" << ov::element::from<T>().get_type_name() << ")" << val << "\n";
+    return zeKernelSetArgumentValue(kernel, idx, sizeof(T), &val);
+}
+
+ze_result_t set_kernel_arg(ze_kernel_handle_t& kernel, uint32_t idx, cldnn::memory::cptr mem) {
+    if (!mem)
+        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
+
+    OPENVINO_ASSERT(memory_capabilities::is_usm_type(mem->get_allocation_type()), "Unsupported alloc type");
+    const auto& buf = std::dynamic_pointer_cast<const ze::gpu_usm>(mem)->get_buffer();
+    auto mem_type = std::dynamic_pointer_cast<const ze::gpu_usm>(mem)->get_allocation_type();
+    GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel << " set arg (" << mem_type << ") " << idx
+                            << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
+
+    auto ptr = buf.get();
+    return zeKernelSetArgumentValue(kernel, idx, sizeof(ptr), &ptr);
+}
+
+void set_arguments_impl(ze_kernel_handle_t kernel,
+                         const arguments_desc& args,
+                         const kernel_arguments_data& data) {
+    using args_t = argument_desc::Types;
+    using scalar_t = scalar_desc::Types;
+
+    for (uint32_t i = 0; i < static_cast<uint32_t>(args.size()); i++) {
+        ze_result_t status = ZE_RESULT_NOT_READY;
+        switch (args[i].t) {
+            case args_t::INPUT:
+                if (args[i].index < data.inputs.size() && data.inputs[args[i].index]) {
+                    status = set_kernel_arg(kernel, i, data.inputs[args[i].index]);
+                }
+                break;
+            case args_t::INPUT_OF_FUSED_PRIMITIVE:
+                if (args[i].index < data.fused_op_inputs.size() && data.fused_op_inputs[args[i].index]) {
+                    status = set_kernel_arg(kernel, i, data.fused_op_inputs[args[i].index]);
+                }
+                break;
+            case args_t::INTERNAL_BUFFER:
+                if (args[i].index < data.intermediates.size() && data.intermediates[args[i].index]) {
+                    status = set_kernel_arg(kernel, i, data.intermediates[args[i].index]);
+                }
+                break;
+            case args_t::OUTPUT:
+                if (args[i].index < data.outputs.size() && data.outputs[args[i].index]) {
+                    status = set_kernel_arg(kernel, i, data.outputs[args[i].index]);
+                }
+                break;
+            case args_t::WEIGHTS:
+                status = set_kernel_arg(kernel, i, data.weights);
+                break;
+            case args_t::BIAS:
+                status = set_kernel_arg(kernel, i, data.bias);
+                break;
+            case args_t::WEIGHTS_ZERO_POINTS:
+                status = set_kernel_arg(kernel, i, data.weights_zero_points);
+                break;
+            case args_t::ACTIVATIONS_ZERO_POINTS:
+                status = set_kernel_arg(kernel, i, data.activations_zero_points);
+                break;
+            case args_t::COMPENSATION:
+                status = set_kernel_arg(kernel, i, data.compensation);
+                break;
+            case args_t::SCALE_TABLE:
+                status = set_kernel_arg(kernel, i, data.scale_table);
+                break;
+            case args_t::SLOPE:
+                status = set_kernel_arg(kernel, i, data.slope);
+                break;
+            case args_t::SCALAR:
+                if (data.scalars && args[i].index < data.scalars->size()) {
+                    const auto& scalar = (*data.scalars)[args[i].index];
+                    switch (scalar.t) {
+                        case scalar_t::UINT8:
+                            status = set_kernel_arg_scalar<uint8_t>(kernel, i, scalar.v.u8);
+                            break;
+                        case scalar_t::UINT16:
+                            status = set_kernel_arg_scalar<uint16_t>(kernel, i, scalar.v.u16);
+                            break;
+                        case scalar_t::UINT32:
+                            status = set_kernel_arg_scalar<uint32_t>(kernel, i, scalar.v.u32);
+                            break;
+                        case scalar_t::UINT64:
+                            status = set_kernel_arg_scalar<uint64_t>(kernel, i, scalar.v.u64);
+                            break;
+                        case scalar_t::INT8:
+                            status = set_kernel_arg_scalar<int8_t>(kernel, i, scalar.v.s8);
+                            break;
+                        case scalar_t::INT16:
+                            status = set_kernel_arg_scalar<int16_t>(kernel, i, scalar.v.s16);
+                            break;
+                        case scalar_t::INT32:
+                            status = set_kernel_arg_scalar<int32_t>(kernel, i, scalar.v.s32);
+                            break;
+                        case scalar_t::INT64:
+                            status = set_kernel_arg_scalar<int64_t>(kernel, i, scalar.v.s64);
+                            break;
+                        case scalar_t::FLOAT32:
+                            status = set_kernel_arg_scalar<float>(kernel, i, scalar.v.f32);
+                            break;
+                        case scalar_t::FLOAT64:
+                            status = set_kernel_arg_scalar<double>(kernel, i, scalar.v.f64);
+                            break;
+                        default:
+                            break;
+                    }
+                }
+                break;
+            case args_t::CELL:
+                status = set_kernel_arg(kernel, i, data.cell);
+                break;
+            case args_t::SHAPE_INFO:
+                status = set_kernel_arg(kernel, i, data.shape_info);
+                break;
+            default:
+                break;
+        }
+        if (status != ZE_RESULT_SUCCESS) {
+            throw std::runtime_error("Error set arg " + std::to_string(i) + ", error code: " + std::to_string(status) + "\n");
+        }
+    }
+}
+
+}  // namespace
+
+ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
+    : stream(config.get_queue_type(), stream::get_expected_sync_method(config))
+    , _engine(engine)
+    , m_pool(engine, config.get_enable_profiling()) {
+    const auto &info = engine.get_device_info();
+
+    ze_command_queue_desc_t command_queue_desc = {};
+    command_queue_desc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
+    command_queue_desc.pNext = nullptr;
+    command_queue_desc.index = 0;
+    command_queue_desc.ordinal = info.compute_queue_group_ordinal;
+    command_queue_desc.flags = m_queue_type == QueueTypes::out_of_order ? 0 : ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
+    command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
+    command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
+
+    zex_intel_queue_copy_operations_offload_hint_exp_desc_t cp_offload_desc = {};
+    cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES;
+    cp_offload_desc.copyOffloadEnabled = true;
+    cp_offload_desc.pNext = nullptr;
+    command_queue_desc.pNext = &cp_offload_desc;
+
+    ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
+}
+
+ze_stream::~ze_stream() {
+    zeCommandListDestroy(m_command_list);
+}
+
+void ze_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) {
+    static std::mutex m;
+    std::lock_guard<std::mutex> guard(m);
+
+    auto& ze_kernel = downcast<ze::ze_kernel>(kernel);
+    auto& kern = ze_kernel.get_handle();
+    set_arguments_impl(kern, args_desc.arguments, args);
+}
+
+event::ptr ze_stream::enqueue_kernel(kernel& kernel,
+                                     const kernel_arguments_desc& args_desc,
+                                     const kernel_arguments_data& /* args */,
+                                     std::vector<event::ptr> const& deps,
+                                     bool is_output) {
+    auto& ze_kernel = downcast<ze::ze_kernel>(kernel);
+
+    auto& kern = ze_kernel.get_handle();
+
+    std::vector<ze_event_handle_t> dep_events;
+    std::vector<ze_event_handle_t>* dep_events_ptr = nullptr;
+    if (m_sync_method == SyncMethods::events) {
+        for (auto& dep : deps) {
+            if (auto ze_base_ev = std::dynamic_pointer_cast<ze_base_event>(dep)) {
+                if (ze_base_ev->get() != nullptr)
+                    dep_events.push_back(ze_base_ev->get());
+            }
+        }
+        dep_events_ptr = &dep_events;
+    } else if (m_sync_method == SyncMethods::barriers) {
+        sync_events(deps, is_output);
+    }
+    bool set_output_event = m_sync_method == SyncMethods::events || is_output;
+
+    auto ev = set_output_event ? create_base_event() : std::make_shared<ze_event>(nullptr, nullptr, ++m_queue_counter);
+    auto global = to_group_count(args_desc.workGroups.global);
+    auto local = to_group_count(args_desc.workGroups.local);
+    ze_group_count_t args = { global.groupCountX / local.groupCountX, global.groupCountY / local.groupCountY, global.groupCountZ / local.groupCountZ };
+    ZE_CHECK(zeKernelSetGroupSize(kern, local.groupCountX, local.groupCountY, local.groupCountZ));
+    ZE_CHECK(zeCommandListAppendLaunchKernel(m_command_list,
+                                             kern,
+                                             &args,
+                                             set_output_event ? std::dynamic_pointer_cast<ze_base_event>(ev)->get() : nullptr,
+                                             dep_events_ptr == nullptr ? 0 : static_cast<uint32_t>(dep_events_ptr->size()),
+                                             dep_events_ptr == nullptr ? 0 : &dep_events_ptr->front()));
+
+    return ev;
+}
+
+void ze_stream::enqueue_barrier() {
+    ZE_CHECK(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr));
+}
+
+event::ptr ze_stream::enqueue_marker(std::vector<ze_event::ptr> const& deps, bool is_output) {
+    if (deps.empty()) {
+        auto ev = create_base_event();
+        ZE_CHECK(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast<ze_base_event>(ev)->get(), 0, nullptr));
+        return ev;
+    }
+
+    if (m_sync_method  == SyncMethods::events) {
+        std::vector<ze_event_handle_t> dep_events;
+        for (auto& dep : deps) {
+            if (auto ze_base_ev = std::dynamic_pointer_cast<ze_base_event>(dep)) {
+                if (ze_base_ev->get() != nullptr)
+                    dep_events.push_back(ze_base_ev->get());
+            }
+        }
+        if (dep_events.empty())
+            return create_user_event(true);
+
+        auto ev = create_base_event();
+        ZE_CHECK(zeCommandListAppendBarrier(m_command_list,
+                                            std::dynamic_pointer_cast<ze_base_event>(ev)->get(),
+                                            static_cast<uint32_t>(dep_events.size()),
+                                            &dep_events.front()));
+        return ev;
+    } else if (m_sync_method == SyncMethods::barriers) {
+        sync_events(deps, is_output);
+        assert(m_last_barrier_ev != nullptr);
+        return m_last_barrier_ev;
+    } else {
+        return create_user_event(true);
+    }
+}
+
+ze_event::ptr ze_stream::group_events(std::vector<ze_events::ptr> const& deps) {
+    return std::make_shared<ze_events>(deps);
+}
+
+void ze_stream::wait() {
+    finish();
+}
+
+event::ptr ze_stream::create_user_event(bool set) {
+    auto ev = m_pool.create_user_event();
+    if (set)
+        ev->set();
+
+    return ev;
+}
+
+event::ptr ze_stream::create_base_event() {
+    return m_pool.create_event(++m_queue_counter);
+}
+
+void ze_stream::flush() const { }
+
+void ze_stream::finish() const {
+    ZE_CHECK(zeCommandListHostSynchronize(m_command_list, default_timeout));
+}
+
+void ze_stream::wait_for_events(const std::vector<event::ptr>& events) {
+    for (auto& ev : events) {
+        ev->wait();
+    }
+
+    // Enqueue additional event as `events` may contain user events only due to barrier based synchronization
+    // TODO: Detect that scenarion somehow and don't enqueue extra barrier if not needed
+    auto ev = std::dynamic_pointer_cast<ze_event>(create_base_event());
+    ZE_CHECK(zeCommandListAppendBarrier(m_command_list, ev->get(), 0, nullptr));
+    ev->wait();
+}
+
+void ze_stream::sync_events(std::vector<event::ptr> const& deps, bool is_output) {
+    bool needs_barrier = false;
+    for (auto& dep : deps) {
+        auto* ze_base_ev = dynamic_cast<ze_base_event*>(dep.get());
+        assert(ze_base_ev != nullptr);
+        if (ze_base_ev->get_queue_stamp() > m_last_barrier) {
+            needs_barrier = true;
+        }
+    }
+
+    if (needs_barrier) {
+        if (is_output) {
+            m_last_barrier_ev = std::dynamic_pointer_cast<ze_event>(create_base_event());
+            m_last_barrier_ev->set_queue_stamp(m_queue_counter.load());
+            ZE_CHECK(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get(), 0, nullptr));
+        } else {
+            ZE_CHECK(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr));
+        }
+        m_last_barrier = ++m_queue_counter;
+    }
+
+    if (!m_last_barrier_ev) {
+        m_last_barrier_ev = std::dynamic_pointer_cast<ze_event>(create_user_event(true));
+        m_last_barrier_ev->set_queue_stamp(m_queue_counter.load());
+    }
+}
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
new file mode 100644
index 00000000000000..8af269489fbe3b
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
@@ -0,0 +1,69 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "intel_gpu/runtime/event.hpp"
+#include "intel_gpu/runtime/stream.hpp"
+#include "ze_common.hpp"
+#include "ze_engine.hpp"
+#include "ze_event.hpp"
+
+namespace cldnn {
+namespace ze {
+
+class ze_stream : public stream {
+public:
+    ze_command_list_handle_t get_queue() const { return m_command_list; }
+
+    ze_stream(const ze_engine& engine, const ExecutionConfig& config);
+    ze_stream(ze_stream&& other)
+        : stream(other.m_queue_type, other.m_sync_method)
+        , _engine(other._engine)
+        , m_command_list(other.m_command_list)
+        , m_queue_counter(other.m_queue_counter.load())
+        , m_last_barrier(other.m_last_barrier.load())
+        , m_last_barrier_ev(other.m_last_barrier_ev)
+        , m_pool(other.m_pool) {}
+
+    ~ze_stream();
+
+    void flush() const override;
+    void finish() const override;
+    void wait() override;
+
+    void set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) override;
+    event::ptr enqueue_kernel(kernel& kernel,
+                              const kernel_arguments_desc& args_desc,
+                              const kernel_arguments_data& args,
+                              std::vector<event::ptr> const& deps,
+                              bool is_output = false) override;
+    event::ptr enqueue_marker(std::vector<event::ptr> const& deps, bool is_output) override;
+    event::ptr group_events(std::vector<event::ptr> const& deps) override;
+    void wait_for_events(const std::vector<event::ptr>& events) override;
+    void enqueue_barrier() override;
+    event::ptr create_user_event(bool set) override;
+    event::ptr create_base_event() override;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    dnnl::stream& get_onednn_stream() override { OPENVINO_NOT_IMPLEMENTED; }
+#endif
+
+private:
+    void sync_events(std::vector<event::ptr> const& deps, bool is_output = false);
+
+    const ze_engine& _engine;
+    mutable ze_command_list_handle_t m_command_list = 0;
+    mutable std::atomic<uint64_t> m_queue_counter{0};
+    std::atomic<uint64_t> m_last_barrier{0};
+    std::shared_ptr<ze_event> m_last_barrier_ev = nullptr;
+    ze_events_pool m_pool;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    std::shared_ptr<dnnl::stream> _onednn_stream = nullptr;
+#endif
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt
index c89083a026aed6..f08e5c8b8ac798 100644
--- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt
+++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt
@@ -39,6 +39,8 @@ ov_add_test_target(
             OV GPU
 )
 
+ov_gpu_set_runtime_interface_for(${TARGET_NAME})
+
 if(ENABLE_PROXY)
     target_compile_definitions(${TARGET_NAME} PUBLIC PROXY_PLUGIN_ENABLED)
 endif()
diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ocl_remote_tensor_tests.cpp
similarity index 99%
rename from src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp
rename to src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ocl_remote_tensor_tests.cpp
index 062102e4580ee6..ad631e2ad76daf 100644
--- a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp
+++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ocl_remote_tensor_tests.cpp
@@ -2,6 +2,8 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#ifdef OV_GPU_WITH_OCL_RT
+
 #include "openvino/core/preprocess/pre_post_process.hpp"
 #include "openvino/op/add.hpp"
 #include "openvino/op/constant.hpp"
@@ -2962,3 +2964,4 @@ INSTANTIATE_TEST_SUITE_P(smoke_RemoteTensorDataType, OVRemoteTensorDataType_Test
                                                               ov::element::Type_t::u16,
                                                               ov::element::Type_t::u32)),
                          OVRemoteTensorDataType_Test::getTestCaseName);
+#endif  // OV_GPU_WITH_OCL_RT
diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp
new file mode 100644
index 00000000000000..e9ccd21cc892be
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp
@@ -0,0 +1,20 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifdef OV_GPU_WITH_ZE_RT
+
+#include "openvino/runtime/intel_gpu/remote_properties.hpp"
+#include "openvino/runtime/remote_tensor.hpp"
+
+#include "remote_tensor_tests/helpers.hpp"
+#include "base/ov_behavior_test_utils.hpp"
+
+TEST(ZeRemoteContext, smoke_CorrectContextType) {
+    auto core = ov::Core();
+    auto remote_context = core.get_default_context(ov::test::utils::DEVICE_GPU);
+    ASSERT_FALSE(remote_context.is<ov::intel_gpu::ocl::ClContext>());
+    ASSERT_EQ(remote_context.get_params().at(ov::intel_gpu::context_type.name()), ov::intel_gpu::ContextType::ZE);
+}
+
+#endif  // OV_GPU_WITH_ZE_RT
diff --git a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt
index aa40a800295f02..0ba7a6e8c33e88 100644
--- a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt
+++ b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt
@@ -59,6 +59,7 @@ add_executable(${TARGET_NAME} ${SOURCES_ALL})
 target_compile_definitions(${TARGET_NAME} PRIVATE CI_BUILD_NUMBER="")
 
 ov_set_threading_interface_for(${TARGET_NAME})
+ov_gpu_set_runtime_interface_for(${TARGET_NAME})
 
 # Workaround to avoid warnings during LTO build
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp
index a12d542b076498..98184955308b2b 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp
@@ -56,7 +56,7 @@ TEST(devices_test, sort_order_single_vendor) {
     devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++));
     devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++));
 
-    auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list);
+    auto sorted_list = sort_devices(devices_list);
 
     std::vector<size_t> expected_devices_order = {2, 0, 1, 3, 4};
 
@@ -77,7 +77,7 @@ TEST(devices_test, sort_order_two_vendors) {
     devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++));
     devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::integrated_gpu, device_id++));
 
-    auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list);
+    auto sorted_list = sort_devices(devices_list);
 
     std::vector<size_t> expected_devices_order = {3, 2, 0, 1};
 
@@ -101,7 +101,7 @@ TEST(devices_test, sort_order_three_vendors) {
     devices_list.push_back(std::make_shared<dummy_device>(OTHER_VENDOR_ID2, device_type::discrete_gpu, device_id++));
     devices_list.push_back(std::make_shared<dummy_device>(OTHER_VENDOR_ID2, device_type::discrete_gpu, device_id++));
 
-    auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list);
+    auto sorted_list = sort_devices(devices_list);
 
     std::vector<size_t> expected_devices_order = {2, 3, 0, 1, 4, 5};
 
diff --git a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
index 4e3942cf578758..239e1d1b34a49b 100644
--- a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
@@ -302,7 +302,7 @@ cldnn::ExecutionConfig get_test_default_config(const cldnn::engine& engine,
 }
 
 std::shared_ptr<cldnn::engine> create_test_engine() {
-    auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl);
+    auto ret = cldnn::engine::create(engine_types::ze, runtime_types::ze);
 #ifdef ENABLE_ONEDNN_FOR_GPU
     if (ret->get_device_info().supports_immad)
         ret->create_onednn_engine({});
diff --git a/thirdparty/dependencies.cmake b/thirdparty/dependencies.cmake
index 70318227538112..e3317d176e11c4 100644
--- a/thirdparty/dependencies.cmake
+++ b/thirdparty/dependencies.cmake
@@ -68,7 +68,7 @@ endif()
 # LevelZero
 #
 
-if(ENABLE_INTEL_NPU)
+if(ENABLE_INTEL_GPU OR ENABLE_INTEL_NPU)
     if(ENABLE_SYSTEM_LEVEL_ZERO)
         pkg_search_module(level_zero QUIET
                           IMPORTED_TARGET
diff --git a/thirdparty/level_zero/CMakeLists.txt b/thirdparty/level_zero/CMakeLists.txt
index a266b0d575abd5..6675adc586caee 100644
--- a/thirdparty/level_zero/CMakeLists.txt
+++ b/thirdparty/level_zero/CMakeLists.txt
@@ -26,7 +26,7 @@ endif()
 set(CMAKE_COMPILE_WARNING_AS_ERROR OFF)
 add_subdirectory(level-zero EXCLUDE_FROM_ALL)
 
-set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include>)
+set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include ${CMAKE_CURRENT_SOURCE_DIR}>)
 
 # This VERSION file created by L0 may cause compilation issue of oneTBB headers, so remove it
 file(REMOVE "${CMAKE_BINARY_DIR}/VERSION")
diff --git a/thirdparty/level_zero/ze_intel_gpu.h b/thirdparty/level_zero/ze_intel_gpu.h
new file mode 100644
index 00000000000000..ea83b8e9f6cad9
--- /dev/null
+++ b/thirdparty/level_zero/ze_intel_gpu.h
@@ -0,0 +1,412 @@
+// intel/compute-runtime e96840a03ec41659772ca0bea3338bdd688ae4b5
+/*
+ * Copyright (C) 2020-2025 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#ifndef _ZE_INTEL_GPU_H
+#define _ZE_INTEL_GPU_H
+
+#include <ze_api.h>
+
+#include "ze_stypes.h"
+
+#if defined(__cplusplus)
+#pragma once
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#define ZE_INTEL_GPU_VERSION_MAJOR 0
+#define ZE_INTEL_GPU_VERSION_MINOR 1
+
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME
+/// @brief Module DP properties driver extension name
+#define ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME "ZE_intel_experimental_device_module_dp_properties"
+#endif // ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Module DP properties driver extension Version(s)
+typedef enum _ze_intel_device_module_dp_properties_exp_version_t {
+    ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} ze_intel_device_module_dp_properties_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Supported Dot Product flags
+typedef uint32_t ze_intel_device_module_dp_exp_flags_t;
+typedef enum _ze_intel_device_module_dp_exp_flag_t {
+    ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DP4A = ZE_BIT(0), ///< Supports DP4A operation
+    ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS = ZE_BIT(1), ///< Supports DPAS operation
+    ZE_INTEL_DEVICE_MODULE_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} ze_intel_device_module_dp_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Device Module dot product properties queried using
+///        ::zeDeviceGetModuleProperties
+///
+/// @details
+///     - This structure may be passed to ::zeDeviceGetModuleProperties, via
+///       `pNext` member of ::ze_device_module_properties_t.
+/// @brief Device module dot product properties
+typedef struct _ze_intel_device_module_dp_exp_properties_t {
+    ze_structure_type_ext_t stype = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES; ///< [in] type of this structure
+    void *pNext;                                                                        ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                                        ///< structure (i.e. contains sType and pNext).
+    ze_intel_device_module_dp_exp_flags_t flags;                                        ///< [out] 0 (none) or a valid combination of ::ze_intel_device_module_dp_flag_t
+} ze_intel_device_module_dp_exp_properties_t;
+
+#ifndef ZE_INTEL_COMMAND_LIST_MEMORY_SYNC
+/// @brief Cmd List memory sync extension name
+#define ZE_INTEL_COMMAND_LIST_MEMORY_SYNC "ZE_intel_experimental_command_list_memory_sync"
+#endif // ZE_INTEL_COMMAND_LIST_MEMORY_SYNC
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Cmd List memory sync extension Version(s)
+typedef enum _ze_intel_command_list_memory_sync_exp_version_t {
+    ZE_INTEL_COMMAND_LIST_MEMORY_SYNC_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZE_INTEL_COMMAND_LIST_MEMORY_SYNC_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZE_INTEL_COMMAND_LIST_MEMORY_SYNC_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+} ze_intel_command_list_memory_sync_exp_version_t;
+
+#ifndef ZE_INTEL_STRUCTURE_TYPE_DEVICE_COMMAND_LIST_WAIT_ON_MEMORY_DATA_SIZE_EXP_DESC
+/// @brief stype for _ze_intel_device_command_list_wait_on_memory_data_size_exp_desc_t
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Extended descriptor for cmd list memory sync
+///
+/// @details
+///     - Implementation must support ::ZE_intel_experimental_command_list_memory_sync extension
+///     - May be passed to ze_device_properties_t through pNext.
+typedef struct _ze_intel_device_command_list_wait_on_memory_data_size_exp_desc_t {
+    ze_structure_type_ext_t stype;               ///< [in] type of this structure
+    const void *pNext;                           ///< [in][optional] must be null or a pointer to an extension-specific
+                                                 ///< structure (i.e. contains stype and pNext).
+    uint32_t cmdListWaitOnMemoryDataSizeInBytes; /// <out> Defines supported data size for zexCommandListAppendWaitOnMemory[64] API
+} ze_intel_device_command_list_wait_on_memory_data_size_exp_desc_t;
+
+#ifndef ZEX_INTEL_EVENT_SYNC_MODE_EXP_NAME
+/// @brief Event sync mode extension name
+#define ZEX_INTEL_EVENT_SYNC_MODE_EXP_NAME "ZEX_intel_experimental_event_sync_mode"
+#endif // ZE_INTEL_EVENT_SYNC_MODE_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Event sync mode extension Version(s)
+typedef enum _zex_intel_event_sync_mode_exp_version_t {
+    ZEX_INTEL_EVENT_SYNC_MODE_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZEX_INTEL_EVENT_SYNC_MODE_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZEX_INTEL_EVENT_SYNC_MODE_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+} zex_intel_event_sync_mode_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Supported event sync mode flags
+typedef uint32_t zex_intel_event_sync_mode_exp_flags_t;
+typedef enum _zex_intel_event_sync_mode_exp_flag_t {
+    ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_LOW_POWER_WAIT = ZE_BIT(0),          ///< Low power host synchronization mode, for better CPU utilization
+    ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_SIGNAL_INTERRUPT = ZE_BIT(1),        ///< Generate interrupt when Event is signalled on Device
+    ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_EXTERNAL_INTERRUPT_WAIT = ZE_BIT(2), ///< Host synchronization APIs wait for external interrupt. Can be used only for Events created via zexCounterBasedEventCreate
+    ZEX_INTEL_EVENT_SYNC_MODE_EXP_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} zex_intel_event_sync_mode_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Extended descriptor for event sync mode
+///
+/// @details
+///     - Implementation must support ::ZEX_intel_experimental_event_sync_mode extension
+///     - May be passed to ze_event_desc_t through pNext.
+typedef struct _zex_intel_event_sync_mode_exp_desc_t {
+    ze_structure_type_ext_t stype;                       ///< [in] type of this structure
+    const void *pNext;                                   ///< [in][optional] must be null or a pointer to an extension-specific
+                                                         ///< structure (i.e. contains stype and pNext).
+    zex_intel_event_sync_mode_exp_flags_t syncModeFlags; /// <in> valid combination of ::ze_intel_event_sync_mode_exp_flag_t
+    uint32_t externalInterruptId;                        /// <in> External interrupt id. Used only when ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_EXTERNAL_INTERRUPT_WAIT flag is set
+} zex_intel_event_sync_mode_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zex_intel_queue_allocate_msix_hint_exp_desc_t
+typedef struct _zex_intel_queue_allocate_msix_hint_exp_desc_t zex_intel_queue_allocate_msix_hint_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Command queue descriptor for allocating unique msix. This structure may be
+/// passed as pNext member of ::ze_command_queue_desc_t.
+
+typedef struct _zex_intel_queue_allocate_msix_hint_exp_desc_t {
+    ze_structure_type_ext_t stype; ///< [in] type of this structure
+    const void *pNext;             ///< [in][optional] must be null or a pointer to an extension-specific
+                                   ///< structure (i.e. contains stype and pNext).
+    ze_bool_t uniqueMsix;          ///< [in] If set, try to allocate unique msix for command queue.
+                                   ///< If not set, driver will follow default behaviour. It may share msix for signaling completion with other queues.
+                                   ///< Number of unique msixes may be limited. On unsuccessful allocation, queue or immediate cmd list creation API fallbacks to default behaviour.
+
+} zex_intel_queue_allocate_msix_hint_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Command queue descriptor for enabling copy operations offload. This structure may be
+/// passed as pNext member of ::ze_command_queue_desc_t.
+
+typedef struct _zex_intel_queue_copy_operations_offload_hint_exp_desc_t {
+    ze_structure_type_ext_t stype; ///< [in] type of this structure
+    const void *pNext;             ///< [in][optional] must be null or a pointer to an extension-specific
+                                   ///< structure (i.e. contains stype and pNext).
+    ze_bool_t copyOffloadEnabled;  ///< [in] If set, try to offload copy operations to different engines. Applicable only for compute queues.
+                                   ///< This is only a hint. Driver may ignore it per append call, based on platform capabilities or internal heuristics.
+                                   ///< If not set, driver will follow default behaviour. Copy operations will be submitted to same engine as compute operations.
+
+} zex_intel_queue_copy_operations_offload_hint_exp_desc_t;
+
+#ifndef ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME
+/// @brief Queue copy operations offload hint extension name
+#define ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME "ZEX_intel_experimental_queue_copy_operations_offload_hint"
+#endif // ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Queue copy operations offload hint extension version(s)
+typedef enum _zex_intel_queue_copy_operations_offload_hint_exp_version_t {
+    ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+} zex_intel_queue_copy_operations_offload_hint_exp_version_t;
+
+#ifndef ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME
+/// @brief Extension name for query to read the Intel Level Zero Driver Version String
+#define ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME "ZE_intel_get_driver_version_string"
+#endif // ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Query to read the Intel Level Zero Driver Version String extension version(s)
+typedef enum _ze_intel_get_driver_version_string_exp_version_t {
+    ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+} ze_intel_get_driver_version_string_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Supported 2D Block Array flags
+typedef uint32_t ze_intel_device_block_array_exp_flags_t;
+typedef enum _ze_intel_device_block_array_exp_flag_t {
+    ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_STORE = ZE_BIT(0), ///< Supports store operation
+    ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_LOAD = ZE_BIT(1),  ///< Supports load operation
+    ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_FORCE_UINT32 = 0x7fffffff
+
+} ze_intel_device_block_array_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME
+/// @brief Device 2D block array properties driver extension name
+#define ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME "ZE_intel_experimental_device_block_array_properties"
+#endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME
+
+/// @brief Device 2D block array properties queried using
+///        ::zeDeviceGetProperties
+///
+/// @details
+///     - This structure may be passed to ::zeDeviceGetProperties, via
+///       `pNext` member of ::ze_device_properties_t.
+/// @brief Device 2D block array properties
+
+typedef struct _ze_intel_device_block_array_exp_properties_t {
+    ze_structure_type_ext_t stype = ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES; ///< [in] type of this structure
+    void *pNext;                                                                ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                                ///< structure (i.e. contains sType and pNext).
+    ze_intel_device_block_array_exp_flags_t flags;                              ///< [out] 0 (none) or a valid combination of ::ze_intel_device_block_array_exp_flag_t
+} ze_intel_device_block_array_exp_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Device 2D block array properties driver extension versions
+typedef enum _ze_intel_device_block_array_exp_properties_version_t {
+    ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0
+    ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} ze_intel_device_block_array_exp_properties_version_t;
+
+/// @brief Query to read the Intel Level Zero Driver Version String
+///
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+///     - The Driver Version String will be in the format:
+///     - Major.Minor.Patch+Optional per semver guidelines https://semver.org/#spec-item-10
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+ze_result_t ZE_APICALL
+zeIntelGetDriverVersionString(
+    ze_driver_handle_t hDriver, ///< [in] Driver handle whose version is being read.
+    char *pDriverVersion,       ///< [in,out] pointer to driver version string.
+    size_t *pVersionSize);      ///< [in,out] pointer to the size of the driver version string.
+                                ///< if size is zero, then the size of the version string is returned.
+
+/// @brief Get Kernel Program Binary
+///
+/// @details
+///     - A valid kernel handle must be created with zeKernelCreate.
+///     - Returns Intel Graphics Assembly (GEN ISA) format binary program data for kernel handle.
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+
+#ifndef ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME
+/// @brief Get Kernel Program Binary experimental name
+#define ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME "ZE_intel_experimental_kernel_get_program_binary"
+#endif // ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intel Kernel Get Binary Extension Version(s)
+typedef enum _ze_intel_kernel_get_binary_exp_version_t {
+    ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} ze_intel_kernel_get_binary_exp_version_t;
+
+ze_result_t ZE_APICALL
+zeIntelKernelGetBinaryExp(
+    ze_kernel_handle_t hKernel, ///< [in] Kernel handle
+    size_t *pSize,              ///< [in, out] pointer to variable with size of GEN ISA binary
+    char *pKernelBinary         ///< [in,out] pointer to storage area for GEN ISA binary function
+);
+
+/// @brief Get default context associated with driver
+///
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+///     - Default context contains all devices within driver instance
+/// @returns
+///     - Context handle associated with driver
+ze_context_handle_t ZE_APICALL zeDriverGetDefaultContext(ze_driver_handle_t hDriver); ///> [in] handle of the driver
+
+/// @brief Get default context associated with default driver
+///
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+///     - Default context contains all devices within default driver instance
+/// @returns
+///     - Context handle associated with default driver
+ze_context_handle_t ZE_APICALL zerDriverGetDefaultContext();
+
+/// @brief Get Device Identifier
+///
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+///     - Returned identifier is a 32-bit unsigned integer that is unique to the driver.
+///     - The identifier can be used then in zerIdentifierTranslateToDeviceHandle to get the device handle.
+/// @returns
+///     - 32-bit unsigned integer identifier
+uint32_t ZE_APICALL zerDeviceTranslateToIdentifier(ze_device_handle_t hDevice); ///< [in] handle of the device
+
+/// @brief Translate Device Identifier to Device Handle from default Driver
+///
+/// @details
+///    - The application may call this function from simultaneous threads.
+///    - The implementation of this function should be lock-free.
+///    - Returned device is associated to default driver handle.
+/// @returns
+///     - device handle associated with the identifier
+ze_device_handle_t ZE_APICALL zerIdentifierTranslateToDeviceHandle(uint32_t identifier); ///< [in] integer identifier of the device
+
+/// @brief Global device synchronization
+///
+/// @details
+///    - The application may call this function from simultaneous threads.
+///    - The implementation of this function should be lock-free.
+///    - Ensures that everything that was submitted to the device is completed.
+///    - Ensures that all submissions in all queues on device are completed.
+///    - It is not allowed to call this function while some command list are in graph capture mode.
+///    - Returns error if error is detected during execution on device.
+///    - Hangs indefinitely if GPU execution is blocked on non signaled event.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+ze_result_t ZE_APICALL zeDeviceSynchronize(ze_device_handle_t hDevice); ///> [in] handle of the device
+
+/// @brief Get priority levels
+///
+/// @details
+///    - The application may call this function from simultaneous threads.
+///    - The implementation of this function should be lock-free.
+///    - Returns priority levels supported by the device
+///    - lowestPriority reports the numerical value that corresponds to lowest queue priority
+///    - highesPriority reports the numerical value that corresponds to highest queue priority
+///    - Lower numbers indicate greater priorities
+///    - The range of meaningful queue properties is represented by [*highestPriority, *lowestPriority]
+///    - Priority passed upon queue creation would automatically clamp down or up to the nearest supported value
+///    - 0 means default priority
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+ze_result_t ZE_APICALL zeDeviceGetPriorityLevels(
+    ze_device_handle_t hDevice,
+    int *lowestPriority,
+    int *highestPriority);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Descriptor used for setting priority on command queues and immediate command lists.
+/// This structure may be passed as pNext member of ::ze_command_queue_desc_t.
+typedef struct _ze_queue_priority_desc_t {
+    ze_structure_type_t stype; ///< [in] type of this structure
+    const void *pNext;         ///< [in][optional] must be null or a pointer to an extension-specific structure
+    int priority;              ///< [in] priority of the queue
+} ze_queue_priority_desc_t;
+
+/// @brief Append with arguments
+///
+/// @details
+///    - The application may call this function from simultaneous threads.
+///    - The implementation of this function should be lock-free.
+///    - Appends kernel to command list with arguments.
+///    - Kernel object state is updated with new arguments, as if separate zeKernelSetArgumentValue were called.
+///    - If argument is SLM (size), then SLM size in bytes for this resource is provided under pointer on specific index and its type is size_t.
+///    - If argument is an immediate type (i.e. structure, non pointer type), then values under pointer must contain full size of immediate type.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hCommandList`
+///         + `nullptr == hKernel`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pArguments`
+///     - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT
+///     - ::ZE_RESULT_ERROR_INVALID_SIZE
+///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
+typedef struct _ze_group_size_t {
+    uint32_t groupSizeX; ///< [in] local work-group size in X dimension
+    uint32_t groupSizeY; ///< [in] local work-group size in Y dimension
+    uint32_t groupSizeZ; ///< [in] local work-group size in Z dimension
+
+} ze_group_size_t;
+
+ze_result_t ZE_APICALL zeCommandListAppendLaunchKernelWithArguments(
+    ze_command_list_handle_t hCommandList, ///< [in] handle of the command list
+    ze_kernel_handle_t hKernel,            ///< [in] handle of the kernel object
+    const ze_group_count_t groupCounts,    ///< [in] thread group counts
+    const ze_group_size_t groupSizes,      ///< [in] thread group sizes
+    void **pArguments,                     ///< [in] kernel arguments; pointer to list where each argument represents a pointer to the argument value on specific index
+    void *pNext,                           ///< [in][optional] extensions
+    ze_event_handle_t hSignalEvent,        ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                ///< [in][optional] number of events to wait on before launching
+    ze_event_handle_t *phWaitEvents);      ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait on before launching
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+#endif
diff --git a/thirdparty/level_zero/ze_stypes.h b/thirdparty/level_zero/ze_stypes.h
new file mode 100644
index 00000000000000..19b1efac292854
--- /dev/null
+++ b/thirdparty/level_zero/ze_stypes.h
@@ -0,0 +1,43 @@
+// intel/compute-runtime e96840a03ec41659772ca0bea3338bdd688ae4b5
+/*
+ * Copyright (C) 2024-2025 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#ifndef _ZE_STYPES_H
+#define _ZE_STYPES_H
+
+#include <ze_api.h>
+#include <zet_api.h>
+
+#include <cstdint>
+using ze_structure_type_ext_t = uint32_t;
+using zet_structure_type_ext_t = uint32_t;
+
+#define ZE_STRUCTURE_TYPE_SYNCHRONIZED_DISPATCH_EXP_DESC static_cast<ze_structure_type_ext_t>(0x00020020)
+#define ZE_STRUCTURE_TYPE_INTEL_MEDIA_COMMUNICATION_DESC static_cast<ze_structure_type_ext_t>(0x00020021)
+#define ZE_STRUCTURE_TYPE_INTEL_MEDIA_DOORBELL_HANDLE_DESC static_cast<ze_structure_type_ext_t>(0x00020022)
+#define ZE_STRUCTURE_TYPE_INTEL_DEVICE_MEDIA_EXP_PROPERTIES static_cast<ze_structure_type_ext_t>(0x00020023)
+#define ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES static_cast<ze_structure_type_ext_t>(0x00030007)
+#define ZEX_STRUCTURE_DEVICE_MODULE_REGISTER_FILE_EXP static_cast<ze_structure_type_ext_t>(0x00030010)
+#define ZEX_STRUCTURE_KERNEL_REGISTER_FILE_SIZE_EXP static_cast<ze_structure_type_ext_t>(0x00030012)
+#define ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES static_cast<ze_structure_type_ext_t>(0x00030013)
+#define ZEX_INTEL_STRUCTURE_TYPE_EVENT_SYNC_MODE_EXP_DESC static_cast<ze_structure_type_ext_t>(0x00030016)
+#define ZE_INTEL_STRUCTURE_TYPE_DEVICE_COMMAND_LIST_WAIT_ON_MEMORY_DATA_SIZE_EXP_DESC static_cast<ze_structure_type_ext_t>(0x00030017)
+#define ZEX_INTEL_STRUCTURE_TYPE_QUEUE_ALLOCATE_MSIX_HINT_EXP_PROPERTIES static_cast<ze_structure_type_ext_t>(0x00030018)
+#define ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES static_cast<ze_structure_type_ext_t>(0x0003001B)
+#define ZE_STRUCTURE_INTEL_DEVICE_MEMORY_CXL_EXP_PROPERTIES static_cast<ze_structure_type_ext_t>(0x00030019)
+#define ZEX_STRUCTURE_COUNTER_BASED_EVENT_DESC static_cast<ze_structure_type_ext_t>(0x0003001C)
+#define ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_SYNC_ALLOC_PROPERTIES static_cast<ze_structure_type_ext_t>(0x0003001D)
+#define ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_STORAGE_ALLOC_PROPERTIES static_cast<ze_structure_type_ext_t>(0x00030027)
+#define ZE_STRUCTURE_TYPE_QUEUE_PRIORITY_DESC static_cast<ze_structure_type_ext_t>(0x00030028)
+
+// Metric structure types
+#define ZET_INTEL_STRUCTURE_TYPE_METRIC_GROUP_CALCULATE_EXP_PROPERTIES static_cast<zet_structure_type_ext_t>(0x00010008)
+#define ZET_INTEL_STRUCTURE_TYPE_METRIC_CALCULATE_DESC_EXP static_cast<zet_structure_type_ext_t>(0x00010009)
+#define ZET_INTEL_STRUCTURE_TYPE_METRIC_SOURCE_ID_EXP static_cast<zet_structure_type_ext_t>(0x0001000a)
+#define ZET_INTEL_STRUCTURE_TYPE_METRIC_DECODED_BUFFER_PROPERTIES_EXP static_cast<zet_structure_type_ext_t>(0x0001000b)
+
+#endif

From ce58599bef5470e4fac3e7ac6c2fe31ca4c2863e Mon Sep 17 00:00:00 2001
From: Jakub Kasprzak <jakub.kasprzak@intel.com>
Date: Thu, 10 Jul 2025 16:05:29 +0000
Subject: [PATCH 02/74] Fix L0 DPAS check

---
 .../include/intel_gpu/runtime/device_info.hpp        | 10 ++++++----
 src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp |  1 +
 src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp   | 12 +++++++++++-
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
index 738515a67b9a1b..d121ace418bd53 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
@@ -110,6 +110,8 @@ struct device_info {
     bool supports_usm;                          ///< Does engine support unified shared memory.
     bool has_separate_cache;                    ///< Does the target hardware has separate cache for usm_device and usm_host
 
+    bool supports_cp_offload;                   ///< [L0] Does the command queue support copy offload
+
     std::vector<size_t> supported_simd_sizes;   ///< List of SIMD sizes supported by current device and compiler
 
     uint32_t vendor_id;                         ///< Vendor ID
@@ -131,10 +133,10 @@ struct device_info {
 
     pci_bus_info pci_info;                      ///< PCI bus information for the device
 
-    uint64_t timer_resolution;                  ///< Resolution of device timer used for profiling in cycles/sec
-    uint32_t kernel_timestamp_valid_bits;       ///< Number of valid bits in the kernel timestamp values
-    uint32_t compute_queue_group_ordinal;       ///< Ordinal of the command queue group with compute support
-    uint32_t device_memory_ordinal;             ///< Ordinal of the selected global device memory
+    uint64_t timer_resolution;                  ///< [L0] Resolution of device timer used for profiling in cycles/sec
+    uint32_t kernel_timestamp_valid_bits;       ///< [L0] Number of valid bits in the kernel timestamp values
+    uint32_t compute_queue_group_ordinal;       ///< [L0] Ordinal of the command queue group with compute support
+    uint32_t device_memory_ordinal;             ///< [L0] Ordinal of the selected global device memory
 
     ov::device::UUID uuid;                      ///< UUID of the gpu device
     ov::device::LUID luid;                      ///< LUID of the gpu device
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
index e17db33c52e77c..6d65086072afe3 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
@@ -348,6 +348,7 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex
     info.kernel_timestamp_valid_bits = 0;
     info.compute_queue_group_ordinal = 0;
     info.device_memory_ordinal = 0;
+    info.supports_cp_offload = false;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     using namespace dnnl::impl::gpu::intel::jit;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index c4fa62668675fe..0974432a0112ec 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -6,6 +6,7 @@
 #include "ze_common.hpp"
 
 #include <ze_api.h>
+#include <ze_intel_gpu.h>
 #include <vector>
 #include <algorithm>
 #include <cassert>
@@ -50,6 +51,10 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0);
     bool supports_mutable_list = supports_extension(extensions, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, ZE_MUTABLE_COMMAND_LIST_EXP_VERSION_1_0);
     bool supports_pci_properties = supports_extension(extensions, ZE_PCI_PROPERTIES_EXT_NAME, ZE_PCI_PROPERTIES_EXT_VERSION_1_0);
+    bool supports_cp_offload =
+        supports_extension(extensions, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_1_0);
+    bool supports_dp_properties =
+        supports_extension(extensions, ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME, ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_1_0);
 
     ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, nullptr, 0};
     ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, supports_ip_version ? &ip_version_properties : nullptr};
@@ -92,6 +97,10 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     });
 
     ze_device_module_properties_t device_module_properties{ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES};
+    ze_intel_device_module_dp_exp_properties_t dp_properties{ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES, nullptr};
+    if (supports_dp_properties) {
+        device_module_properties.pNext = &dp_properties;
+    }
     ZE_CHECK(zeDeviceGetModuleProperties(device, &device_module_properties));
 
     ze_device_image_properties_t device_image_properties{ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES};
@@ -136,9 +145,10 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.supports_intel_subgroups_short = true;
     info.supports_intel_subgroups_char = true;
     info.supports_intel_required_subgroup_size = true;
+    info.supports_cp_offload = supports_cp_offload;
 
     info.supports_imad = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_DP4A) != 0;
-    info.supports_immad = false; // FIXME
+    info.supports_immad = supports_dp_properties && (dp_properties.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS) != 0;
 
     info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities;
 

From f38ae587b277fdac3db8fe8e5e655bda2d39ece6 Mon Sep 17 00:00:00 2001
From: Jakub Kasprzak <jakub.kasprzak@intel.com>
Date: Thu, 10 Jul 2025 16:13:45 +0000
Subject: [PATCH 03/74] Use copy offload only when supported

---
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index c7341c9d30b5a5..46c73e8e286fea 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -185,7 +185,9 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES;
     cp_offload_desc.copyOffloadEnabled = true;
     cp_offload_desc.pNext = nullptr;
-    command_queue_desc.pNext = &cp_offload_desc;
+    if (info.supports_cp_offload) {
+        command_queue_desc.pNext = &cp_offload_desc;
+    }
 
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
 }

From 8aaec53e819a17222962438e1ba0e31904af3550 Mon Sep 17 00:00:00 2001
From: Jakub Kasprzak <jakub.kasprzak@intel.com>
Date: Fri, 11 Jul 2025 13:26:04 +0000
Subject: [PATCH 04/74] Remove dlopen for L0

---
 .../intel_gpu/src/runtime/ze/ze_common.cpp    | 39 -------------------
 .../intel_gpu/src/runtime/ze/ze_common.hpp    |  7 ----
 .../intel_gpu/src/runtime/ze/ze_engine.cpp    | 13 +------
 3 files changed, 1 insertion(+), 58 deletions(-)
 delete mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp
deleted file mode 100644
index fddcda042d7fa0..00000000000000
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (C) 2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "ze_common.hpp"
-#include "openvino/core/except.hpp"
-
-#if defined(__linux__)
-#include <dlfcn.h>
-#elif defined(_WIN32)
-#include "windows.h"
-#else
-#error "Level Zero is supported on Linux and Windows only"
-#endif
-
-namespace cldnn {
-namespace ze {
-
-void *find_ze_symbol(const char *symbol) {
-#if defined(__linux__)
-    void *handle = dlopen("libze_loader.so.1", RTLD_NOW | RTLD_LOCAL);
-#elif defined(_WIN32)
-    HMODULE handle = LoadLibraryA("ze_loader.dll");
-#endif
-    if (!handle) {
-        return nullptr;
-    }
-
-#if defined(__linux__)
-    void *f = dlsym(handle, symbol);
-#elif defined(_WIN32)
-    void *f = GetProcAddress(handle, symbol);
-#endif
-    OPENVINO_ASSERT(f != nullptr);
-    return f;
-}
-
-}  // namespace ze
-}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
index 562167458288f3..fc7f98810611e7 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
@@ -21,12 +21,5 @@ namespace ze {
 
 static constexpr uint64_t default_timeout = std::numeric_limits<uint64_t>::max();
 
-void* find_ze_symbol(const char *symbol);
-
-template <typename F>
-F find_ze_symbol(const char *symbol) {
-    return (F)find_ze_symbol(symbol);
-}
-
 }  // namespace ze
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
index a493ac23005ab0..444169d3dda779 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -23,17 +23,6 @@ namespace ze {
 
 namespace {
 
-void func_zeModuleCreate(ze_context_handle_t hContext,
-                         ze_device_handle_t hDevice, const ze_module_desc_t *desc,
-                         ze_module_handle_t *phModule,
-                         ze_module_build_log_handle_t *phBuildLog) {
-    static auto f = find_ze_symbol<decltype(&zeModuleCreate)>("zeModuleCreate");
-
-    if (!f)
-        throw std::runtime_error("zeModuleCreate was not found");
-    ZE_CHECK(f(hContext, hDevice, desc, phModule, phBuildLog));
-}
-
 ze_module_handle_t ze_create_module_with_level_zero(const cldnn::ze::ze_engine& engine, std::vector<uint8_t> binary) {
     auto desc = ze_module_desc_t();
     desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
@@ -47,7 +36,7 @@ ze_module_handle_t ze_create_module_with_level_zero(const cldnn::ze::ze_engine&
 
     auto ze_device = engine.get_device();
     auto ze_ctx = engine.get_context();
-    func_zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr);
+    zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr);
     return ze_module;
 }
 

From f8eb99a5d2e4925cc257a36d8917435a704882a8 Mon Sep 17 00:00:00 2001
From: Jakub Kasprzak <jakub.kasprzak@intel.com>
Date: Mon, 4 Aug 2025 13:56:24 +0000
Subject: [PATCH 05/74] Add dep_events for l0 mem fill

---
 .../intel_gpu/src/runtime/ze/ze_memory.cpp    | 28 ++++++++++++++++---
 .../intel_gpu/src/runtime/ze/ze_memory.hpp    |  4 +--
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
index 3b65bf73e3c2a7..49048af8572705 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
@@ -23,6 +23,18 @@ static inline cldnn::event::ptr create_event(stream& stream, size_t bytes_count)
     return stream.create_base_event();
 }
 
+std::vector<ze_event_handle_t> get_ze_events(const std::vector<event::ptr>& events) {
+    std::vector<ze_event_handle_t> ze_events;
+    ze_events.reserve(events.size());
+     for (const auto& ev : events) {
+        auto ze_event = downcast<ze::ze_base_event>(ev.get())->get();
+        if (ze_event != nullptr) {
+            ze_events.push_back(ze_event);
+        }
+    }
+    return ze_events;
+}
+
 }  // namespace
 
 allocation_type gpu_usm::detect_allocation_type(const ze_engine* engine, const void* mem_ptr) {
@@ -123,12 +135,20 @@ void gpu_usm::unlock(const stream& /* stream */) {
     }
 }
 
-event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, bool blocking) {
+event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vector<event::ptr>& dep_events, bool blocking) {
     auto& _ze_stream = downcast<ze_stream>(stream);
     auto ev = _ze_stream.create_base_event();
     auto ev_ze = downcast<ze::ze_base_event>(ev.get())->get();
     std::vector<unsigned char> temp_buffer(_bytes_count, pattern);
-    ZE_CHECK(zeCommandListAppendMemoryFill(_ze_stream.get_queue(), _buffer.get(), temp_buffer.data(), 1, _bytes_count, ev_ze, 0, nullptr));
+    auto ze_dep_events = get_ze_events(dep_events);
+    ZE_CHECK(zeCommandListAppendMemoryFill(_ze_stream.get_queue(),
+        _buffer.get(),
+        temp_buffer.data(),
+        1,
+        _bytes_count,
+        ev_ze,
+        ze_dep_events.size(),
+        ze_dep_events.data()));
 
     if (blocking) {
         ev->wait();
@@ -136,8 +156,8 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, bool blocking) {
     return ev;
 }
 
-event::ptr gpu_usm::fill(stream& stream, bool blocking) {
-    return fill(stream, 0, blocking);
+event::ptr gpu_usm::fill(stream& stream, const std::vector<event::ptr>& dep_events, bool blocking) {
+    return fill(stream, 0, dep_events, blocking);
 }
 
 event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
index b1914de00bec49..acb694b2eb4b2f 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
@@ -132,8 +132,8 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
     const ze::UsmMemory& get_buffer() const { return _buffer; }
     ze::UsmMemory& get_buffer() { return _buffer; }
 
-    event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) override;
-    event::ptr fill(stream& stream, bool blocking = true) override;
+    event::ptr fill(stream& stream, unsigned char pattern, const std::vector<event::ptr>& dep_events = {}, bool blocking = true) override;
+    event::ptr fill(stream& stream, const std::vector<event::ptr>& dep_events = {}, bool blocking = true) override;
     shared_mem_params get_internal_params() const override;
     void* buffer_ptr() const override { return _buffer.get(); }
 

From 44d0a7962661036ad962bb371cdbe4b240bdabbc Mon Sep 17 00:00:00 2001
From: Jakub Kasprzak <jakub.kasprzak@intel.com>
Date: Tue, 5 Aug 2025 15:33:14 +0000
Subject: [PATCH 06/74] Add OneDNN with L0 support

---
 cmake/features.cmake                               |  5 +----
 src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp | 11 +++++++++--
 src/plugins/intel_gpu/thirdparty/CMakeLists.txt    |  6 +++---
 src/plugins/intel_gpu/thirdparty/onednn_gpu        |  2 +-
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/cmake/features.cmake b/cmake/features.cmake
index 13aa0d20c77c14..346e0cce17fa43 100644
--- a/cmake/features.cmake
+++ b/cmake/features.cmake
@@ -41,13 +41,10 @@ else()
     set(ENABLE_ONEDNN_FOR_GPU_DEFAULT ON)
 endif()
 
+# Set default GPU runtime to L0 for now
 set(OV_GPU_DEFAULT_RT "L0")
 if (ENABLE_INTEL_GPU)
     ov_option_enum (GPU_RT_TYPE "Type of GPU runtime. Supported value: OCL and L0" ${OV_GPU_DEFAULT_RT} ALLOWED_VALUES L0 OCL)
-    if (GPU_RT_TYPE STREQUAL "L0")
-        # There's no interop with native L0 in onednn API. Temporary disable onednn when L0 runtime is selected
-        set(ENABLE_ONEDNN_FOR_GPU_DEFAULT OFF)
-    endif()
 endif()
 
 ov_dependent_option (ENABLE_ONEDNN_FOR_GPU "Enable oneDNN with GPU support" ${ENABLE_ONEDNN_FOR_GPU_DEFAULT} "ENABLE_INTEL_GPU" OFF)
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
index 444169d3dda779..f9e1a4874324ab 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -17,7 +17,9 @@
 #include <memory>
 #include <stdexcept>
 
-
+#ifdef ENABLE_ONEDNN_FOR_GPU
+#include <oneapi/dnnl/dnnl_l0.hpp>
+#endif
 namespace cldnn {
 namespace ze {
 
@@ -54,7 +56,12 @@ ze_engine::ze_engine(const device::ptr dev, runtime_types runtime_type)
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
 void ze_engine::create_onednn_engine(const ExecutionConfig& config) {
-    OPENVINO_NOT_IMPLEMENTED;
+    const std::lock_guard<std::mutex> lock(onednn_mutex);
+    OPENVINO_ASSERT(_device->get_info().vendor_id == INTEL_VENDOR_ID, "[GPU] OneDNN engine can be used for Intel GPUs only");
+    if (!_onednn_engine) {
+        auto casted = std::dynamic_pointer_cast<ze_device>(_device);
+        _onednn_engine = std::make_shared<dnnl::engine>(dnnl::l0_interop::make_engine(casted->get_driver(), casted->get_device(), casted->get_context()));
+    }
 }
 
 dnnl::engine& ze_engine::get_onednn_engine() const {
diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
index e47b2b1ffdbb3a..4a41b9463bfb55 100644
--- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
@@ -9,7 +9,7 @@ set(XETLA_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/xetla/include/" CACHE PATH "P
 #
 
 if(ENABLE_ONEDNN_FOR_GPU)
-    function(build_onednn_gpu)
+    function(build_onednn_gpu GPU_RUNTIME)
         include(ExternalProject)
         set(ONEDNN_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_build")
         set(ONEDNN_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_install" CACHE PATH "Installation path for oneDNN GPU library")
@@ -113,7 +113,7 @@ if(ENABLE_ONEDNN_FOR_GPU)
                 "-DCMAKE_POLICY_DEFAULT_CMP0069=NEW"
                 "-DDNNL_TARGET_ARCH=${ONEDNN_TARGET_ARCH}"
                 "-DDNNL_CPU_RUNTIME=NONE"
-                "-DDNNL_GPU_RUNTIME=OCL"
+                "-DDNNL_GPU_RUNTIME=${GPU_RUNTIME}"
                 "-DDNNL_LIBRARY_NAME=${DNNL_GPU_LIBRARY_NAME}"
                 "-DCMAKE_INSTALL_PREFIX=${ONEDNN_INSTALL_DIR}"
                 "-DDNNL_ENABLE_CONCURRENT_EXEC=ON"
@@ -188,5 +188,5 @@ if(ENABLE_ONEDNN_FOR_GPU)
                     COMPONENT ${OV_CPACK_COMP_CORE})
         endif()
     endfunction()
-    build_onednn_gpu()
+    build_onednn_gpu(${GPU_RT_TYPE})
 endif()
diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu
index 2aec7fc8c825ec..16a24c084eb18d 160000
--- a/src/plugins/intel_gpu/thirdparty/onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu
@@ -1 +1 @@
-Subproject commit 2aec7fc8c825ec8e03441d1dfd66bffa027e362c
+Subproject commit 16a24c084eb18d8a7479f1147ced17f45355ab9d

From c3b1a6735dcc35eb34e615d8f21a64da3fb265cd Mon Sep 17 00:00:00 2001
From: Jakub Kasprzak <jakub.kasprzak@intel.com>
Date: Mon, 11 Aug 2025 16:34:25 +0000
Subject: [PATCH 07/74] Adjust to L0 OneDNN

---
 .../intel_gpu/src/runtime/ocl/ocl_engine.cpp  |  5 ++-
 .../intel_gpu/src/runtime/ocl/ocl_memory.cpp  |  8 ++++
 .../intel_gpu/src/runtime/ocl/ocl_stream.cpp  |  4 ++
 .../intel_gpu/src/runtime/ze/ze_device.cpp    | 45 +++++++++++++++++--
 .../intel_gpu/src/runtime/ze/ze_memory.cpp    | 13 ++++++
 .../intel_gpu/src/runtime/ze/ze_memory.hpp    |  3 ++
 .../intel_gpu/src/runtime/ze/ze_stream.cpp    | 16 +++++++
 .../intel_gpu/src/runtime/ze/ze_stream.hpp    |  2 +-
 src/plugins/intel_gpu/thirdparty/onednn_gpu   |  2 +-
 9 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
index e44ce07c8c0a76..0eec437378a501 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -63,8 +63,11 @@ void ocl_engine::create_onednn_engine(const ExecutionConfig& config) {
     if (!_onednn_engine) {
         auto casted = std::dynamic_pointer_cast<ocl_device>(_device);
         OPENVINO_ASSERT(casted, "[GPU] Invalid device type stored in ocl_engine");
-
+#ifdef OV_GPU_WITH_ZE_RT
+        OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime");
+#else
         _onednn_engine = std::make_shared<dnnl::engine>(dnnl::ocl_interop::make_engine(casted->get_device().get(), casted->get_context().get()));
+#endif
     }
 }
 
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp
index 1528f546181ff7..21bcc0ef20739e 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp
@@ -229,7 +229,11 @@ event::ptr gpu_buffer::copy_to(stream& stream, void* data_ptr, size_t src_offset
 dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const {
     auto onednn_engine = _engine->get_onednn_engine();
     dnnl::memory dnnl_mem(desc, onednn_engine, DNNL_MEMORY_NONE);
+#ifdef OV_GPU_WITH_ZE_RT
+    OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime");
+#else
     dnnl::ocl_interop::set_mem_object(dnnl_mem, _buffer.get());
+#endif
     return dnnl_mem;
 }
 #endif
@@ -653,9 +657,13 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s
 #ifdef ENABLE_ONEDNN_FOR_GPU
 dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const {
     auto onednn_engine = _engine->get_onednn_engine();
+#ifdef OV_GPU_WITH_ZE_RT
+        OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime");
+#else
     dnnl::memory dnnl_mem = dnnl::ocl_interop::make_memory(desc, onednn_engine, dnnl::ocl_interop::memory_kind::usm,
         reinterpret_cast<uint8_t*>(_buffer.get()) + offset);
     return dnnl_mem;
+#endif
 }
 #endif
 
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
index f5e9b74a5e681b..a0cb17ba40f615 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
@@ -239,7 +239,11 @@ dnnl::stream& ocl_stream::get_onednn_stream() {
     OPENVINO_ASSERT(m_queue_type == QueueTypes::in_order, "[GPU] Can't create onednn stream handle as onednn doesn't support out-of-order queue");
     OPENVINO_ASSERT(_engine.get_device_info().vendor_id == INTEL_VENDOR_ID, "[GPU] Can't create onednn stream handle as for non-Intel devices");
     if (!_onednn_stream) {
+#ifdef OV_GPU_WITH_ZE_RT
+        OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime");
+#else
         _onednn_stream = std::make_shared<dnnl::stream>(dnnl::ocl_interop::make_stream(_engine.get_onednn_engine(), _command_queue.get()));
+#endif
     }
 
     return *_onednn_stream;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index 0974432a0112ec..339b381c0e2e51 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -24,10 +24,32 @@
 #include <dlfcn.h>
 #endif
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
+#include "gpu/intel/jit/generator.hpp"
+#endif
+
 namespace cldnn {
 namespace ze {
 
 namespace {
+#ifdef ENABLE_ONEDNN_FOR_GPU
+//TODO merge this with ocl_device
+gpu_arch convert_ngen_arch(ngen::HW gpu_arch) {
+    switch (gpu_arch) {
+        case ngen::HW::Gen9: return gpu_arch::gen9;
+        case ngen::HW::Gen11: return gpu_arch::gen11;
+        case ngen::HW::XeLP: return gpu_arch::xe_lp;
+        case ngen::HW::XeHP: return gpu_arch::xe_hp;
+        case ngen::HW::XeHPG: return gpu_arch::xe_hpg;
+        case ngen::HW::XeHPC: return gpu_arch::xe_hpc;
+        case ngen::HW::Xe2: return gpu_arch::xe2;
+        case ngen::HW::Xe3: return gpu_arch::xe3;
+        case ngen::HW::Gen10:
+        case ngen::HW::Unknown: return gpu_arch::unknown;
+    }
+    return gpu_arch::unknown;
+}
+#endif
 
 bool supports_extension(const std::vector<ze_driver_extension_properties_t>& extensions, const std::string& ext_name, uint32_t ext_ver) {
     return std::find_if(extensions.begin(), extensions.end(), [&ext_name, &ext_ver](const ze_driver_extension_properties_t& ep) {
@@ -153,7 +175,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities;
 
     info.gfx_ver = {0, 0, 0}; // could find how to retrieve this from L0 so far
-    info.arch = gpu_arch::unknown;
     info.ip_version = ip_version_properties.ipVersion;
     info.sub_device_idx = (std::numeric_limits<uint32_t>::max)();
 
@@ -208,6 +229,23 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
         }
     }
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    using namespace dnnl::impl::gpu::intel::jit;
+    // Create temporary context just for OneDNN HW detection
+    ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 };
+    ze_context_handle_t context;
+    ZE_CHECK(zeContextCreate(driver, &context_desc, &context));
+    ngen::Product product = ngen::LevelZeroCodeGenerator<ngen::HW::Unknown>::detectHWInfo(context, device);
+    zeContextDestroy(context);
+    info.arch = convert_ngen_arch(ngen::getCore(product.family));
+
+    if (product.family == ngen::ProductFamily::Unknown) {
+        info.supports_immad = false;
+    }
+#else  // ENABLE_ONEDNN_FOR_GPU
+    info.arch = gpu_arch::unknown;
+#endif  // ENABLE_ONEDNN_FOR_GPU
+
     return info;
 }
 
@@ -275,8 +313,9 @@ void ze_device::set_mem_caps(const memory_capabilities& memory_capabilities) {
 }
 
 ze_device::~ze_device() {
-    if (_is_initialized)
-        zeContextDestroy(_context);
+    //FIXME segfault
+    //if (_is_initialized)
+    //    zeContextDestroy(_context);
 }
 
 }  // namespace ze
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
index 49048af8572705..0c774e35b2ce7d 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
@@ -11,6 +11,10 @@
 #include <stdexcept>
 #include <vector>
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
+#include <oneapi/dnnl/dnnl_l0.hpp>
+#endif
+
 namespace cldnn {
 namespace ze {
 namespace {
@@ -236,6 +240,15 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s
     return result_event;
 }
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
+dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const {
+    auto onednn_engine = _engine->get_onednn_engine();
+    dnnl::memory dnnl_mem = dnnl::l0_interop::make_memory(desc, onednn_engine,
+        reinterpret_cast<uint8_t*>(_buffer.get()) + offset);
+    return dnnl_mem;
+}
+#endif
+
 shared_mem_params gpu_usm::get_internal_params() const {
     auto casted = downcast<ze_engine>(_engine);
     return {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
index acb694b2eb4b2f..b4368dc43b34e0 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
@@ -140,6 +140,9 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
     event::ptr copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override;
     event::ptr copy_from(stream& stream, const memory& src_mem, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override;
     event::ptr copy_to(stream& stream, void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) const override;
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    dnnl::memory get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const override;
+#endif
 
     static allocation_type detect_allocation_type(const ze_engine* engine, const void* mem_ptr);
     static allocation_type detect_allocation_type(const ze_engine* engine, const ze::UsmMemory& buffer);
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index 46c73e8e286fea..c68c06ad0c3cf2 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -23,6 +23,10 @@
 #include <vector>
 #include <memory>
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
+#include <oneapi/dnnl/dnnl_l0.hpp>
+#endif
+
 namespace cldnn {
 namespace ze {
 
@@ -346,5 +350,17 @@ void ze_stream::sync_events(std::vector<event::ptr> const& deps, bool is_output)
     }
 }
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
+dnnl::stream& ze_stream::get_onednn_stream() {
+    OPENVINO_ASSERT(m_queue_type == QueueTypes::in_order, "[GPU] Can't create onednn stream handle as onednn doesn't support out-of-order queue");
+    OPENVINO_ASSERT(_engine.get_device_info().vendor_id == INTEL_VENDOR_ID, "[GPU] Can't create onednn stream handle as for non-Intel devices");
+    if (!_onednn_stream) {
+        _onednn_stream = std::make_shared<dnnl::stream>(dnnl::l0_interop::make_stream(_engine.get_onednn_engine(), m_command_list));
+    }
+
+    return *_onednn_stream;
+}
+#endif
+
 }  // namespace ze
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
index 8af269489fbe3b..6dc270e399844d 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
@@ -47,7 +47,7 @@ class ze_stream : public stream {
     event::ptr create_base_event() override;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
-    dnnl::stream& get_onednn_stream() override { OPENVINO_NOT_IMPLEMENTED; }
+    dnnl::stream& get_onednn_stream() override;
 #endif
 
 private:
diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu
index 16a24c084eb18d..55d2ccb866428a 160000
--- a/src/plugins/intel_gpu/thirdparty/onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu
@@ -1 +1 @@
-Subproject commit 16a24c084eb18d8a7479f1147ced17f45355ab9d
+Subproject commit 55d2ccb866428ace7c2eeff221b60a6c1501c6fb

From 7f79b07cb507f982d543865a29b93f8ec8d029cd Mon Sep 17 00:00:00 2001
From: Jakub Kasprzak <jakub.kasprzak@intel.com>
Date: Tue, 12 Aug 2025 15:53:55 +0000
Subject: [PATCH 08/74] Disable OneDNN pooling

---
 src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp b/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp
index 8a308d505cf4b1..81c4c27189cb83 100644
--- a/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp
+++ b/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp
@@ -16,7 +16,8 @@ using namespace cldnn;
 
 const std::vector<std::shared_ptr<cldnn::ImplementationManager>>& Registry<pooling>::get_implementations() {
     static const std::vector<std::shared_ptr<ImplementationManager>> impls = {
-        OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::PoolingImplementationManager, shape_types::static_shape, [](const program_node& node) {
+        //FIXME: Disable for now as there is some issue when creating OneDNN descriptor - returns unimplemented
+        /*OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::PoolingImplementationManager, shape_types::static_shape, [](const program_node& node) {
             const auto& in_layout = node.get_input_layout(0);
             const auto& out_layout = node.get_output_layout(0);
             // Disable this case due to sporadic hang for the following case:
@@ -27,7 +28,7 @@ const std::vector<std::shared_ptr<cldnn::ImplementationManager>>& Registry<pooli
             if (in_layout.format == format::byxf && out_layout.format == format::bfyx && ov::element::Type(in_layout.data_type).is_integral_number())
                 return false;
             return true;
-        })
+        })*/
         OV_GPU_GET_INSTANCE_OCL(pooling, shape_types::static_shape)
     };
 

From 0e70375ff44f0d7f46f6f8881a66c5555cda79df Mon Sep 17 00:00:00 2001
From: Jakub Kasprzak <jakub.kasprzak@intel.com>
Date: Wed, 13 Aug 2025 09:18:08 +0000
Subject: [PATCH 09/74] Update L0 OneDNN submodule

---
 src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu
index 55d2ccb866428a..4292538ad8c45e 160000
--- a/src/plugins/intel_gpu/thirdparty/onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu
@@ -1 +1 @@
-Subproject commit 55d2ccb866428ace7c2eeff221b60a6c1501c6fb
+Subproject commit 4292538ad8c45e1691c85951c54f4a689b8f6597

From 35e492d39d32a630f09f51f89aa0b534da30d9de Mon Sep 17 00:00:00 2001
From: Jakub Kasprzak <jakub.kasprzak@intel.com>
Date: Wed, 13 Aug 2025 16:54:54 +0000
Subject: [PATCH 10/74] Adjust to new OneDNN

---
 .../intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp     | 2 +-
 src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp       | 2 +-
 src/plugins/intel_gpu/thirdparty/CMakeLists.txt                 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp
index 66027ff683ce2e..d922ebe02b0da6 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp
@@ -136,7 +136,7 @@ struct PrimitiveImplOCL : public cldnn::primitive_impl {
         }
     }
 
-    void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const engine& e) override {
+    void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const cldnn::engine& e) override {
         OPENVINO_ASSERT(cached_kernel_ids.size() == _order.size());
         for (size_t i = 0; i < cached_kernel_ids.size(); ++i) {
             _stages[_order[i]]->kernel = kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[i], e);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp
index 15c60862da1c23..160325c1f9921b 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp
@@ -18,7 +18,7 @@
 #endif
 
 #include "gpu/intel/microkernels/package.hpp"
-#include "gpu/intel/jit/gemm/include/gemmstone/microkernel_provider.hpp"
+#include "gpu/intel/gemm/jit/include/gemmstone/microkernel_provider.hpp"
 #include "gpu/intel/microkernels/shim.hpp"
 #include "common/utils.hpp"
 
diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
index 4a41b9463bfb55..a48569909ba3da 100644
--- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
@@ -160,7 +160,7 @@ if(ENABLE_ONEDNN_FOR_GPU)
                              "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src"
                              "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/ngen"
                              "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/config"
-                             "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/gemm/include"
+                             "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/gemm/jit/include"
                              "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party/ngen")
         set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU
                             DNNL_DLL

From 6b586d3e3d45971a009b9fe534e2e9130961747c Mon Sep 17 00:00:00 2001
From: Jakub Kasprzak <jakub.kasprzak@intel.com>
Date: Mon, 18 Aug 2025 12:38:02 +0000
Subject: [PATCH 11/74] Fix include

---
 .../functional/remote_tensor_tests/ze_remote_tensor_tests.cpp   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp
index e9ccd21cc892be..f1a0b17b85e565 100644
--- a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp
+++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp
@@ -8,7 +8,7 @@
 #include "openvino/runtime/remote_tensor.hpp"
 
 #include "remote_tensor_tests/helpers.hpp"
-#include "base/ov_behavior_test_utils.hpp"
+#include "shared_test_classes/base/ov_behavior_test_utils.hpp"
 
 TEST(ZeRemoteContext, smoke_CorrectContextType) {
     auto core = ov::Core();

From afe606bf0872ac6aa0ee4d63e4c66b9d63191034 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 29 Aug 2025 16:38:59 +0000
Subject: [PATCH 12/74] Add new OneDNN for L0

---
 src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 4 ++--
 src/plugins/intel_gpu/thirdparty/onednn_gpu          | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
index 6d65086072afe3..78163899bfda57 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
@@ -51,7 +51,7 @@ namespace ocl {
 
 namespace {
 
-#ifdef ENABLE_ONEDNN_FOR_GPU
+#if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT)
 gpu_arch convert_ngen_arch(ngen::HW gpu_arch) {
     switch (gpu_arch) {
         case ngen::HW::Gen9: return gpu_arch::gen9;
@@ -350,7 +350,7 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex
     info.device_memory_ordinal = 0;
     info.supports_cp_offload = false;
 
-#ifdef ENABLE_ONEDNN_FOR_GPU
+#if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT)
     using namespace dnnl::impl::gpu::intel::jit;
     ngen::Product product = ngen::OpenCLCodeGenerator<ngen::HW::Unknown>::detectHWInfo(context.get(), device.get());
     info.arch = convert_ngen_arch(ngen::getCore(product.family));
diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu
index 4292538ad8c45e..c720c108a081cf 160000
--- a/src/plugins/intel_gpu/thirdparty/onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu
@@ -1 +1 @@
-Subproject commit 4292538ad8c45e1691c85951c54f4a689b8f6597
+Subproject commit c720c108a081cf719e326ee942f078d0210cee40

From 66a9c6c0b335e563df8d0157ed9dc73e56382ad6 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 4 Sep 2025 07:40:00 +0000
Subject: [PATCH 13/74] Update L0 OneDNN submodule

---
 src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu
index c720c108a081cf..f9ad03918013da 160000
--- a/src/plugins/intel_gpu/thirdparty/onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu
@@ -1 +1 @@
-Subproject commit c720c108a081cf719e326ee942f078d0210cee40
+Subproject commit f9ad03918013daa012de817ec3893ff4500f90a6

From 02d4a15448a7cf84879c9dffcb134aad76ce1334 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 5 Sep 2025 15:18:52 +0000
Subject: [PATCH 14/74] Fix ze_stream impl

---
 .../intel_gpu/src/runtime/ze/ze_stream.cpp    | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index c68c06ad0c3cf2..d6436abc33fdc2 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -197,6 +197,8 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
 }
 
 ze_stream::~ze_stream() {
+    // Destroy OneDNN stream before destroying command list
+    _onednn_stream.reset();
     zeCommandListDestroy(m_command_list);
 }
 
@@ -305,22 +307,30 @@ event::ptr ze_stream::create_base_event() {
     return m_pool.create_event(++m_queue_counter);
 }
 
-void ze_stream::flush() const { }
+void ze_stream::flush() const {
+    //Immediate Command List submits commands immediately - no flush impl
+}
 
 void ze_stream::finish() const {
     ZE_CHECK(zeCommandListHostSynchronize(m_command_list, default_timeout));
 }
 
 void ze_stream::wait_for_events(const std::vector<event::ptr>& events) {
+    bool needs_sync = false;
     for (auto& ev : events) {
+        auto* ze_base_ev = dynamic_cast<ze_base_event*>(ev.get());
+        if (ze_base_ev->get() != nullptr) {
+            ze_base_ev->wait();
+        } else {
+            needs_sync = true;
+        }
+        // Block thread and wait for event signal
         ev->wait();
     }
 
-    // Enqueue additional event as `events` may contain user events only due to barrier based synchronization
-    // TODO: Detect that scenarion somehow and don't enqueue extra barrier if not needed
-    auto ev = std::dynamic_pointer_cast<ze_event>(create_base_event());
-    ZE_CHECK(zeCommandListAppendBarrier(m_command_list, ev->get(), 0, nullptr));
-    ev->wait();
+    if (needs_sync) {
+        finish();
+    }
 }
 
 void ze_stream::sync_events(std::vector<event::ptr> const& deps, bool is_output) {

From 40d7e4b58f585a396dd7f5a1b56b8e4c9b96713f Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 10 Sep 2025 15:40:32 +0000
Subject: [PATCH 15/74] Update onednn l0 submodule

---
 src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu
index f9ad03918013da..a546d2673382e1 160000
--- a/src/plugins/intel_gpu/thirdparty/onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu
@@ -1 +1 @@
-Subproject commit f9ad03918013daa012de817ec3893ff4500f90a6
+Subproject commit a546d2673382e18adcc0d0c9cbf8337ffc2a5a8c

From b7c56f5c2967f74fea9728f9bfb5a7cea0f05bfa Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 11 Sep 2025 12:58:46 +0000
Subject: [PATCH 16/74] Add dedicated copy queue

---
 .../include/intel_gpu/runtime/device_info.hpp     |  3 ++-
 .../intel_gpu/src/runtime/ze/ze_device.cpp        |  7 ++++++-
 .../intel_gpu/src/runtime/ze/ze_memory.cpp        |  8 ++++----
 .../intel_gpu/src/runtime/ze/ze_stream.cpp        | 15 ++++++---------
 .../intel_gpu/src/runtime/ze/ze_stream.hpp        |  8 +++++++-
 5 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
index d121ace418bd53..63a6b3eb9e6014 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
@@ -135,7 +135,8 @@ struct device_info {
 
     uint64_t timer_resolution;                  ///< [L0] Resolution of device timer used for profiling in cycles/sec
     uint32_t kernel_timestamp_valid_bits;       ///< [L0] Number of valid bits in the kernel timestamp values
-    uint32_t compute_queue_group_ordinal;       ///< [L0] Ordinal of the command queue group with compute support
+    uint32_t compute_queue_group_ordinal;       ///< [L0] Ordinal of the command queue group to use for compute
+    uint32_t copy_queue_group_ordinal;          ///< [L0] Ordinal of the command queue group to use for copy
     uint32_t device_memory_ordinal;             ///< [L0] Ordinal of the selected global device memory
 
     ov::device::UUID uuid;                      ///< UUID of the gpu device
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index 339b381c0e2e51..b462c9da1a8d78 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -96,10 +96,14 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, &queue_properties[0]));
 
     auto compute_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) {
-        return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == true;
+        return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0;
+    });
+    auto copy_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) {
+        return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0 && (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0;
     });
 
     OPENVINO_ASSERT(compute_queue_props != queue_properties.end());
+    OPENVINO_ASSERT(copy_queue_props != queue_properties.end());
 
     uint32_t memory_properties_count = 0;
     ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr));
@@ -190,6 +194,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.kernel_timestamp_valid_bits  = device_properties.kernelTimestampValidBits;
     info.timer_resolution  = device_properties.timerResolution;
     info.compute_queue_group_ordinal = std::distance(queue_properties.begin(), compute_queue_props);
+    info.copy_queue_group_ordinal = std::distance(queue_properties.begin(), copy_queue_props);
 
     static_assert(ZE_MAX_DEVICE_UUID_SIZE == ov::device::UUID::MAX_UUID_SIZE, "");
     static_assert(ZE_MAX_DEVICE_LUID_SIZE_EXT == ov::device::LUID::MAX_LUID_SIZE, "");
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
index 0c774e35b2ce7d..218c3390a5c566 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
@@ -111,7 +111,7 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::re
             }
             GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl;
             _host_buffer.allocateHost(_bytes_count);
-            ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(),
+            ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_copy_queue(),
                                     _host_buffer.get(),
                                     _buffer.get(),
                                     _bytes_count,
@@ -174,7 +174,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_o
     auto src_ptr = reinterpret_cast<const char*>(data_ptr) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(buffer_ptr()) + dst_offset;
 
-    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
+    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(),
                                            dst_ptr,
                                            src_ptr,
                                            _bytes_count,
@@ -202,7 +202,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_
     auto src_ptr = reinterpret_cast<const char*>(usm_mem->buffer_ptr()) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(buffer_ptr()) + dst_offset;
 
-    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
+    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(),
                                            dst_ptr,
                                            src_ptr,
                                            _bytes_count,
@@ -226,7 +226,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s
     auto src_ptr = reinterpret_cast<const char*>(buffer_ptr()) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(data_ptr) + dst_offset;
 
-    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
+    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(),
                                            dst_ptr,
                                            src_ptr,
                                            _bytes_count,
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index d6436abc33fdc2..d4274e6d32e4ba 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -185,21 +185,18 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
     command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
 
-    zex_intel_queue_copy_operations_offload_hint_exp_desc_t cp_offload_desc = {};
-    cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES;
-    cp_offload_desc.copyOffloadEnabled = true;
-    cp_offload_desc.pNext = nullptr;
-    if (info.supports_cp_offload) {
-        command_queue_desc.pNext = &cp_offload_desc;
-    }
-
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
+    command_queue_desc.ordinal = info.copy_queue_group_ordinal;
+    ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list));
 }
 
 ze_stream::~ze_stream() {
     // Destroy OneDNN stream before destroying command list
     _onednn_stream.reset();
-    zeCommandListDestroy(m_command_list);
+    if (m_command_list != nullptr)
+        zeCommandListDestroy(m_command_list);
+    if (m_copy_command_list != nullptr)
+        zeCommandListDestroy(m_copy_command_list);
 }
 
 void ze_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
index 6dc270e399844d..e490ee67e864ec 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
@@ -16,16 +16,21 @@ namespace ze {
 class ze_stream : public stream {
 public:
     ze_command_list_handle_t get_queue() const { return m_command_list; }
+    ze_command_list_handle_t get_copy_queue() const { return m_copy_command_list; }
 
     ze_stream(const ze_engine& engine, const ExecutionConfig& config);
     ze_stream(ze_stream&& other)
         : stream(other.m_queue_type, other.m_sync_method)
         , _engine(other._engine)
         , m_command_list(other.m_command_list)
+        , m_copy_command_list(other.m_copy_command_list)
         , m_queue_counter(other.m_queue_counter.load())
         , m_last_barrier(other.m_last_barrier.load())
         , m_last_barrier_ev(other.m_last_barrier_ev)
-        , m_pool(other.m_pool) {}
+        , m_pool(other.m_pool) {
+            other.m_command_list = nullptr;
+            other.m_copy_command_list = nullptr;
+        }
 
     ~ze_stream();
 
@@ -55,6 +60,7 @@ class ze_stream : public stream {
 
     const ze_engine& _engine;
     mutable ze_command_list_handle_t m_command_list = 0;
+    mutable ze_command_list_handle_t m_copy_command_list = 0;
     mutable std::atomic<uint64_t> m_queue_counter{0};
     std::atomic<uint64_t> m_last_barrier{0};
     std::shared_ptr<ze_event> m_last_barrier_ev = nullptr;

From ab762bfec9e33468568c7300ea44439f3395942b Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 12 Sep 2025 14:03:54 +0000
Subject: [PATCH 17/74] Fix windows compilation

---
 src/plugins/intel_gpu/CMakeLists.txt                 | 4 ++++
 src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 4 ++--
 src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp   | 3 +++
 src/plugins/intel_gpu/thirdparty/CMakeLists.txt      | 8 +++++++-
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt
index 58f5e74330a4d4..38e3dafbc3305f 100644
--- a/src/plugins/intel_gpu/CMakeLists.txt
+++ b/src/plugins/intel_gpu/CMakeLists.txt
@@ -38,6 +38,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
 endif()
 
+if(WIN32)
+    add_definitions(-DNOMINMAX)
+endif()
+
 if(ENABLE_GPU_DEBUG_CAPS)
     add_definitions(-DGPU_DEBUG_CONFIG=1)
     add_definitions(-DENABLE_DEBUG_CAPS=1)
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
index 78163899bfda57..09a8d8794d1c68 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
@@ -2,12 +2,12 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#ifdef ENABLE_ONEDNN_FOR_GPU
+#if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT)
 #ifndef NOMINMAX
 # define NOMINMAX
 #endif
 #include "gpu/intel/jit/generator.hpp"
-#endif  // ENABLE_ONEDNN_FOR_GPU
+#endif
 
 #include "ocl_device.hpp"
 #include "ocl_common.hpp"
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index b462c9da1a8d78..6cef4bf50c0e5c 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -13,6 +13,9 @@
 
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
 #include <windows.h>
 #include <SetupAPI.h>
 #include <devguid.h>
diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
index a48569909ba3da..c774352db96325 100644
--- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
@@ -155,14 +155,20 @@ if(ENABLE_ONEDNN_FOR_GPU)
                 DEPENDEES install  # Ensures this runs after install
             )
         endif()
-
+        if(GPU_RUNTIME STREQUAL "L0")
+            set(DNNL_GPU_RUNTIME_VALUE DNNL_RUNTIME_L0)
+        elseif(GPU_RUNTIME STREQUAL "OCL")
+            set(DNNL_GPU_RUNTIME_VALUE  DNNL_RUNTIME_OCL)
+        endif()
         set(LIB_INCLUDE_DIRS "${ONEDNN_INSTALL_DIR}/include"
                              "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src"
                              "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/ngen"
                              "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/config"
                              "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/gemm/jit/include"
+                             "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party"
                              "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party/ngen")
         set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU
+                            DNNL_GPU_RUNTIME={$DNNL_GPU_RUNTIME_VALUE}
                             DNNL_DLL
                             DNNL_DLL_EXPORTS
                             DNNL_ENABLE_CPU_ISA_HINTS

From 8f48750bd1dee0d0ed948c962f267c65c638068d Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Tue, 16 Sep 2025 15:03:50 +0000
Subject: [PATCH 18/74] Fix onednn_gpu submodule

---
 .gitmodules                                 | 2 +-
 src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index d9733bc0d844c1..3b95d663eabc6b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -51,7 +51,7 @@
 	url = https://github.com/nithinn/ncc.git
 [submodule "thirdparty/onednn_gpu"]
 	path = src/plugins/intel_gpu/thirdparty/onednn_gpu
-	url = https://github.com/oneapi-src/oneDNN.git
+	url = https://github.com/jkasprza/oneDNN.git
 [submodule "thirdparty/json/nlohmann_json"]
 	path = thirdparty/json/nlohmann_json
 	url = https://github.com/nlohmann/json.git
diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu
index a546d2673382e1..ce0e98bf72a6c7 160000
--- a/src/plugins/intel_gpu/thirdparty/onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu
@@ -1 +1 @@
-Subproject commit a546d2673382e18adcc0d0c9cbf8337ffc2a5a8c
+Subproject commit ce0e98bf72a6c79642424e4d097b09a096b3b37f

From 862df7e999dcaf65535649cbd1179e130a96d1c3 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 19 Sep 2025 14:18:34 +0000
Subject: [PATCH 19/74] Handle local memory size argument for L0

---
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index d4274e6d32e4ba..1e865cb421d252 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -50,6 +50,14 @@ ze_result_t set_kernel_arg_scalar(ze_kernel_handle_t& kernel, uint32_t idx, cons
     return zeKernelSetArgumentValue(kernel, idx, sizeof(T), &val);
 }
 
+ze_result_t set_kernel_arg_local_memory(ze_kernel_handle_t& kernel, uint32_t idx, size_t size) {
+    if (size == 0)
+        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
+
+    GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel << " set arg " << idx << " local memory size: " << size << std::endl;
+    return zeKernelSetArgumentValue(kernel, idx, size, NULL);
+}
+
 ze_result_t set_kernel_arg(ze_kernel_handle_t& kernel, uint32_t idx, cldnn::memory::cptr mem) {
     if (!mem)
         return ZE_RESULT_ERROR_INVALID_ARGUMENT;
@@ -159,6 +167,11 @@ void set_arguments_impl(ze_kernel_handle_t kernel,
             case args_t::SHAPE_INFO:
                 status = set_kernel_arg(kernel, i, data.shape_info);
                 break;
+            case args_t::LOCAL_MEMORY_SIZE:
+                OPENVINO_ASSERT(args[i].index < data.local_memory_args->size() && data.local_memory_args->at(args[i].index),
+                                "The allocated local memory is necessary to set kernel arguments.");
+                status = set_kernel_arg_local_memory(kernel, i,  data.local_memory_args->at(args[i].index));
+                break;
             default:
                 break;
         }

From fedec683418367775b4832c105a487390c69bd3f Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 24 Sep 2025 14:57:48 +0000
Subject: [PATCH 20/74] Add separate onednn submodule for L0 and OCL

* set ocl runtime as default
---
 .gitmodules                                                | 5 ++++-
 cmake/features.cmake                                       | 4 ++--
 src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp | 4 ++++
 src/plugins/intel_gpu/thirdparty/CMakeLists.txt            | 7 ++++++-
 src/plugins/intel_gpu/thirdparty/l0_onednn_gpu             | 1 +
 src/plugins/intel_gpu/thirdparty/onednn_gpu                | 2 +-
 6 files changed, 18 insertions(+), 5 deletions(-)
 create mode 160000 src/plugins/intel_gpu/thirdparty/l0_onednn_gpu

diff --git a/.gitmodules b/.gitmodules
index 3b95d663eabc6b..fdd61a6915652a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -51,7 +51,7 @@
 	url = https://github.com/nithinn/ncc.git
 [submodule "thirdparty/onednn_gpu"]
 	path = src/plugins/intel_gpu/thirdparty/onednn_gpu
-	url = https://github.com/jkasprza/oneDNN.git
+	url = https://github.com/oneapi-src/oneDNN.git
 [submodule "thirdparty/json/nlohmann_json"]
 	path = thirdparty/json/nlohmann_json
 	url = https://github.com/nlohmann/json.git
@@ -93,3 +93,6 @@
 [submodule "src/plugins/intel_cpu/thirdparty/xbyak_riscv"]
 	path = src/plugins/intel_cpu/thirdparty/xbyak_riscv
 	url = https://github.com/herumi/xbyak_riscv.git
+[submodule "src/plugins/intel_gpu/thirdparty/l0_onednn_gpu"]
+	path = src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
+	url = https://github.com/jkasprza/oneDNN.git
diff --git a/cmake/features.cmake b/cmake/features.cmake
index 04f8fb6b58395a..39fd5990840cef 100644
--- a/cmake/features.cmake
+++ b/cmake/features.cmake
@@ -41,8 +41,8 @@ else()
     set(ENABLE_ONEDNN_FOR_GPU_DEFAULT ON)
 endif()
 
-# Set default GPU runtime to L0 for now
-set(OV_GPU_DEFAULT_RT "L0")
+# Set default GPU runtime to OCL
+set(OV_GPU_DEFAULT_RT "OCL")
 if (ENABLE_INTEL_GPU)
     ov_option_enum (GPU_RT_TYPE "Type of GPU runtime. Supported value: OCL and L0" ${OV_GPU_DEFAULT_RT} ALLOWED_VALUES L0 OCL)
 endif()
diff --git a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
index 239e1d1b34a49b..6462b386f76a45 100644
--- a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
@@ -302,7 +302,11 @@ cldnn::ExecutionConfig get_test_default_config(const cldnn::engine& engine,
 }
 
 std::shared_ptr<cldnn::engine> create_test_engine() {
+#ifdef OV_GPU_WITH_ZE_RT
     auto ret = cldnn::engine::create(engine_types::ze, runtime_types::ze);
+#elif OV_GPU_WITH_OCL_RT
+    auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl);
+#endif
 #ifdef ENABLE_ONEDNN_FOR_GPU
     if (ret->get_device_info().supports_immad)
         ret->create_onednn_engine({});
diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
index c774352db96325..9ebcaaae3c53a8 100644
--- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
@@ -99,11 +99,16 @@ if(ENABLE_ONEDNN_FOR_GPU)
 
         set(onednn_gpu_lib "${CMAKE_STATIC_LIBRARY_PREFIX}${DNNL_GPU_LIBRARY_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}")
         set(ONEDNN_GPU_LIB_PATH ${ONEDNN_INSTALL_DIR}/lib/${onednn_gpu_lib} CACHE FILEPATH "Path to oneDNN GPU library")
+        if(GPU_RUNTIME STREQUAL "L0")
+            set(ONEDNN_GPU_DIR ${CMAKE_CURRENT_SOURCE_DIR}/l0_onednn_gpu CACHE FILEPATH "Path to oneDNN GPU repository")
+        elseif(GPU_RUNTIME STREQUAL "OCL")
+            set(ONEDNN_GPU_DIR ${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu CACHE FILEPATH "Path to oneDNN GPU repository")
+        endif()
 
         ExternalProject_Add(onednn_gpu_build
             # Directory Options:
             PREFIX "${ONEDNN_PREFIX_DIR}"
-            SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu"
+            SOURCE_DIR "${ONEDNN_GPU_DIR}"
             BINARY_DIR "${ONEDNN_BUILD_DIR}"
             INSTALL_DIR "${ONEDNN_INSTALL_DIR}"
             # Configure Step Options:
diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
new file mode 160000
index 00000000000000..ce0e98bf72a6c7
--- /dev/null
+++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
@@ -0,0 +1 @@
+Subproject commit ce0e98bf72a6c79642424e4d097b09a096b3b37f
diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu
index ce0e98bf72a6c7..d8fb6faac0418b 160000
--- a/src/plugins/intel_gpu/thirdparty/onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu
@@ -1 +1 @@
-Subproject commit ce0e98bf72a6c79642424e4d097b09a096b3b37f
+Subproject commit d8fb6faac0418b03a598d8f6aaaa865610ff9f40

From 8725b28c821691307f77d9fd51bdf3dd64bffdee Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 25 Sep 2025 10:06:53 +0000
Subject: [PATCH 21/74] Fix onednn include paths

---
 src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
index 9ebcaaae3c53a8..0f0a77d83b24e1 100644
--- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
@@ -166,12 +166,12 @@ if(ENABLE_ONEDNN_FOR_GPU)
             set(DNNL_GPU_RUNTIME_VALUE  DNNL_RUNTIME_OCL)
         endif()
         set(LIB_INCLUDE_DIRS "${ONEDNN_INSTALL_DIR}/include"
-                             "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src"
-                             "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/ngen"
-                             "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/config"
-                             "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/gemm/jit/include"
-                             "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party"
-                             "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party/ngen")
+                             "${ONEDNN_GPU_DIR}/src"
+                             "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/ngen"
+                             "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/config"
+                             "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include"
+                             "${ONEDNN_GPU_DIR}/third_party"
+                             "${ONEDNN_GPU_DIR}/third_party/ngen")
         set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU
                             DNNL_GPU_RUNTIME={$DNNL_GPU_RUNTIME_VALUE}
                             DNNL_DLL

From 4fa5c19f06fc041b2ef721daf531df8eeec78ac2 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 25 Sep 2025 11:12:12 +0000
Subject: [PATCH 22/74] Prevent redundant opencl linking

---
 src/plugins/intel_gpu/cmake/utils.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake
index 1dc0edebb5fe2d..5b61368a5d4742 100644
--- a/src/plugins/intel_gpu/cmake/utils.cmake
+++ b/src/plugins/intel_gpu/cmake/utils.cmake
@@ -8,7 +8,8 @@ function(ov_gpu_set_runtime_interface_for TARGET_NAME)
         target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero)
     elseif(GPU_RT_TYPE STREQUAL "OCL")
         target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1)
-        target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::NewHeaders OpenCL::OpenCL)
+        # Do not link OpenCL as It is already linked to the targets that require it
+        # target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::NewHeaders OpenCL::OpenCL)
     else()
         message(FATAL_ERROR "Invalid GPU runtime type: `${GPU_RT_TYPE}` Only `L0` and `OCL` are supported")
     endif()

From 933e26202f21199804d6b1e0e6b4a20f618f079e Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 26 Sep 2025 11:29:01 +0000
Subject: [PATCH 23/74] Add new compute runtime l0 headers

---
 thirdparty/level_zero/ze_intel_gpu.h | 501 ++++++++++++++++++++++++---
 thirdparty/level_zero/ze_stypes.h    |  32 +-
 thirdparty/level_zero/zex_common.h   | 241 +++++++++++++
 thirdparty/level_zero/zex_event.h    |  58 ++++
 4 files changed, 780 insertions(+), 52 deletions(-)
 create mode 100644 thirdparty/level_zero/zex_common.h
 create mode 100644 thirdparty/level_zero/zex_event.h

diff --git a/thirdparty/level_zero/ze_intel_gpu.h b/thirdparty/level_zero/ze_intel_gpu.h
index ea83b8e9f6cad9..350be4de550d74 100644
--- a/thirdparty/level_zero/ze_intel_gpu.h
+++ b/thirdparty/level_zero/ze_intel_gpu.h
@@ -1,4 +1,4 @@
-// intel/compute-runtime e96840a03ec41659772ca0bea3338bdd688ae4b5
+// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac
 /*
  * Copyright (C) 2020-2025 Intel Corporation
  *
@@ -9,7 +9,7 @@
 #ifndef _ZE_INTEL_GPU_H
 #define _ZE_INTEL_GPU_H
 
-#include <ze_api.h>
+#include <level_zero/ze_api.h>
 
 #include "ze_stypes.h"
 
@@ -176,6 +176,17 @@ typedef enum _zex_intel_queue_copy_operations_offload_hint_exp_version_t {
     ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
 } zex_intel_queue_copy_operations_offload_hint_exp_version_t;
 
+#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13)
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Command queue flag for enabling copy operations offload
+///
+/// If set, try to offload copy operations to different engines. Applicable only for compute queues.
+/// This is only a hint. Driver may ignore it per append call, based on platform capabilities or internal heuristics.
+#define ZE_COMMAND_QUEUE_FLAG_COPY_OFFLOAD_HINT ZE_BIT(2)
+
+#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13)
+
 #ifndef ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME
 /// @brief Extension name for query to read the Intel Level Zero Driver Version String
 #define ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME "ZE_intel_get_driver_version_string"
@@ -214,10 +225,10 @@ typedef enum _ze_intel_device_block_array_exp_flag_t {
 /// @brief Device 2D block array properties
 
 typedef struct _ze_intel_device_block_array_exp_properties_t {
-    ze_structure_type_ext_t stype = ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES; ///< [in] type of this structure
-    void *pNext;                                                                ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                                                ///< structure (i.e. contains sType and pNext).
-    ze_intel_device_block_array_exp_flags_t flags;                              ///< [out] 0 (none) or a valid combination of ::ze_intel_device_block_array_exp_flag_t
+    ze_structure_type_ext_t stype;                 ///< [in] type of this structure
+    void *pNext;                                   ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                   ///< structure (i.e. contains sType and pNext).
+    ze_intel_device_block_array_exp_flags_t flags; ///< [out] 0 (none) or a valid combination of ::ze_intel_device_block_array_exp_flag_t
 } ze_intel_device_block_array_exp_properties_t;
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -238,7 +249,7 @@ typedef enum _ze_intel_device_block_array_exp_properties_version_t {
 ///     - Major.Minor.Patch+Optional per semver guidelines https://semver.org/#spec-item-10
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
-ze_result_t ZE_APICALL
+ZE_APIEXPORT ze_result_t ZE_APICALL
 zeIntelGetDriverVersionString(
     ze_driver_handle_t hDriver, ///< [in] Driver handle whose version is being read.
     char *pDriverVersion,       ///< [in,out] pointer to driver version string.
@@ -255,6 +266,80 @@ zeIntelGetDriverVersionString(
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
 
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZEX_MEMORY_FREE_CALLBACK_EXT_NAME
+/// @brief Memory Free Callback Extension Name
+#define ZEX_MEMORY_FREE_CALLBACK_EXT_NAME "ZEX_extension_memory_free_callback"
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Memory Free Callback Extension Version(s)
+typedef enum _zex_memory_free_callback_ext_version_t {
+    ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_FORCE_UINT32 = 0x7fffffff        ///< Value marking end of ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_* ENUMs
+
+} zex_memory_free_callback_ext_version_t;
+
+#ifndef ZEX_STRUCTURE_TYPE_MEMORY_FREE_CALLBACK_EXT_DESC
+/// @brief stype for _zex_memory_free_callback_ext_desc_t
+#endif
+
+/**
+ * @brief Callback function type for memory free events.
+ *
+ * This function is called when a memory free operation occurs.
+ *
+ * @param pUserData Pointer to user-defined data passed to the callback.
+ */
+typedef void (*zex_mem_free_callback_fn_t)(void *pUserData);
+
+/**
+ * @brief Descriptor for a memory free callback extension.
+ *
+ * This structure is used to specify a callback function that will be invoked when memory is freed.
+ *
+ * Members:
+ * - stype: Specifies the type of this structure.
+ * - pNext: Optional pointer to an extension-specific structure; must be null or point to a structure containing stype and pNext.
+ * - pfnCallback: Callback function to be called when memory is freed.
+ * - pUserData: Optional user data to be passed to the callback function.
+ */
+typedef struct _zex_memory_free_callback_ext_desc_t {
+    ze_structure_type_ext_t stype;          ///< [in] type of this structure
+    const void *pNext;                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                            ///< structure (i.e. contains stype and pNext).
+    zex_mem_free_callback_fn_t pfnCallback; // [in] callback function to be called on memory free
+    void *pUserData;                        // [in][optional] user data passed to callback
+} zex_memory_free_callback_ext_desc_t;
+
+/**
+ * @brief Registers a callback to be invoked when memory is freed.
+ *
+ * This function allows the user to register a callback that will be called
+ * whenever the specified memory is freed within the given context.
+ *
+ * @param hContext
+ *        [in] Handle to the context in which the memory was allocated.
+ * @param hFreeCallbackDesc
+ *        [in] Pointer to a descriptor specifying the callback function and its parameters.
+ * @param ptr
+ *        [in] Pointer to the memory for which the free callback is to be registered.
+ *
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_ARGUMENT
+///         + `nullptr == hFreeCallbackDesc`
+///         + `nullptr == ptr`
+ *
+ * @note The callback will be invoked when the specified memory is freed.
+ */
+ZE_APIEXPORT ze_result_t ZE_APICALL zexMemFreeRegisterCallbackExt(ze_context_handle_t hContext, zex_memory_free_callback_ext_desc_t *hFreeCallbackDesc, void *ptr);
+#endif // ZEX_MEMORY_FREE_CALLBACK_EXT_NAME
+
 #ifndef ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME
 /// @brief Get Kernel Program Binary experimental name
 #define ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME "ZE_intel_experimental_kernel_get_program_binary"
@@ -269,22 +354,233 @@ typedef enum _ze_intel_kernel_get_binary_exp_version_t {
 
 } ze_intel_kernel_get_binary_exp_version_t;
 
-ze_result_t ZE_APICALL
+ZE_APIEXPORT ze_result_t ZE_APICALL
 zeIntelKernelGetBinaryExp(
     ze_kernel_handle_t hKernel, ///< [in] Kernel handle
     size_t *pSize,              ///< [in, out] pointer to variable with size of GEN ISA binary
     char *pKernelBinary         ///< [in,out] pointer to storage area for GEN ISA binary function
 );
 
-/// @brief Get default context associated with driver
+#ifndef ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_NAME
+/// @brief DRM format modifier extension name
+#define ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_NAME "ZE_intel_experimental_drm_format_modifier"
+#endif // ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief DRM format modifier extension Version(s)
+typedef enum _ze_intel_drm_format_modifier_exp_version_t {
+    ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+} ze_intel_drm_format_modifier_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Image DRM format modifier properties
+///
+/// @details
+///     - This structure may be passed as pNext member of ::ze_image_desc_t,
+///       when using a DRM format modifier.
+///     - Properties struct for providing user with the selected drm format modifier for the image
+///     - This is useful if the application wants to export the image to another API that requires the DRM format modifier
+///     - The application can query the chosen DRM format modifier for the image.
+///     - The application can use this information to choose a DRM format modifier for the image during creation
+typedef struct _ze_intel_image_selected_format_modifier_exp_properties_t {
+    ze_structure_type_t stype;  ///< [in] type of this structure
+    const void *pNext;          ///< [in][optional] must be null or a pointer to an extension-specific
+                                ///< structure (i.e. contains stype and pNext).
+    uint64_t drmFormatModifier; ///< [out] DRM format modifier
+} ze_intel_image_selected_format_modifier_exp_properties_t;
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Image DRM format modifier create list
+///
+/// @details
+///     - This structure may be passed as pNext member of ::ze_image_desc_t,
+///       when providing a list of DRM format modifiers to choose from during image creation.
+///     - This is a descriptor for creating image with the specified list of drm format modifier
+///     - If the user passes a list struct, then implementation chooses one from the list of drm modifiers as it sees fit.
+///     - If user wants to pass a single drm modifier then they can set the drmFormatModifierCount to 1 and pass the single drm modifier in pDrmFormatModifiers
+typedef struct _ze_intel_image_format_modifier_create_list_exp_desc_t {
+    ze_structure_type_t stype;       ///< [in] type of this structure
+    const void *pNext;               ///< [in][optional] must be null or a pointer to an extension-specific
+                                     ///< structure (i.e. contains stype and pNext).
+    uint32_t drmFormatModifierCount; ///< [in] number of DRM format modifiers in the list
+    uint64_t *pDrmFormatModifiers;   ///< [in][range(0, drmFormatModifierCount)] array of DRM format modifiers
+} ze_intel_image_format_modifier_create_list_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Image DRM format modifier import descriptor
+///
+/// @details
+///     - This structure may be passed as pNext member of ::ze_image_desc_t,
+///       when importing an image with a specific DRM format modifier.
+///     - The pNext chain is setup accordingly in ze_image_desc_t prior to calling zeImageCreate API
+typedef struct _ze_intel_image_format_modifier_import_exp_desc_t {
+    ze_structure_type_t stype;  ///< [in] type of this structure
+    const void *pNext;          ///< [in][optional] must be null or a pointer to an extension-specific
+                                ///< structure (i.e. contains stype and pNext).
+    uint64_t drmFormatModifier; ///< [in] DRM format modifier to use for the image
+} ze_intel_image_format_modifier_import_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Buffer DRM format modifier create list
+///
+/// @details
+///     - This structure may be passed as pNext member of ::ze_device_mem_alloc_desc_t,
+///       when providing a list of DRM format modifiers to choose from during buffer creation.
+///     - This is a descriptor for creating buffer with the specified list of drm format modifier
+///     - If the user passes a list struct, then implementation chooses one from the list of drm modifiers as it sees fit.
+///     - If user wants to pass a single drm modifier then they can set the drmFormatModifierCount to 1 and pass the single drm modifier in pDrmFormatModifiers
+///     - The pNext chain is setup accordingly in ze_device_mem_alloc_desc_t prior to calling zeMemAllocDevice API
+typedef struct _ze_intel_mem_format_modifier_create_list_exp_desc_t {
+    ze_structure_type_t stype;       ///< [in] type of this structure
+    const void *pNext;               ///< [in][optional] must be null or a pointer to an extension-specific
+                                     ///< structure (i.e. contains stype and pNext).
+    uint32_t drmFormatModifierCount; ///< [in] number of DRM format modifiers in the list
+    uint64_t *pDrmFormatModifiers;   ///< [in][range(0, drmFormatModifierCount)] array of DRM format modifiers
+} ze_intel_mem_format_modifier_create_list_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Buffer DRM format modifier import descriptor
+///
+/// @details
+///     - This structure may be passed as pNext member of ::ze_device_mem_alloc_desc_t,
+///       when importing a buffer with a specific DRM format modifier.
+///     - This descriptor must be used in conjunction with ze_external_memory_import_fd_t. If not, implementation will return an error.
+///     - The pNext chain is setup accordingly in ze_device_mem_alloc_desc_t prior to calling zeMemAllocDevice API
+typedef struct _ze_intel_mem_format_modifier_import_exp_desc_t {
+    ze_structure_type_t stype;  ///< [in] type of this structure
+    const void *pNext;          ///< [in][optional] must be null or a pointer to an extension-specific
+                                ///< structure (i.e. contains stype and pNext).
+    uint64_t drmFormatModifier; ///< [in] DRM format modifier to use for the buffer
+} ze_intel_mem_format_modifier_import_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Buffer DRM format modifier properties
+///
+/// @details
+///     - This structure may be passed as pNext member of ::ze_memory_allocation_properties_t,
+///       when querying the DRM format modifier of a buffer.
+///     - Properties struct for providing user with the selected drm format modifier for the buffer
+///     - This is useful if the application wants to export the buffer to another API that requires the DRM format modifier
+///     - The application can query the chosen DRM format modifier for the buffer via zeMemGetAllocProperties API
+typedef struct _ze_intel_mem_selected_format_modifier_exp_properties_t {
+    ze_structure_type_t stype;  ///< [in] type of this structure
+    const void *pNext;          ///< [in][optional] must be null or a pointer to an extension-specific
+                                ///< structure (i.e. contains stype and pNext).
+    uint64_t drmFormatModifier; ///< [out] DRM format modifier
+} ze_intel_mem_selected_format_modifier_exp_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Query for supported DRM format modifiers for a given image descriptor
 ///
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
-///     - Default context contains all devices within driver instance
+///     - This function can be used to query supported DRM format modifiers for a specific image description.
+///     - User can use this API in two ways:
+///         1. Set pCount to the address of a uint32_t with value 0 and pDrmFormatModifiers to nullptr
+///            to query just the number of supported DRM format modifiers.
+///         2. Set pCount to the address of a uint32_t with the number of elements in the pDrmFormatModifiers
+///            array to retrieve the list of supported DRM format modifiers.
+///     - The application can use the returned DRM format modifiers to:
+///         1. Create L0 images with supported DRM format modifiers.
+///         2. Compare with DRM format modifiers from other APIs (like Vulkan) to find common
+///            modifiers that work for interop scenarios.
+///
 /// @returns
-///     - Context handle associated with driver
-ze_context_handle_t ZE_APICALL zeDriverGetDefaultContext(ze_driver_handle_t hDriver); ///> [in] handle of the driver
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hDevice`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pCount`
+///     - ::ZE_RESULT_ERROR_INVALID_IMAGE_DESC
+///         + The image description doesn't match the device capabilities
+ze_result_t ZE_APICALL
+zeIntelImageGetFormatModifiersSupportedExp(
+    ze_device_handle_t hDevice,        ///< [in] handle of the device
+    const ze_image_desc_t *pImageDesc, ///< [in] pointer to image descriptor
+    uint32_t *pCount,                  ///< [in,out] pointer to the number of DRM format modifiers.
+                                       ///< if count is zero, then the driver shall update the value with the
+                                       ///< total number of supported DRM format modifiers for the image format.
+                                       ///< if count is greater than the number of supported DRM format modifiers,
+                                       ///< then the driver shall update the value with the correct number of supported DRM format modifiers.
+    uint64_t *pDrmFormatModifiers      ///< [in,out][optional][range(0, *pCount)] array of supported DRM format modifiers
+);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Query for supported DRM format modifiers for a memory allocation descriptor
+///
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+///     - This function can be used to query supported DRM format modifiers for a specific memory allocation description.
+///     - User can use this API in two ways:
+///         1. Set pCount to the address of a uint32_t with value 0 and pDrmFormatModifiers to nullptr
+///            to query just the number of supported DRM format modifiers.
+///         2. Set pCount to the address of a uint32_t with the number of elements in the pDrmFormatModifiers
+///            array to retrieve the list of supported DRM format modifiers.
+///     - The application can use the returned DRM format modifiers to:
+///         1. Create L0 memory allocations with supported DRM format modifiers.
+///         2. Compare with DRM format modifiers from other APIs (like Vulkan) to find common
+///            modifiers that work for interop scenarios.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hContext`
+ze_result_t ZE_APICALL
+zeIntelMemGetFormatModifiersSupportedExp(
+    ze_context_handle_t hContext,                  ///< [in] handle of the context
+    const ze_device_mem_alloc_desc_t *pDeviceDesc, ///< [in] pointer to device memory allocation descriptor
+    size_t size,                                   ///< [in] size in bytes to allocate
+    size_t alignment,                              ///< [in] minimum alignment in bytes for the allocation
+    ze_device_handle_t hDevice,                    ///< [in] handle of the device
+    uint32_t *pCount,                              ///< [in,out] pointer to the number of DRM format modifiers.
+                                                   ///< if count is zero, then the driver shall update the value with the
+                                                   ///< total number of supported DRM format modifiers for the memory allocation.
+                                                   ///< if count is greater than the number of supported DRM format modifiers,
+                                                   ///< then the driver shall update the value with the correct number of supported DRM format modifiers.
+    uint64_t *pDrmFormatModifiers                  ///< [in,out][optional][range(0, *pCount)] array of supported DRM format modifiers
+);
+
+/// @brief Get priority levels
+///
+/// @details
+///    - The application may call this function from simultaneous threads.
+///    - The implementation of this function should be lock-free.
+///    - Returns priority levels supported by the device
+///    - lowestPriority reports the numerical value that corresponds to lowest queue priority
+///    - highesPriority reports the numerical value that corresponds to highest queue priority
+///    - Lower numbers indicate greater priorities
+///    - The range of meaningful queue properties is represented by [*highestPriority, *lowestPriority]
+///    - Priority passed upon queue creation would automatically clamp down or up to the nearest supported value
+///    - 0 means default priority
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+ze_result_t ZE_APICALL zeDeviceGetPriorityLevels(
+    ze_device_handle_t hDevice,
+    int32_t *lowestPriority,
+    int32_t *highestPriority);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Descriptor used for setting priority on command queues and immediate command lists.
+/// This structure may be passed as pNext member of ::ze_command_queue_desc_t.
+typedef struct _ze_queue_priority_desc_t {
+    ze_structure_type_ext_t stype; ///< [in] type of this structure
+    const void *pNext;             ///< [in][optional] must be null or a pointer to an extension-specific structure
+    int priority;                  ///< [in] priority of the queue
+} ze_queue_priority_desc_t;
 
 /// @brief Get default context associated with default driver
 ///
@@ -294,7 +590,7 @@ ze_context_handle_t ZE_APICALL zeDriverGetDefaultContext(ze_driver_handle_t hDri
 ///     - Default context contains all devices within default driver instance
 /// @returns
 ///     - Context handle associated with default driver
-ze_context_handle_t ZE_APICALL zerDriverGetDefaultContext();
+ZE_APIEXPORT ze_context_handle_t ZE_APICALL zerGetDefaultContext();
 
 /// @brief Get Device Identifier
 ///
@@ -302,10 +598,10 @@ ze_context_handle_t ZE_APICALL zerDriverGetDefaultContext();
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
 ///     - Returned identifier is a 32-bit unsigned integer that is unique to the driver.
-///     - The identifier can be used then in zerIdentifierTranslateToDeviceHandle to get the device handle.
+///     - The identifier can be used then in zerTranslateIdentifierToDeviceHandle to get the device handle.
 /// @returns
 ///     - 32-bit unsigned integer identifier
-uint32_t ZE_APICALL zerDeviceTranslateToIdentifier(ze_device_handle_t hDevice); ///< [in] handle of the device
+ZE_APIEXPORT uint32_t ZE_APICALL zerTranslateDeviceHandleToIdentifier(ze_device_handle_t hDevice); ///< [in] handle of the device
 
 /// @brief Translate Device Identifier to Device Handle from default Driver
 ///
@@ -315,7 +611,46 @@ uint32_t ZE_APICALL zerDeviceTranslateToIdentifier(ze_device_handle_t hDevice);
 ///    - Returned device is associated to default driver handle.
 /// @returns
 ///     - device handle associated with the identifier
-ze_device_handle_t ZE_APICALL zerIdentifierTranslateToDeviceHandle(uint32_t identifier); ///< [in] integer identifier of the device
+ZE_APIEXPORT ze_device_handle_t ZE_APICALL zerTranslateIdentifierToDeviceHandle(uint32_t identifier); ///< [in] integer identifier of the device
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Retrieves a string describing the last error code returned by the
+///        default driver in the current thread.
+///
+/// @details
+///     - String returned is thread local.
+///     - String is only updated on calls returning an error, i.e., not on calls
+///       returning ::ZE_RESULT_SUCCESS.
+///     - String may be empty if driver considers error code is already explicit
+///       enough to describe cause.
+///     - Memory pointed to by ppString is owned by the driver.
+///     - String returned is null-terminated.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == ppString`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zerGetLastErrorDescription(
+    const char **ppString ///< [in,out] pointer to a null-terminated array of characters describing
+                          ///< cause of error.
+);
+
+#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13)
+
+/// @brief Get default context associated with driver
+///
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+///     - Default context contains all devices within driver instance
+/// @returns
+///     - Context handle associated with driver
+ze_context_handle_t ZE_APICALL zeDriverGetDefaultContext(ze_driver_handle_t hDriver); ///> [in] handle of the driver
 
 /// @brief Global device synchronization
 ///
@@ -333,36 +668,6 @@ ze_device_handle_t ZE_APICALL zerIdentifierTranslateToDeviceHandle(uint32_t iden
 ///     - ::ZE_RESULT_ERROR_DEVICE_LOST
 ze_result_t ZE_APICALL zeDeviceSynchronize(ze_device_handle_t hDevice); ///> [in] handle of the device
 
-/// @brief Get priority levels
-///
-/// @details
-///    - The application may call this function from simultaneous threads.
-///    - The implementation of this function should be lock-free.
-///    - Returns priority levels supported by the device
-///    - lowestPriority reports the numerical value that corresponds to lowest queue priority
-///    - highesPriority reports the numerical value that corresponds to highest queue priority
-///    - Lower numbers indicate greater priorities
-///    - The range of meaningful queue properties is represented by [*highestPriority, *lowestPriority]
-///    - Priority passed upon queue creation would automatically clamp down or up to the nearest supported value
-///    - 0 means default priority
-///
-/// @returns
-///     - ::ZE_RESULT_SUCCESS
-///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
-ze_result_t ZE_APICALL zeDeviceGetPriorityLevels(
-    ze_device_handle_t hDevice,
-    int *lowestPriority,
-    int *highestPriority);
-
-///////////////////////////////////////////////////////////////////////////////
-/// @brief Descriptor used for setting priority on command queues and immediate command lists.
-/// This structure may be passed as pNext member of ::ze_command_queue_desc_t.
-typedef struct _ze_queue_priority_desc_t {
-    ze_structure_type_t stype; ///< [in] type of this structure
-    const void *pNext;         ///< [in][optional] must be null or a pointer to an extension-specific structure
-    int priority;              ///< [in] priority of the queue
-} ze_queue_priority_desc_t;
-
 /// @brief Append with arguments
 ///
 /// @details
@@ -400,13 +705,115 @@ ze_result_t ZE_APICALL zeCommandListAppendLaunchKernelWithArguments(
     const ze_group_count_t groupCounts,    ///< [in] thread group counts
     const ze_group_size_t groupSizes,      ///< [in] thread group sizes
     void **pArguments,                     ///< [in] kernel arguments; pointer to list where each argument represents a pointer to the argument value on specific index
-    void *pNext,                           ///< [in][optional] extensions
+    const void *pNext,                     ///< [in][optional] extensions
     ze_event_handle_t hSignalEvent,        ///< [in][optional] handle of the event to signal on completion
     uint32_t numWaitEvents,                ///< [in][optional] number of events to wait on before launching
     ze_event_handle_t *phWaitEvents);      ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait on before launching
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Extension descriptor for cooperative kernel launch via pNext chain.
+/// @details
+///     - This structure can be passed through pNext to zeCommandListAppendLaunchKernelWithParameters
+typedef struct _ze_command_list_append_launch_kernel_param_cooperative_desc_t {
+    ze_structure_type_ext_t stype; ///< [in] Type of this structure (e.g. ZE_STRUCTURE_TYPE_COMMAND_LIST_APPEND_PARAM_COOPERATIVE_DESC)
+    const void *pNext;             ///< [in][optional] Pointer to the next extension-specific structure
+    ze_bool_t isCooperative;       ///< [in] Indicates if the kernel should be launched as cooperative
+} ze_command_list_append_launch_kernel_param_cooperative_desc_t;
+/// @brief Append with parameters
+///
+/// @details
+///    - The application may call this function from simultaneous threads.
+///    - The implementation of this function should be lock-free.
+///    - Appends kernel to command list with additional parameters via pNext chain.
+///    - Allows passing core and extension descriptors (e.g. cooperative kernel).
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hCommandList`
+///         + `nullptr == hKernel`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pGroupCounts`
+///     - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT
+///     - ::ZE_RESULT_ERROR_INVALID_SIZE
+///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
+ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendLaunchKernelWithParameters(
+    ze_command_list_handle_t hCommandList, ///< [in] handle of the command list
+    ze_kernel_handle_t hKernel,            ///< [in] handle of the kernel object
+    const ze_group_count_t *pGroupCounts,  ///< [in] thread group launch arguments
+    const void *pNext,                     ///< [in][optional] additional parameters (pNext chain)
+    ze_event_handle_t hSignalEvent,        ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                ///< [in][optional] number of events to wait on before launching
+    ze_event_handle_t *phWaitEvents        ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait on before launching
+);
+
+#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13)
+
 #if defined(__cplusplus)
 } // extern "C"
 #endif
 
+static const ze_device_mem_alloc_desc_t defaultIntelDeviceMemDesc = {
+    ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, // stype
+    nullptr,                                 // pNext
+    ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED,    // flags
+    0                                        // ordinal
+};
+
+static const ze_host_mem_alloc_desc_t defaultIntelHostMemDesc = {
+    ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,                                             // stype
+    nullptr,                                                                           // pNext
+    ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED | ZE_HOST_MEM_ALLOC_FLAG_BIAS_INITIAL_PLACEMENT // flags
+};
+
+static const ze_command_queue_desc_t defaultIntelCommandQueueDesc = {
+    ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,                                     // stype
+    nullptr,                                                                  // pNext
+    0,                                                                        // ordinal
+    0,                                                                        // index
+    ZE_COMMAND_QUEUE_FLAG_IN_ORDER | ZE_COMMAND_QUEUE_FLAG_COPY_OFFLOAD_HINT, // flags
+    ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,                                       // mode
+    ZE_COMMAND_QUEUE_PRIORITY_NORMAL                                          // priority
+};
+
+#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13)
+
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZE_EXTERNAL_MEMORY_MAPPING_EXT_NAME
+/// @brief External Memory Mapping Extension Name
+#define ZE_EXTERNAL_MEMORY_MAPPING_EXT_NAME "ZE_extension_external_memmap_sysmem"
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief External Memory Mapping Extension Version(s)
+typedef enum _ze_external_memmap_sysmem_ext_version_t {
+    ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_FORCE_UINT32 = 0x7fffffff        ///< Value marking end of ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_* ENUMs
+
+} ze_external_memmap_sysmem_ext_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Maps external system memory for an allocation
+///
+/// @details
+///     - This structure may be passed to ::zeMemAllocHost, via the `pNext`
+///       member of ::ze_host_mem_alloc_desc_t to map system memory for a host
+///       allocation.
+///     - The system memory pointer and size being mapped must be page aligned
+///       based on the supported page sizes on the device.
+typedef struct _ze_external_memmap_sysmem_ext_desc_t {
+    ze_structure_type_ext_t stype; ///< [in] type of this structure
+    const void *pNext;             ///< [in][optional] must be null or a pointer to an extension-specific
+                                   ///< structure (i.e. contains stype and pNext).
+    const void *pSystemMemory;     ///< [in] system memory pointer to map; must be page-aligned.
+    const uint64_t size;           ///< [in] size of the system memory to map; must be page-aligned.
+
+} ze_external_memmap_sysmem_ext_desc_t;
+#endif // ZE_EXTERNAL_MEMORY_MAPPING_EXT_NAME
+
+#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13)
 #endif
diff --git a/thirdparty/level_zero/ze_stypes.h b/thirdparty/level_zero/ze_stypes.h
index 19b1efac292854..f0557d8dbc0c83 100644
--- a/thirdparty/level_zero/ze_stypes.h
+++ b/thirdparty/level_zero/ze_stypes.h
@@ -1,4 +1,4 @@
-// intel/compute-runtime e96840a03ec41659772ca0bea3338bdd688ae4b5
+// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac
 /*
  * Copyright (C) 2024-2025 Intel Corporation
  *
@@ -9,17 +9,28 @@
 #ifndef _ZE_STYPES_H
 #define _ZE_STYPES_H
 
-#include <ze_api.h>
-#include <zet_api.h>
+#include <level_zero/ze_api.h>
+#include <level_zero/zes_api.h>
+#include <level_zero/zet_api.h>
 
 #include <cstdint>
 using ze_structure_type_ext_t = uint32_t;
 using zet_structure_type_ext_t = uint32_t;
+using zes_structure_type_ext_t = uint32_t;
 
 #define ZE_STRUCTURE_TYPE_SYNCHRONIZED_DISPATCH_EXP_DESC static_cast<ze_structure_type_ext_t>(0x00020020)
 #define ZE_STRUCTURE_TYPE_INTEL_MEDIA_COMMUNICATION_DESC static_cast<ze_structure_type_ext_t>(0x00020021)
 #define ZE_STRUCTURE_TYPE_INTEL_MEDIA_DOORBELL_HANDLE_DESC static_cast<ze_structure_type_ext_t>(0x00020022)
 #define ZE_STRUCTURE_TYPE_INTEL_DEVICE_MEDIA_EXP_PROPERTIES static_cast<ze_structure_type_ext_t>(0x00020023)
+#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13)
+#define ZE_STRUCTURE_TYPE_COMMAND_LIST_APPEND_PARAM_COOPERATIVE_DESC static_cast<ze_structure_type_ext_t>(0x00020036)
+#define ZE_STRUCTURE_TYPE_EXTERNAL_MEMMAP_SYSMEM_EXT_DESC static_cast<ze_structure_type_ext_t>(0x00020037)
+#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13)
+#define ZEX_STRUCTURE_TYPE_LABEL_DESCRIPTOR static_cast<ze_structure_type_ext_t>(0x00030002)
+#define ZEX_STRUCTURE_TYPE_OPERAND_DESCRIPTOR static_cast<ze_structure_type_ext_t>(0x00030003)
+#define ZEX_STRUCTURE_TYPE_VARIABLE_DESCRIPTOR static_cast<ze_structure_type_ext_t>(0x00030004)
+#define ZEX_STRUCTURE_TYPE_TEMP_VARIABLE_DESCRIPTOR static_cast<ze_structure_type_ext_t>(0x00030005)
+#define ZEX_STRUCTURE_TYPE_VARIABLE_INFO static_cast<ze_structure_type_ext_t>(0x00030006)
 #define ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES static_cast<ze_structure_type_ext_t>(0x00030007)
 #define ZEX_STRUCTURE_DEVICE_MODULE_REGISTER_FILE_EXP static_cast<ze_structure_type_ext_t>(0x00030010)
 #define ZEX_STRUCTURE_KERNEL_REGISTER_FILE_SIZE_EXP static_cast<ze_structure_type_ext_t>(0x00030012)
@@ -33,11 +44,22 @@ using zet_structure_type_ext_t = uint32_t;
 #define ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_SYNC_ALLOC_PROPERTIES static_cast<ze_structure_type_ext_t>(0x0003001D)
 #define ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_STORAGE_ALLOC_PROPERTIES static_cast<ze_structure_type_ext_t>(0x00030027)
 #define ZE_STRUCTURE_TYPE_QUEUE_PRIORITY_DESC static_cast<ze_structure_type_ext_t>(0x00030028)
+#ifndef ZE_RECORD_REPLAY_GRAPH_EXP_NAME
+#define ZE_STRUCTURE_TYPE_RECORD_REPLAY_GRAPH_EXP_PROPERTIES static_cast<ze_structure_type_ext_t>(0x00030029)
+#endif // ZE_RECORD_REPLAY_GRAPH_EXP_NAME
+#define ZEX_STRUCTURE_TYPE_MEMORY_FREE_CALLBACK_EXT_DESC static_cast<ze_structure_type_ext_t>(0x00030030)
 
 // Metric structure types
-#define ZET_INTEL_STRUCTURE_TYPE_METRIC_GROUP_CALCULATE_EXP_PROPERTIES static_cast<zet_structure_type_ext_t>(0x00010008)
-#define ZET_INTEL_STRUCTURE_TYPE_METRIC_CALCULATE_DESC_EXP static_cast<zet_structure_type_ext_t>(0x00010009)
+#define ZET_STRUCTURE_TYPE_INTEL_METRIC_SCOPE_PROPERTIES_EXP static_cast<zet_structure_type_ext_t>(0x00010006)
+#define ZET_INTEL_STRUCTURE_TYPE_METRIC_HW_BUFFER_SIZE_EXP_DESC static_cast<zet_structure_type_ext_t>(0x00010007)
+#define ZET_INTEL_STRUCTURE_TYPE_METRIC_GROUP_CALCULATION_EXP_PROPERTIES static_cast<zet_structure_type_ext_t>(0x00010008)
+#define ZET_INTEL_STRUCTURE_TYPE_METRIC_CALCULATION_DESC_EXP static_cast<zet_structure_type_ext_t>(0x00010009)
 #define ZET_INTEL_STRUCTURE_TYPE_METRIC_SOURCE_ID_EXP static_cast<zet_structure_type_ext_t>(0x0001000a)
 #define ZET_INTEL_STRUCTURE_TYPE_METRIC_DECODED_BUFFER_PROPERTIES_EXP static_cast<zet_structure_type_ext_t>(0x0001000b)
+#define ZET_INTEL_STRUCTURE_TYPE_METRIC_CALCULABLE_PROPERTIES_EXP static_cast<zet_structure_type_ext_t>(0x0001000c)
+
+// Sysman structure types
+#define ZES_INTEL_PCI_LINK_SPEED_DOWNGRADE_EXP_STATE static_cast<zes_structure_type_ext_t>(0x00040001)
+#define ZES_INTEL_PCI_LINK_SPEED_DOWNGRADE_EXP_PROPERTIES static_cast<zes_structure_type_ext_t>(0x00040002)
 
 #endif
diff --git a/thirdparty/level_zero/zex_common.h b/thirdparty/level_zero/zex_common.h
new file mode 100644
index 00000000000000..8a45573aaa164a
--- /dev/null
+++ b/thirdparty/level_zero/zex_common.h
@@ -0,0 +1,241 @@
+// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac
+/*
+ * Copyright (C) 2022-2025 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#ifndef _ZEX_COMMON_H
+#define _ZEX_COMMON_H
+#if defined(__cplusplus)
+#pragma once
+#endif
+#include "level_zero/ze_stypes.h"
+#include <level_zero/ze_api.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Handle of command list object
+typedef ze_command_list_handle_t zex_command_list_handle_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Handle of event object
+typedef ze_event_handle_t zex_event_handle_t;
+
+#define ZEX_BIT(_i) (1 << _i)
+
+typedef uint32_t zex_mem_action_scope_flags_t;
+typedef enum _zex_mem_action_scope_flag_t {
+    ZEX_MEM_ACTION_SCOPE_FLAG_SUBDEVICE = ZEX_BIT(0),
+    ZEX_MEM_ACTION_SCOPE_FLAG_DEVICE = ZEX_BIT(1),
+    ZEX_MEM_ACTION_SCOPE_FLAG_HOST = ZEX_BIT(2),
+    ZEX_MEM_ACTION_SCOPE_FLAG_FORCE_UINT32 = 0x7fffffff
+} zex_mem_action_scope_flag_t;
+
+typedef uint32_t zex_wait_on_mem_action_flags_t;
+typedef enum _zex_wait_on_mem_action_flag_t {
+    ZEX_WAIT_ON_MEMORY_FLAG_EQUAL = ZEX_BIT(0),
+    ZEX_WAIT_ON_MEMORY_FLAG_NOT_EQUAL = ZEX_BIT(1),
+    ZEX_WAIT_ON_MEMORY_FLAG_GREATER_THAN = ZEX_BIT(2),
+    ZEX_WAIT_ON_MEMORY_FLAG_GREATER_THAN_EQUAL = ZEX_BIT(3),
+    ZEX_WAIT_ON_MEMORY_FLAG_LESSER_THAN = ZEX_BIT(4),
+    ZEX_WAIT_ON_MEMORY_FLAG_LESSER_THAN_EQUAL = ZEX_BIT(5),
+    ZEX_WAIT_ON_MEMORY_FLAG_FORCE_UINT32 = 0x7fffffff
+} zex_wait_on_mem_action_flag_t;
+
+typedef struct _zex_wait_on_mem_desc_t {
+    zex_wait_on_mem_action_flags_t actionFlag;
+    zex_mem_action_scope_flags_t waitScope;
+} zex_wait_on_mem_desc_t;
+
+typedef struct _zex_write_to_mem_desc_t {
+    zex_mem_action_scope_flags_t writeScope;
+} zex_write_to_mem_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZE_SYNCHRONIZED_DISPATCH_EXP_NAME
+/// @brief Synchronized Dispatch extension name
+#define ZE_SYNCHRONIZED_DISPATCH_EXP_NAME "ZE_experimental_synchronized_dispatch"
+#endif // ZE_SYNCHRONIZED_DISPATCH_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Synchronized Dispatch extension version(s)
+typedef enum _ze_synchronized_dispatch_exp_version_t {
+    ZE_SYNCHRONIZED_DISPATCH_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZE_SYNCHRONIZED_DISPATCH_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZE_SYNCHRONIZED_DISPATCH_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} ze_synchronized_dispatch_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Supported synchronized dispatch flags
+typedef uint32_t ze_synchronized_dispatch_exp_flags_t;
+typedef enum _ze_synchronized_dispatch_exp_flag_t {
+    ZE_SYNCHRONIZED_DISPATCH_DISABLED_EXP_FLAG = ZE_BIT(0), ///< Non-synchronized dispatch. Must synchronize only with other synchronized dispatches
+    ZE_SYNCHRONIZED_DISPATCH_ENABLED_EXP_FLAG = ZE_BIT(1),  ///< Synchronized dispatch. Must synchronize with all synchronized and non-synchronized dispatches
+    ZE_SYNCHRONIZED_DISPATCH_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} ze_synchronized_dispatch_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_synchronized_dispatch_exp_desc_t
+typedef struct _ze_synchronized_dispatch_exp_desc_t ze_synchronized_dispatch_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Command queue or command list descriptor for synchronized dispatch. This structure may be
+///        passed as pNext member of ::ze_command_queue_desc_t. or ::ze_command_list_desc_t.
+typedef struct _ze_synchronized_dispatch_exp_desc_t {
+    ze_structure_type_ext_t stype;              ///< [in] type of this structure
+    const void *pNext;                          ///< [in][optional] must be null or a pointer to an extension-specific
+                                                ///< structure (i.e. contains stype and pNext).
+    ze_synchronized_dispatch_exp_flags_t flags; ///< [in] mode flags.
+                                                ///< must be valid value of ::ze_synchronized_dispatch_exp_flag_t
+
+} ze_synchronized_dispatch_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_intel_media_communication_desc_t
+typedef struct _ze_intel_media_communication_desc_t ze_intel_media_communication_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief ze_intel_media_communication_desc_t
+typedef struct _ze_intel_media_communication_desc_t {
+    ze_structure_type_ext_t stype;          ///< [in] type of this structure
+    void *pNext;                            ///< [in][optional] must be null or a pointer to an extension-specific, this will be used to extend this in future
+    void *controlSharedMemoryBuffer;        ///< [in] control shared memory buffer pointer, must be USM address
+    uint32_t controlSharedMemoryBufferSize; ///< [in] control shared memory buffer size
+    void *controlBatchBuffer;               ///< [in] control batch buffer pointer, must be USM address
+    uint32_t controlBatchBufferSize;        ///< [in] control batch buffer size
+} ze_intel_media_communication_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_intel_media_doorbell_handle_desc_t
+typedef struct _ze_intel_media_doorbell_handle_desc_t ze_intel_media_doorbell_handle_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief ze_intel_media_doorbell_handle_desc_t
+/// @details Handle of the doorbell. This structure is passed as argument of zeIntelMediaCommunicationCreate and zeIntelMediaCommunicationDestroy
+typedef struct _ze_intel_media_doorbell_handle_desc_t {
+    ze_structure_type_ext_t stype; ///< [in] type of this structure
+    void *pNext;                   ///< [in][optional] must be null or a pointer to an extension-specific, this will be used to extend this in future
+    void *doorbell;                ///< [in,out] handle of the doorbell
+} ze_intel_media_doorbell_handle_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Supported device media flags
+typedef uint32_t ze_intel_device_media_exp_flags_t;
+typedef enum _ze_intel_device_media_exp_flag_t {
+    ZE_INTEL_DEVICE_MEDIA_SUPPORTS_ENCODING_EXP_FLAG = ZE_BIT(0), ///< Supports encoding
+    ZE_INTEL_DEVICE_MEDIA_SUPPORTS_DECODING_EXP_FLAG = ZE_BIT(1), ///< Supports decoding
+    ZE_INTEL_DEVICE_MEDIA_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+} ze_intel_device_media_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_intel_device_media_exp_properties_t
+typedef struct _ze_intel_device_media_exp_properties_t ze_intel_device_media_exp_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief  May be passed to ze_device_properties_t through pNext.
+typedef struct _ze_intel_device_media_exp_properties_t {
+    ze_structure_type_ext_t stype;           ///< [in] type of this structure
+    const void *pNext;                       ///< [in][optional] must be null or a pointer to an extension-specific
+    ze_intel_device_media_exp_flags_t flags; ///< [out] device media flags
+    uint32_t numEncoderCores;                ///< [out] number of encoder cores
+    uint32_t numDecoderCores;                ///< [out] number of decoder cores
+} ze_intel_device_media_exp_properties_t;
+
+#ifndef ZEX_COUNTER_BASED_EVENT_EXT_NAME
+/// @brief Counter Based Event Extension Name
+#define ZEX_COUNTER_BASED_EVENT_EXT_NAME "ZEX_counter_based_event"
+#endif // ZEX_COUNTER_BASED_EVENT_EXT_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Counter Based Event Extension Version(s)
+typedef enum _zex_counter_based_event_version_t {
+    ZEX_COUNTER_BASED_EVENT_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZEX_COUNTER_BASED_EVENT_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZEX_COUNTER_BASED_EVENT_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} zex_counter_based_event_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief IPC handle to counter based event
+typedef struct _zex_ipc_counter_based_event_handle_t {
+    char data[ZE_MAX_IPC_HANDLE_SIZE]; ///< [out] Opaque data representing an IPC handle
+} zex_ipc_counter_based_event_handle_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Supported event flags for defining counter-based event
+typedef uint32_t zex_counter_based_event_exp_flags_t;
+typedef enum _zex_counter_based_event_exp_flag_t {
+    ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE = ZE_BIT(0),               ///< Counter-based event is used for immediate command lists (default)
+    ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE = ZE_BIT(1),           ///< Counter-based event is used for non-immediate command lists
+    ZEX_COUNTER_BASED_EVENT_FLAG_HOST_VISIBLE = ZE_BIT(2),            ///< Signals and waits are also visible to host
+    ZEX_COUNTER_BASED_EVENT_FLAG_IPC = ZE_BIT(3),                     ///< Event can be shared across processes for waiting
+    ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP = ZE_BIT(4),        ///< Event contains kernel timestamps
+    ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_MAPPED_TIMESTAMP = ZE_BIT(5), ///< Event contains kernel timestamps synchronized to host time domain.
+                                                                      ///< Cannot be combined with::ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP
+    ZEX_COUNTER_BASED_EVENT_FLAG_GRAPH_EXTERNAL_EVENT = ZE_BIT(6),    ///< Event when is used in graph record & replay, can be used outside
+                                                                      ///< recorded graph for synchronization (using as wait event or for host synchronization)
+    ZEX_COUNTER_BASED_EVENT_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} zex_counter_based_event_exp_flag_t;
+
+typedef struct _zex_counter_based_event_desc_t {
+    ze_structure_type_ext_t stype;             ///< [in] type of this structure
+    const void *pNext;                         ///< [in][optional] must be null or a pointer to an extension-specific
+    zex_counter_based_event_exp_flags_t flags; ///< [in] counter based event flags.
+                                               ///< Must be 0 (default) or a valid combination of ::zex_counter_based_event_exp_flag_t
+    ze_event_scope_flags_t signalScope;        ///< [in] defines the scope of relevant cache hierarchies to flush on a
+                                               ///< signal action before the event is triggered.
+                                               ///< must be 0 (default) or a valid combination of ::ze_event_scope_flag_t;
+                                               ///< default behavior is synchronization within the command list only, no
+                                               ///< additional cache hierarchies are flushed.
+    ze_event_scope_flags_t waitScope;          ///< [in] defines the scope of relevant cache hierarchies to invalidate on
+                                               ///< a wait action after the event is complete.
+                                               ///< must be 0 (default) or a valid combination of ::ze_event_scope_flag_t;
+                                               ///< default behavior is synchronization within the command list only, no
+                                               ///< additional cache hierarchies are invalidated.
+} zex_counter_based_event_desc_t;
+
+static const zex_counter_based_event_desc_t defaultIntelCounterBasedEventDesc = {
+    ZEX_STRUCTURE_COUNTER_BASED_EVENT_DESC, // stype
+    nullptr,                                // pNext
+    ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE |
+        ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE |
+        ZEX_COUNTER_BASED_EVENT_FLAG_HOST_VISIBLE, // flags
+    ZE_EVENT_SCOPE_FLAG_HOST,                      // signalScope
+    ZE_EVENT_SCOPE_FLAG_DEVICE                     // waitScope
+};
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Initial Counter Based Event synchronization parameters. This structure may be
+///        passed as pNext member of ::zex_counter_based_event_desc_t.
+typedef struct _zex_counter_based_event_external_sync_alloc_properties_t {
+    ze_structure_type_ext_t stype; ///< [in] type of this structure
+    const void *pNext;             ///< [in][optional] must be null or a pointer to an extension-specific
+    uint64_t *deviceAddress;       ///< [in] device address for external synchronization allocation
+    uint64_t *hostAddress;         ///< [in] host address for external synchronization allocation
+    uint64_t completionValue;      ///< [in] completion value for external synchronization allocation
+} zex_counter_based_event_external_sync_alloc_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Initial Counter Based Event synchronization parameters. This structure may be
+///        passed as pNext member of ::zex_counter_based_event_desc_t.
+typedef struct _zex_counter_based_event_external_storage_properties_t {
+    ze_structure_type_ext_t stype; ///< [in] type of this structure
+    const void *pNext;             ///< [in][optional] must be null or a pointer to an extension-specific
+    uint64_t *deviceAddress;       ///< [in] device address that would be updated with atomic_add upon signaling of this event, must be device USM memory
+    uint64_t incrementValue;       ///< [in] value which would by atomically added upon each completion
+    uint64_t completionValue;      ///< [in] final completion value, when value under deviceAddress is equal or greater then this value then event is considered as completed
+} zex_counter_based_event_external_storage_properties_t;
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+#endif // _ZEX_COMMON_EXTENDED_H
diff --git a/thirdparty/level_zero/zex_event.h b/thirdparty/level_zero/zex_event.h
new file mode 100644
index 00000000000000..68b0bfe18bd573
--- /dev/null
+++ b/thirdparty/level_zero/zex_event.h
@@ -0,0 +1,58 @@
+// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac
+/*
+ * Copyright (C) 2023-2025 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#ifndef _ZEX_EVENT_H
+#define _ZEX_EVENT_H
+#if defined(__cplusplus)
+#pragma once
+#endif
+
+#include <level_zero/ze_api.h>
+
+#include "zex_common.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zexEventGetDeviceAddress(
+    ze_event_handle_t event,
+    uint64_t *completionValue,
+    uint64_t *address);
+
+// deprecated
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zexCounterBasedEventCreate(
+    ze_context_handle_t hContext,
+    ze_device_handle_t hDevice,
+    uint64_t *deviceAddress,
+    uint64_t *hostAddress,
+    uint64_t completionValue,
+    const ze_event_desc_t *desc,
+    ze_event_handle_t *phEvent);
+
+ZE_APIEXPORT ze_result_t ZE_APICALL zexIntelAllocateNetworkInterrupt(ze_context_handle_t hContext, uint32_t &networkInterruptId);
+
+ZE_APIEXPORT ze_result_t ZE_APICALL zexIntelReleaseNetworkInterrupt(ze_context_handle_t hContext, uint32_t networkInterruptId);
+
+ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventCreate2(ze_context_handle_t hContext, ze_device_handle_t hDevice, const zex_counter_based_event_desc_t *desc, ze_event_handle_t *phEvent);
+
+ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventGetIpcHandle(ze_event_handle_t hEvent, zex_ipc_counter_based_event_handle_t *phIpc);
+
+ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventOpenIpcHandle(ze_context_handle_t hContext, zex_ipc_counter_based_event_handle_t hIpc, ze_event_handle_t *phEvent);
+
+ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventCloseIpcHandle(ze_event_handle_t hEvent);
+
+ZE_APIEXPORT ze_result_t ZE_APICALL zexDeviceGetAggregatedCopyOffloadIncrementValue(ze_device_handle_t hDevice, uint32_t *incrementValue);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+#endif // _ZEX_EVENT_H

From 3993c80ef44838345e78ed9a3c19c33b17af8ec8 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 26 Sep 2025 11:33:57 +0000
Subject: [PATCH 24/74] Fix includes

---
 thirdparty/level_zero/ze_intel_gpu.h | 2 +-
 thirdparty/level_zero/ze_stypes.h    | 6 +++---
 thirdparty/level_zero/zex_common.h   | 4 ++--
 thirdparty/level_zero/zex_event.h    | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/thirdparty/level_zero/ze_intel_gpu.h b/thirdparty/level_zero/ze_intel_gpu.h
index 350be4de550d74..d7c9f3f56aa6bf 100644
--- a/thirdparty/level_zero/ze_intel_gpu.h
+++ b/thirdparty/level_zero/ze_intel_gpu.h
@@ -9,7 +9,7 @@
 #ifndef _ZE_INTEL_GPU_H
 #define _ZE_INTEL_GPU_H
 
-#include <level_zero/ze_api.h>
+#include <ze_api.h>
 
 #include "ze_stypes.h"
 
diff --git a/thirdparty/level_zero/ze_stypes.h b/thirdparty/level_zero/ze_stypes.h
index f0557d8dbc0c83..50c9dbaa4e35c2 100644
--- a/thirdparty/level_zero/ze_stypes.h
+++ b/thirdparty/level_zero/ze_stypes.h
@@ -9,9 +9,9 @@
 #ifndef _ZE_STYPES_H
 #define _ZE_STYPES_H
 
-#include <level_zero/ze_api.h>
-#include <level_zero/zes_api.h>
-#include <level_zero/zet_api.h>
+#include <ze_api.h>
+#include <zes_api.h>
+#include <zet_api.h>
 
 #include <cstdint>
 using ze_structure_type_ext_t = uint32_t;
diff --git a/thirdparty/level_zero/zex_common.h b/thirdparty/level_zero/zex_common.h
index 8a45573aaa164a..7c8f050c38166e 100644
--- a/thirdparty/level_zero/zex_common.h
+++ b/thirdparty/level_zero/zex_common.h
@@ -11,8 +11,8 @@
 #if defined(__cplusplus)
 #pragma once
 #endif
-#include "level_zero/ze_stypes.h"
-#include <level_zero/ze_api.h>
+#include "ze_stypes.h"
+#include <ze_api.h>
 
 #if defined(__cplusplus)
 extern "C" {
diff --git a/thirdparty/level_zero/zex_event.h b/thirdparty/level_zero/zex_event.h
index 68b0bfe18bd573..ec38aaeba9bd80 100644
--- a/thirdparty/level_zero/zex_event.h
+++ b/thirdparty/level_zero/zex_event.h
@@ -12,7 +12,7 @@
 #pragma once
 #endif
 
-#include <level_zero/ze_api.h>
+#include <ze_api.h>
 
 #include "zex_common.h"
 

From 078d3a3397ceb93dd88ca03b7390a76a021cfc42 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 26 Sep 2025 14:49:42 +0000
Subject: [PATCH 25/74] Add initial counter based event implementation

---
 .../src/runtime/ze/ze_cb_event_manager.cpp    | 43 ++++++++++++++++
 .../src/runtime/ze/ze_cb_event_manager.hpp    | 22 ++++++++
 .../intel_gpu/src/runtime/ze/ze_event.cpp     | 16 +++---
 .../intel_gpu/src/runtime/ze/ze_event.hpp     |  9 ++--
 .../src/runtime/ze/ze_event_manager.hpp       | 32 ++++++++++++
 .../src/runtime/ze/ze_event_pool.cpp          | 50 -------------------
 .../src/runtime/ze/ze_event_pool.hpp          | 20 +-------
 .../src/runtime/ze/ze_event_pool_manager.cpp  | 45 +++++++++++++++++
 .../src/runtime/ze/ze_event_pool_manager.hpp  | 26 ++++++++++
 .../intel_gpu/src/runtime/ze/ze_stream.cpp    | 19 +++++--
 .../intel_gpu/src/runtime/ze/ze_stream.hpp    |  5 +-
 11 files changed, 198 insertions(+), 89 deletions(-)
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp
new file mode 100644
index 00000000000000..20cc68518a37a6
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_cb_event_manager.hpp"
+#include "ze_common.hpp"
+#include "ze_event.hpp"
+
+#include "zex_event.h"
+
+using namespace cldnn;
+using namespace ze;
+namespace {
+    decltype(zexCounterBasedEventCreate2) *func_zexCounterBasedEventCreate2 = nullptr;
+    void find_function_address(ze_driver_handle_t driver) {
+        ZE_CHECK(zeDriverGetExtensionFunctionAddress(driver,
+                                                "zexCounterBasedEventCreate2",
+                                                reinterpret_cast<void **>(&func_zexCounterBasedEventCreate2)));
+    }
+}
+
+ze_cb_event_manager::ze_cb_event_manager(const ze_engine &engine, bool enable_profiling)
+    : ze_event_manager(engine, enable_profiling) {
+    if (func_zexCounterBasedEventCreate2 == nullptr) {
+        find_function_address(engine.get_driver());
+    }
+}
+
+ze_cb_event_manager::~ze_cb_event_manager() {}
+
+std::shared_ptr<ze_event> ze_cb_event_manager::create_event(uint64_t queue_stamp) {
+    ze_event_handle_t event;
+    auto desc = defaultIntelCounterBasedEventDesc;
+    if (m_enable_profiling) {
+        desc.flags |= ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP;
+    }
+    ZE_CHECK(func_zexCounterBasedEventCreate2(m_engine.get_context(), m_engine.get_device(), &desc, &event));
+    return std::make_shared<ze_event>(this, event, queue_stamp);
+}
+
+void ze_cb_event_manager::destroy_event(ze_event *event) {
+    zeEventDestroy(event->get());
+}
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp
new file mode 100644
index 00000000000000..5e0c2a753cf271
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp
@@ -0,0 +1,22 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_event_manager.hpp"
+
+namespace cldnn {
+namespace ze {
+
+// Interface for creating and destroying l0 counter based events
+// Should only be used with in-order queue
+struct ze_cb_event_manager : public ze_event_manager {
+public:
+    ze_cb_event_manager(const ze_engine &engine, bool enable_profiling);
+    ~ze_cb_event_manager();
+    std::shared_ptr<ze_event> create_event(uint64_t queue_stamp) override;
+    void destroy_event(ze_event *event) override;
+};
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
index c5e39be435798f..fd65c57805dffb 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
@@ -47,11 +47,8 @@ bool ze_event::is_set_impl() {
 }
 
 bool ze_event::is_profiled() const {
-    if (m_event != nullptr) {
-        ze_event_pool_flags_t event_pool_flags;
-        auto ev_pool = m_event_pool.get()->m_handle;
-        ZE_CHECK(zeEventPoolGetFlags(ev_pool, &event_pool_flags));
-        return (event_pool_flags & ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP) != 0;
+    if (m_event_manager) {
+        return m_event_manager->is_profiling_enabled();
     }
     return false;
 }
@@ -61,7 +58,7 @@ bool ze_event::get_profiling_info_impl(std::list<instrumentation::profiling_inte
         return true;
     }
 
-    const auto& engine = m_event_pool->m_engine;
+    const auto& engine = m_event_manager->get_engine();
     auto device_info = engine.get_device_info();
 
     ze_kernel_timestamp_result_t timestamp{};
@@ -146,7 +143,7 @@ bool ze_events::get_profiling_info_impl(std::list<instrumentation::profiling_int
     if (_events.empty())
         return false;
 
-    const auto& engine = downcast<ze_event>(_events.front().get())->m_event_pool->m_engine;
+    const auto& engine = downcast<ze_event>(_events.front().get())->m_event_manager->get_engine();
     auto device_info = engine.get_device_info();
 
     auto get_total_exec_time = [&device_info](std::vector<ze_kernel_timestamp_data_t>& all_timestamps) {
@@ -210,7 +207,6 @@ bool ze_events::get_profiling_info_impl(std::list<instrumentation::profiling_int
 }
 
 ze_event::~ze_event() {
-    if (m_event != nullptr) {
-        zeEventDestroy(m_event);
-    }
+    if (m_event_manager != nullptr)
+        m_event_manager->destroy_event(this);
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
index 5120cf9120ec29..b10d2808698775 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
@@ -5,6 +5,7 @@
 #pragma once
 
 #include "ze_base_event.hpp"
+#include "ze_event_manager.hpp"
 #include "ze_event_pool.hpp"
 
 #include <vector>
@@ -15,9 +16,10 @@ namespace ze {
 
 struct ze_event : public ze_base_event {
 public:
-    ze_event(ze_event_pool::ptr ev_pool, ze_event_handle_t ev, uint64_t queue_stamp = 0)
+    ze_event(ze_event_manager *ev_manager, ze_event_handle_t ev, uint64_t queue_stamp = 0, std::shared_ptr<ze_event_pool> event_pool = nullptr)
         : ze_base_event(queue_stamp)
-        , m_event_pool(ev_pool)
+        , m_event_manager(ev_manager)
+        , m_event_pool(event_pool)
         , m_event(ev) {}
 
     ze_event_handle_t get() override { return m_event; }
@@ -34,7 +36,8 @@ struct ze_event : public ze_base_event {
     friend struct ze_events;
 
 protected:
-    ze_event_pool::ptr m_event_pool;
+    ze_event_manager *m_event_manager;
+    std::shared_ptr<ze_event_pool> m_event_pool = nullptr;
     ze_event_handle_t m_event;
 };
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp
new file mode 100644
index 00000000000000..f2bc0975d7b181
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+
+#include "ze_engine.hpp"
+
+namespace cldnn {
+namespace ze {
+
+struct ze_event;
+
+// Interface for creating and destroying Level Zero events
+struct ze_event_manager {
+public:
+    using ptr = std::shared_ptr<ze_event_manager>;
+    ze_event_manager(const ze_engine &engine, bool enable_profiling) : m_engine(engine), m_enable_profiling(enable_profiling) {}
+    const ze_engine& get_engine() const { return m_engine; }
+    bool is_profiling_enabled() const { return m_enable_profiling; }
+
+    virtual ~ze_event_manager() {}
+    virtual std::shared_ptr<ze_event> create_event(uint64_t queue_stamp) = 0;
+    virtual void destroy_event(ze_event *event) = 0;
+protected:
+    const ze_engine& m_engine;
+    bool m_enable_profiling;
+};
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp
index 1ae696e0d10824..c83179bf6d720d 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp
@@ -24,55 +24,5 @@ ze_event_pool::ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_even
 ze_event_pool::~ze_event_pool() {
     zeEventPoolDestroy(m_handle);
 }
-
-ze_events_pool::ze_events_pool(const ze_engine& engine, bool enable_profiling)
-    : m_engine(engine)
-    , m_enable_profiling(enable_profiling) { }
-
-std::shared_ptr<ze_event> ze_events_pool::create_event(uint64_t queue_stamp) {
-    if (m_num_used >= m_capacity || !m_current_pool) {
-        m_num_used = 0;
-        ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0;
-        flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
-        m_current_pool = std::make_shared<ze_event_pool>(m_engine, m_capacity, flags);
-    }
-
-    ze_event_handle_t event;
-    // set signal scope to host to allow wait on the host
-    // TODO: avoid setting scope when event is not used for wait on the host
-    ze_event_desc_t event_desc = {
-        ZE_STRUCTURE_TYPE_EVENT_DESC,
-        nullptr,
-        m_num_used++,
-        ZE_EVENT_SCOPE_FLAG_HOST,
-        0
-    };
-    ZE_CHECK(zeEventCreate(m_current_pool->m_handle, &event_desc, &event));
-
-    return std::make_shared<ze_event>(m_current_pool, event, queue_stamp);
-}
-
-std::shared_ptr<ze_event> ze_events_pool::create_user_event() {
-    if (m_num_used_user >= m_capacity || !m_current_user_pool) {
-        m_num_used_user = 0;
-        ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0;
-        flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
-        m_current_user_pool = std::make_shared<ze_event_pool>(m_engine, m_capacity, flags);
-    }
-    // set signal scope to host to allow wait on the host
-    // TODO: avoid setting scope when event is not used for wait on the host
-    ze_event_handle_t event;
-    ze_event_desc_t event_desc = {
-        ZE_STRUCTURE_TYPE_EVENT_DESC,
-        nullptr,
-        m_num_used_user++,
-        ZE_EVENT_SCOPE_FLAG_HOST,
-        0
-    };
-    ZE_CHECK(zeEventCreate(m_current_user_pool->m_handle, &event_desc, &event));
-
-    return std::make_shared<ze_event>(m_current_user_pool, event);
-}
-
 }  // namespace ze
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp
index 197e41b862a422..cec1448cf8be8d 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp
@@ -23,24 +23,6 @@ struct ze_event_pool {
     const ze_engine& m_engine;
 };
 
-// Helper for events pool management
-// Can hold multiple ze_event_pool objects and track their capacity with realloc when it's needed
-struct ze_events_pool {
-public:
-    ze_events_pool(const ze_engine& engine, bool enable_profiling);
-
-    std::shared_ptr<ze_event> create_event(uint64_t queue_stamp = 0);
-    std::shared_ptr<ze_event> create_user_event();
-
-protected:
-    const ze_engine& m_engine;
-    std::shared_ptr<ze_event_pool> m_current_user_pool = nullptr;
-    std::shared_ptr<ze_event_pool> m_current_pool = nullptr;
-    const uint32_t m_capacity = 100;
-    uint32_t m_num_used = 0;
-    uint32_t m_num_used_user = 0;
-    const bool m_enable_profiling;
-};
-
 }  // namespace ze
 }  // namespace cldnn
+
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp
new file mode 100644
index 00000000000000..4cd744feb77c21
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp
@@ -0,0 +1,45 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_event_pool_manager.hpp"
+#include "ze_common.hpp"
+#include "ze_event.hpp"
+
+#include "zex_event.h"
+
+using namespace cldnn;
+using namespace ze;
+
+ze_event_pool_manager::ze_event_pool_manager(const ze_engine &engine, bool enable_profiling, uint32_t capacity)
+: ze_event_manager(engine, enable_profiling)
+, m_current_pool(nullptr)
+, m_capacity(capacity)
+, m_num_used(0) {}
+
+ze_event_pool_manager::~ze_event_pool_manager() {}
+
+std::shared_ptr<ze_event> ze_event_pool_manager::create_event(uint64_t queue_stamp) {
+    if (m_num_used >= m_capacity || !m_current_pool) {
+        m_num_used = 0;
+        ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0;
+        flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+        m_current_pool = std::make_shared<ze_event_pool>(m_engine, m_capacity, flags);
+    }
+
+    ze_event_handle_t event;
+    ze_event_desc_t event_desc = {
+        ZE_STRUCTURE_TYPE_EVENT_DESC,
+        nullptr,
+        m_num_used++,
+        ZE_EVENT_SCOPE_FLAG_HOST,
+        0
+    };
+    ZE_CHECK(zeEventCreate(m_current_pool->m_handle, &event_desc, &event));
+
+    return std::make_shared<ze_event>(this, event, queue_stamp, m_current_pool);
+}
+
+void ze_event_pool_manager::destroy_event(ze_event *event) {
+    zeEventDestroy(event->get());
+}
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp
new file mode 100644
index 00000000000000..1fde599001c7fd
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp
@@ -0,0 +1,26 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_event_manager.hpp"
+#include "ze_event_pool.hpp"
+
+namespace cldnn {
+namespace ze {
+
+// Interface for creating and destroying l0 events using event pools
+struct ze_event_pool_manager : public ze_event_manager {
+public:
+    ze_event_pool_manager(const ze_engine &engine, bool enable_profiling, uint32_t capacity = 255);
+    ~ze_event_pool_manager();
+    std::shared_ptr<ze_event> create_event(uint64_t queue_stamp) override;
+    void destroy_event(ze_event *event) override;
+protected:
+    std::shared_ptr<ze_event_pool> m_current_pool;
+    const uint32_t m_capacity;
+    uint32_t m_num_used;
+};
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index 1e865cb421d252..42450c7f6256c5 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -8,7 +8,11 @@
 #include "openvino/core/except.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/runtime/properties.hpp"
-#include "ze_event_pool.hpp"
+
+#include "ze_event_manager.hpp"
+#include "ze_cb_event_manager.hpp"
+#include "ze_event_pool_manager.hpp"
+
 #include "ze_event.hpp"
 #include "ze_kernel.hpp"
 #include "ze_memory.hpp"
@@ -185,8 +189,7 @@ void set_arguments_impl(ze_kernel_handle_t kernel,
 
 ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     : stream(config.get_queue_type(), stream::get_expected_sync_method(config))
-    , _engine(engine)
-    , m_pool(engine, config.get_enable_profiling()) {
+    , _engine(engine) {
     const auto &info = engine.get_device_info();
 
     ze_command_queue_desc_t command_queue_desc = {};
@@ -201,6 +204,11 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
     command_queue_desc.ordinal = info.copy_queue_group_ordinal;
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list));
+    if (m_queue_type == QueueTypes::in_order) {
+        m_ev_manager = std::make_unique<ze_cb_event_manager>(engine, config.get_enable_profiling());
+    } else {
+        m_ev_manager = std::make_unique<ze_event_pool_manager>(engine, config.get_enable_profiling());
+    }
 }
 
 ze_stream::~ze_stream() {
@@ -306,7 +314,8 @@ void ze_stream::wait() {
 }
 
 event::ptr ze_stream::create_user_event(bool set) {
-    auto ev = m_pool.create_user_event();
+    // user event should use different api
+    auto ev = m_ev_manager->create_event(++m_queue_counter);
     if (set)
         ev->set();
 
@@ -314,7 +323,7 @@ event::ptr ze_stream::create_user_event(bool set) {
 }
 
 event::ptr ze_stream::create_base_event() {
-    return m_pool.create_event(++m_queue_counter);
+    return m_ev_manager->create_event(++m_queue_counter);
 }
 
 void ze_stream::flush() const {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
index e490ee67e864ec..2888ab696cec0d 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
@@ -9,6 +9,7 @@
 #include "ze_common.hpp"
 #include "ze_engine.hpp"
 #include "ze_event.hpp"
+#include "ze_event_manager.hpp"
 
 namespace cldnn {
 namespace ze {
@@ -27,7 +28,7 @@ class ze_stream : public stream {
         , m_queue_counter(other.m_queue_counter.load())
         , m_last_barrier(other.m_last_barrier.load())
         , m_last_barrier_ev(other.m_last_barrier_ev)
-        , m_pool(other.m_pool) {
+        , m_ev_manager(other.m_ev_manager.release()) {
             other.m_command_list = nullptr;
             other.m_copy_command_list = nullptr;
         }
@@ -64,7 +65,7 @@ class ze_stream : public stream {
     mutable std::atomic<uint64_t> m_queue_counter{0};
     std::atomic<uint64_t> m_last_barrier{0};
     std::shared_ptr<ze_event> m_last_barrier_ev = nullptr;
-    ze_events_pool m_pool;
+    std::unique_ptr<ze_event_manager> m_ev_manager;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     std::shared_ptr<dnnl::stream> _onednn_stream = nullptr;

From 08c1555d02fae79cc8d5813f5ad87c1aaf45d969 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 3 Oct 2025 15:19:43 +0000
Subject: [PATCH 26/74] workaround for cb event host signal

---
 .../intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp        | 4 ++--
 .../intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp        | 2 +-
 src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp           | 6 +++++-
 src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp   | 5 ++++-
 .../intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp      | 4 ++--
 .../intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp      | 2 +-
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp          | 4 ++--
 7 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp
index 20cc68518a37a6..936f25a6a70822 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp
@@ -19,8 +19,8 @@ namespace {
     }
 }
 
-ze_cb_event_manager::ze_cb_event_manager(const ze_engine &engine, bool enable_profiling)
-    : ze_event_manager(engine, enable_profiling) {
+ze_cb_event_manager::ze_cb_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling)
+    : ze_event_manager(engine, cmd_list, enable_profiling) {
     if (func_zexCounterBasedEventCreate2 == nullptr) {
         find_function_address(engine.get_driver());
     }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp
index 5e0c2a753cf271..a5b43580da2b44 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp
@@ -13,7 +13,7 @@ namespace ze {
 // Should only be used with in-order queue
 struct ze_cb_event_manager : public ze_event_manager {
 public:
-    ze_cb_event_manager(const ze_engine &engine, bool enable_profiling);
+    ze_cb_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling);
     ~ze_cb_event_manager();
     std::shared_ptr<ze_event> create_event(uint64_t queue_stamp) override;
     void destroy_event(ze_event *event) override;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
index fd65c57805dffb..d7df10b74371c7 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
@@ -35,7 +35,11 @@ void ze_event::wait_impl() {
 
 void ze_event::set_impl() {
     if (m_event != nullptr) {
-        ZE_CHECK(zeEventHostSignal(m_event));
+        if (m_event_manager != nullptr) {
+            zeCommandListAppendSignalEvent(m_event_manager->get_cmd_list(), m_event);
+        } else {
+            ZE_CHECK(zeEventHostSignal(m_event));// Does not work with cb events
+        }
     }
 }
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp
index f2bc0975d7b181..c852fc295b885c 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp
@@ -17,15 +17,18 @@ struct ze_event;
 struct ze_event_manager {
 public:
     using ptr = std::shared_ptr<ze_event_manager>;
-    ze_event_manager(const ze_engine &engine, bool enable_profiling) : m_engine(engine), m_enable_profiling(enable_profiling) {}
+    ze_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling)
+    : m_engine(engine), m_cmd_list(cmd_list), m_enable_profiling(enable_profiling) {}
     const ze_engine& get_engine() const { return m_engine; }
     bool is_profiling_enabled() const { return m_enable_profiling; }
+    ze_command_list_handle_t get_cmd_list() { return m_cmd_list; }
 
     virtual ~ze_event_manager() {}
     virtual std::shared_ptr<ze_event> create_event(uint64_t queue_stamp) = 0;
     virtual void destroy_event(ze_event *event) = 0;
 protected:
     const ze_engine& m_engine;
+    ze_command_list_handle_t m_cmd_list;
     bool m_enable_profiling;
 };
 }  // namespace ze
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp
index 4cd744feb77c21..a437eb69d54c2a 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp
@@ -11,8 +11,8 @@
 using namespace cldnn;
 using namespace ze;
 
-ze_event_pool_manager::ze_event_pool_manager(const ze_engine &engine, bool enable_profiling, uint32_t capacity)
-: ze_event_manager(engine, enable_profiling)
+ze_event_pool_manager::ze_event_pool_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling, uint32_t capacity)
+: ze_event_manager(engine, cmd_list, enable_profiling)
 , m_current_pool(nullptr)
 , m_capacity(capacity)
 , m_num_used(0) {}
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp
index 1fde599001c7fd..1cac8585e2ea19 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp
@@ -13,7 +13,7 @@ namespace ze {
 // Interface for creating and destroying l0 events using event pools
 struct ze_event_pool_manager : public ze_event_manager {
 public:
-    ze_event_pool_manager(const ze_engine &engine, bool enable_profiling, uint32_t capacity = 255);
+    ze_event_pool_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling, uint32_t capacity = 255);
     ~ze_event_pool_manager();
     std::shared_ptr<ze_event> create_event(uint64_t queue_stamp) override;
     void destroy_event(ze_event *event) override;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index 42450c7f6256c5..374f5dbb342985 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -205,9 +205,9 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     command_queue_desc.ordinal = info.copy_queue_group_ordinal;
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list));
     if (m_queue_type == QueueTypes::in_order) {
-        m_ev_manager = std::make_unique<ze_cb_event_manager>(engine, config.get_enable_profiling());
+        m_ev_manager = std::make_unique<ze_cb_event_manager>(engine, m_command_list, config.get_enable_profiling());
     } else {
-        m_ev_manager = std::make_unique<ze_event_pool_manager>(engine, config.get_enable_profiling());
+        m_ev_manager = std::make_unique<ze_event_pool_manager>(engine, m_command_list, config.get_enable_profiling());
     }
 }
 

From 31311c77b050cc81006ecffd3e7ba6fed4956fcb Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 9 Oct 2025 10:04:45 +0000
Subject: [PATCH 27/74] Rework level zero event implementation

---
 .../include/intel_gpu/runtime/device_info.hpp |   3 +-
 .../intel_gpu/src/runtime/ocl/ocl_device.cpp  |   1 +
 .../src/runtime/ze/ze_base_event.hpp          |  30 ++-
 .../src/runtime/ze/ze_base_event_factory.hpp  |  28 +++
 .../intel_gpu/src/runtime/ze/ze_cb_event.cpp  |  72 ++++++
 .../intel_gpu/src/runtime/ze/ze_cb_event.hpp  |  47 ++++
 ...nt_manager.cpp => ze_cb_event_factory.cpp} |  21 +-
 .../src/runtime/ze/ze_cb_event_factory.hpp    |  20 ++
 .../src/runtime/ze/ze_cb_event_manager.hpp    |  22 --
 .../intel_gpu/src/runtime/ze/ze_common.hpp    |   7 +
 .../intel_gpu/src/runtime/ze/ze_device.cpp    |   3 +
 .../src/runtime/ze/ze_empty_event.hpp         |  31 +++
 .../intel_gpu/src/runtime/ze/ze_event.cpp     | 205 +++---------------
 .../intel_gpu/src/runtime/ze/ze_event.hpp     |  88 ++------
 ..._pool_manager.cpp => ze_event_factory.cpp} |  20 +-
 .../src/runtime/ze/ze_event_factory.hpp       |  24 ++
 .../src/runtime/ze/ze_event_manager.hpp       |  35 ---
 .../src/runtime/ze/ze_event_pool.hpp          |   9 +-
 .../src/runtime/ze/ze_event_pool_manager.hpp  |  26 ---
 .../intel_gpu/src/runtime/ze/ze_events.cpp    | 155 +++++++++++++
 .../intel_gpu/src/runtime/ze/ze_events.hpp    |  70 ++++++
 .../intel_gpu/src/runtime/ze/ze_memory.cpp    |  10 +-
 .../intel_gpu/src/runtime/ze/ze_stream.cpp    |  44 ++--
 .../intel_gpu/src/runtime/ze/ze_stream.hpp    |   6 +-
 24 files changed, 588 insertions(+), 389 deletions(-)
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp
 rename src/plugins/intel_gpu/src/runtime/ze/{ze_cb_event_manager.cpp => ze_cb_event_factory.cpp} (63%)
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp
 delete mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp
 rename src/plugins/intel_gpu/src/runtime/ze/{ze_event_pool_manager.cpp => ze_event_factory.cpp} (52%)
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp
 delete mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp
 delete mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
index 227b51d9b2ecb3..48ca01cb64f1cf 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
@@ -115,12 +115,13 @@ struct device_info {
     bool supports_imad;                         ///< Does engine support int8 mad.
     bool supports_immad;                        ///< Does engine support int8 multi mad.
 
-    bool supports_mutable_command_list;         ///< Does the target runtime/device support mutable command list feature
+    bool supports_mutable_command_list;         ///< [L0] Does the target runtime/device support mutable command list feature
 
     bool supports_usm;                          ///< Does engine support unified shared memory.
     bool has_separate_cache;                    ///< Does the target hardware has separate cache for usm_device and usm_host
 
     bool supports_cp_offload;                   ///< [L0] Does the command queue support copy offload
+    bool supports_cb_events;                    ///< [L0] Does the target runtime support counter based events
 
     std::vector<size_t> supported_simd_sizes;   ///< List of SIMD sizes supported by current device and compiler
 
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
index 09a8d8794d1c68..bacddd616d948e 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
@@ -349,6 +349,7 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex
     info.compute_queue_group_ordinal = 0;
     info.device_memory_ordinal = 0;
     info.supports_cp_offload = false;
+    info.supports_cb_events = false;
 
 #if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT)
     using namespace dnnl::impl::gpu::intel::jit;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp
index 51c69202678fba..0f121e0b8e45b6 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp
@@ -5,20 +5,40 @@
 #pragma once
 
 #include "intel_gpu/runtime/event.hpp"
+#include "ze_base_event_factory.hpp"
+
 #include <ze_api.h>
+#include <chrono>
+#include <optional>
 
 namespace cldnn {
 namespace ze {
 
+// Base interface for Level Zero events
 struct ze_base_event : public event {
 public:
-    explicit ze_base_event(uint64_t queue_stamp = 0) : event(), _queue_stamp(queue_stamp) { }
-    uint64_t get_queue_stamp() const { return _queue_stamp; }
-    void set_queue_stamp(uint64_t val) { _queue_stamp = val; }
-    virtual ze_event_handle_t get() = 0;
+    explicit ze_base_event(uint64_t queue_stamp)
+    : event()
+    , m_queue_stamp(queue_stamp) { }
+    uint64_t get_queue_stamp() const { return m_queue_stamp; }
+    void set_queue_stamp(uint64_t val) { m_queue_stamp = val; }
+
+    virtual ze_event_handle_t get_handle() const = 0;
+    virtual std::optional<ze_kernel_timestamp_result_t> query_timestamp() = 0;
 
 protected:
-    uint64_t _queue_stamp = 0;
+    uint64_t m_queue_stamp = 0;
+
+    static std::chrono::nanoseconds timestamp_to_duration(const device_info &info, const ze_kernel_timestamp_data_t& timestamp) {
+        constexpr double NS_IN_SEC = 1000000000.0;
+        const double timestamp_freq = NS_IN_SEC / info.timer_resolution;
+        const uint64_t timestamp_max_value = ~(-1L << info.kernel_timestamp_valid_bits);
+
+        auto d = (timestamp.kernelEnd >= timestamp.kernelStart) ?
+            (timestamp.kernelEnd - timestamp.kernelStart) * timestamp_freq
+            : ((timestamp_max_value - timestamp.kernelStart) + timestamp.kernelEnd + 1) * timestamp_freq;
+        return std::chrono::nanoseconds(static_cast<uint64_t>(d));
+    }
 };
 
 }  // namespace ze
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp
new file mode 100644
index 00000000000000..8a3febc28f0393
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp
@@ -0,0 +1,28 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_engine.hpp"
+#include "intel_gpu/runtime/event.hpp"
+
+namespace cldnn {
+namespace ze {
+
+// Interface for creating Level Zero events
+struct ze_base_event_factory {
+public:
+    ze_base_event_factory(const ze_engine &engine, bool enable_profiling)
+    : m_engine(engine), m_profiling_enabled(enable_profiling) {}
+    const ze_engine& get_engine() const { return m_engine; }
+    bool is_profiling_enabled() const { return m_profiling_enabled; }
+
+    virtual ~ze_base_event_factory() {}
+    virtual event::ptr create_event(uint64_t queue_stamp) = 0;
+protected:
+    const ze_engine& m_engine;
+    const bool m_profiling_enabled;
+};
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp
new file mode 100644
index 00000000000000..c7f71f00a394dc
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp
@@ -0,0 +1,72 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_cb_event.hpp"
+#include "ze/ze_common.hpp"
+
+#include <cassert>
+#include <chrono>
+#include <list>
+
+using namespace cldnn;
+using namespace ze;
+
+void ze_cb_event::wait_impl() {
+    ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout));
+}
+
+void ze_cb_event::set_impl() {
+    // Counter based events start in signaled state and can not be signaled from host
+}
+
+bool ze_cb_event::is_set_impl() {
+    auto ret = zeEventQueryStatus(m_event);
+    switch (ret) {
+    case ZE_RESULT_SUCCESS:
+        return true;
+        break;
+    case ZE_RESULT_NOT_READY:
+        return false;
+        break;
+    default:
+        OPENVINO_THROW("[GPU] Query event returned unexpected value: ", std::to_string(ret));
+        break;
+    }
+}
+
+ze_event_handle_t ze_cb_event::get_handle() const {
+    return m_event;
+}
+
+std::optional<ze_kernel_timestamp_result_t> ze_cb_event::query_timestamp() {
+    if (!m_factory.is_profiling_enabled()) {
+        return std::nullopt;
+    }
+    ze_kernel_timestamp_result_t timestamp{};
+    ZE_CHECK(zeEventQueryKernelTimestamp(m_event, &timestamp));
+    return timestamp;
+}
+
+bool ze_cb_event::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
+    auto opt_timestamp = query_timestamp();
+    if (!opt_timestamp.has_value()) {
+        return true;
+    }
+    ze_kernel_timestamp_result_t timestamp = opt_timestamp.value();
+    auto &dev_info = m_factory.get_engine().get_device_info();
+    auto wallclock_time = timestamp_to_duration(dev_info, timestamp.global);
+    auto exec_time = timestamp_to_duration(dev_info, timestamp.context);
+
+    auto period_exec = std::make_shared<instrumentation::profiling_period_basic>(timestamp_to_duration(dev_info, timestamp.context));
+    auto period_submit = std::make_shared<instrumentation::profiling_period_basic>(wallclock_time - exec_time);
+
+    info.push_back({ instrumentation::profiling_stage::executing, period_exec });
+    info.push_back({ instrumentation::profiling_stage::submission, period_submit });
+
+    return true;
+}
+
+ze_cb_event::~ze_cb_event() {
+    ZE_WARN(zeEventDestroy(m_event));
+}
\ No newline at end of file
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp
new file mode 100644
index 00000000000000..08cc5ea34b8d9c
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_base_event.hpp"
+
+namespace cldnn {
+namespace ze {
+
+
+// L0 counter based event.
+// Signaled state is inferred from the number of tasks completed on device.
+// Resetting counter based event is not allowed.
+// Signaling counter based event from host is not allowed.
+// Can only be used with in-order command lists.
+struct ze_cb_event : public ze_base_event {
+public:
+    // Take ownership of counter based event handle
+    ze_cb_event(uint64_t queue_stamp, const ze_base_event_factory& factory, ze_event_handle_t ev)
+    : ze_base_event(queue_stamp)
+    , m_factory(factory)
+    , m_event(ev) {
+        // Ensure event handle is not null
+        if (ev == nullptr) {
+            OPENVINO_THROW("[GPU] Trying to create event with null handle");
+        }
+    }
+    ze_cb_event(const ze_cb_event&) = delete;
+    ze_cb_event& operator=(const ze_cb_event&) = delete;
+    ~ze_cb_event();
+
+    void wait_impl() override;
+    void set_impl() override;
+    bool is_set_impl() override;
+    ze_event_handle_t get_handle() const override;
+    std::optional<ze_kernel_timestamp_result_t> query_timestamp() override;
+    bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
+
+protected:
+    const ze_base_event_factory& m_factory;
+    ze_event_handle_t m_event;
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp
similarity index 63%
rename from src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp
rename to src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp
index 936f25a6a70822..d42727f6584a9d 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp
@@ -2,9 +2,9 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include "ze_cb_event_manager.hpp"
+#include "ze_cb_event_factory.hpp"
 #include "ze_common.hpp"
-#include "ze_event.hpp"
+#include "ze_cb_event.hpp"
 
 #include "zex_event.h"
 
@@ -19,25 +19,20 @@ namespace {
     }
 }
 
-ze_cb_event_manager::ze_cb_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling)
-    : ze_event_manager(engine, cmd_list, enable_profiling) {
+ze_cb_event_factory::ze_cb_event_factory(const ze_engine &engine, bool enable_profiling)
+    : ze_base_event_factory(engine, enable_profiling) {
     if (func_zexCounterBasedEventCreate2 == nullptr) {
         find_function_address(engine.get_driver());
     }
 }
 
-ze_cb_event_manager::~ze_cb_event_manager() {}
-
-std::shared_ptr<ze_event> ze_cb_event_manager::create_event(uint64_t queue_stamp) {
+event::ptr ze_cb_event_factory::create_event(uint64_t queue_stamp) {
     ze_event_handle_t event;
     auto desc = defaultIntelCounterBasedEventDesc;
-    if (m_enable_profiling) {
+    if (is_profiling_enabled()) {
         desc.flags |= ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP;
     }
     ZE_CHECK(func_zexCounterBasedEventCreate2(m_engine.get_context(), m_engine.get_device(), &desc, &event));
-    return std::make_shared<ze_event>(this, event, queue_stamp);
-}
-
-void ze_cb_event_manager::destroy_event(ze_event *event) {
-    zeEventDestroy(event->get());
+    auto cb_event = std::make_shared<ze_cb_event>(queue_stamp, *this, event);
+    return cb_event;
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp
new file mode 100644
index 00000000000000..0c6bceaacbd57b
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp
@@ -0,0 +1,20 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_base_event_factory.hpp"
+
+namespace cldnn {
+namespace ze {
+
+// Interface for creating l0 counter based events
+// Should only be used with in-order queue
+struct ze_cb_event_factory : public ze_base_event_factory {
+public:
+    ze_cb_event_factory(const ze_engine &engine, bool enable_profiling);
+    event::ptr create_event(uint64_t queue_stamp) override;
+};
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp
deleted file mode 100644
index a5b43580da2b44..00000000000000
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (C) 2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include "ze_event_manager.hpp"
-
-namespace cldnn {
-namespace ze {
-
-// Interface for creating and destroying l0 counter based events
-// Should only be used with in-order queue
-struct ze_cb_event_manager : public ze_event_manager {
-public:
-    ze_cb_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling);
-    ~ze_cb_event_manager();
-    std::shared_ptr<ze_event> create_event(uint64_t queue_stamp) override;
-    void destroy_event(ze_event *event) override;
-};
-}  // namespace ze
-}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
index fc7f98810611e7..36d820139be816 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
@@ -15,6 +15,13 @@
         } \
     } while (false)
 
+#define ZE_WARN(f) \
+    do { \
+        ze_result_t res_ = (f); \
+        if (res_ != ZE_RESULT_SUCCESS) { \
+            GPU_DEBUG_COUT << ("[Warning] [GPU] " #f " command failed with code " + std::to_string(res_)); \
+        } \
+    } while (false)
 
 namespace cldnn {
 namespace ze {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index 6cef4bf50c0e5c..0300ba9ff93f5b 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -4,6 +4,7 @@
 
 #include "ze_device.hpp"
 #include "ze_common.hpp"
+#include "zex_common.h"
 
 #include <ze_api.h>
 #include <ze_intel_gpu.h>
@@ -76,6 +77,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0);
     bool supports_mutable_list = supports_extension(extensions, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, ZE_MUTABLE_COMMAND_LIST_EXP_VERSION_1_0);
     bool supports_pci_properties = supports_extension(extensions, ZE_PCI_PROPERTIES_EXT_NAME, ZE_PCI_PROPERTIES_EXT_VERSION_1_0);
+    bool supports_cb_events = supports_extension(extensions, ZEX_COUNTER_BASED_EVENT_EXT_NAME, ZEX_COUNTER_BASED_EVENT_VERSION_1_0);
     bool supports_cp_offload =
         supports_extension(extensions, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_1_0);
     bool supports_dp_properties =
@@ -175,6 +177,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.supports_intel_subgroups_char = true;
     info.supports_intel_required_subgroup_size = true;
     info.supports_cp_offload = supports_cp_offload;
+    info.supports_cb_events = supports_cb_events;
 
     info.supports_imad = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_DP4A) != 0;
     info.supports_immad = supports_dp_properties && (dp_properties.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS) != 0;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp
new file mode 100644
index 00000000000000..0d16bfeb9d2280
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_base_event.hpp"
+
+namespace cldnn {
+namespace ze {
+
+
+// Event that does not have underlying Level Zero event object.
+// It is always in signaled state.
+struct ze_empty_event : public ze_base_event {
+public:
+    ze_empty_event(uint64_t queue_stamp)
+    : ze_base_event(queue_stamp) { }
+
+    void wait_impl() override { }
+    void set_impl() override { }
+    bool is_set_impl() override { return true; }
+    ze_event_handle_t get_handle() const override { return nullptr; }
+    std::optional<ze_kernel_timestamp_result_t> query_timestamp() override { return std::nullopt; }
+    bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override {
+        return true;
+    }
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
index d7df10b74371c7..35069b0692904f 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
@@ -12,197 +12,59 @@
 using namespace cldnn;
 using namespace ze;
 
-namespace {
-
-std::chrono::nanoseconds timestamp_to_duration(const device_info& device_info, const ze_kernel_timestamp_data_t& timestamp) {
-    constexpr double NS_IN_SEC = 1000000000.0;
-    const double timestamp_freq = NS_IN_SEC / device_info.timer_resolution;
-    const uint64_t timestamp_max_value = ~(-1L << device_info.kernel_timestamp_valid_bits);
-
-    auto d = (timestamp.kernelEnd >= timestamp.kernelStart) ? (timestamp.kernelEnd - timestamp.kernelStart) * timestamp_freq
-                                                            : ((timestamp_max_value - timestamp.kernelStart) + timestamp.kernelEnd + 1) * timestamp_freq;
-
-    return std::chrono::nanoseconds(static_cast<uint64_t>(d));
+void ze_event::reset() {
+    event::reset();
+    ZE_CHECK(zeEventHostReset(m_event));
 }
 
-}  // namespace
-
 void ze_event::wait_impl() {
-    if (m_event != nullptr) {
-        ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout));
-    }
+    ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout));
 }
 
 void ze_event::set_impl() {
-    if (m_event != nullptr) {
-        if (m_event_manager != nullptr) {
-            zeCommandListAppendSignalEvent(m_event_manager->get_cmd_list(), m_event);
-        } else {
-            ZE_CHECK(zeEventHostSignal(m_event));// Does not work with cb events
-        }
-    }
+    ZE_CHECK(zeEventHostSignal(m_event));
 }
 
 bool ze_event::is_set_impl() {
-    if (m_event != nullptr) {
-        return zeEventQueryStatus(m_event) == ZE_RESULT_SUCCESS;
+    auto ret = zeEventQueryStatus(m_event);
+    switch (ret) {
+    case ZE_RESULT_SUCCESS:
+        return true;
+        break;
+    case ZE_RESULT_NOT_READY:
+        return false;
+        break;
+    default:
+        OPENVINO_THROW("[GPU] Query event returned unexpected value: ", std::to_string(ret));
+        break;
     }
-    return true;
 }
 
-bool ze_event::is_profiled() const {
-    if (m_event_manager) {
-        return m_event_manager->is_profiling_enabled();
+std::optional<ze_kernel_timestamp_result_t> ze_event::query_timestamp() {
+    if (!m_factory.is_profiling_enabled()) {
+        return std::nullopt;
     }
-    return false;
-}
-
-bool ze_event::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
-    if (!is_profiled()) {
-        return true;
-    }
-
-    const auto& engine = m_event_manager->get_engine();
-    auto device_info = engine.get_device_info();
-
     ze_kernel_timestamp_result_t timestamp{};
     ZE_CHECK(zeEventQueryKernelTimestamp(m_event, &timestamp));
-
-    auto wallclock_time = timestamp_to_duration(device_info, timestamp.global);
-    auto exec_time = timestamp_to_duration(device_info, timestamp.context);
-
-    auto period_exec = std::make_shared<instrumentation::profiling_period_basic>(timestamp_to_duration(device_info, timestamp.context));
-    auto period_submit = std::make_shared<instrumentation::profiling_period_basic>(wallclock_time - exec_time);
-
-    info.push_back({ instrumentation::profiling_stage::executing, period_exec });
-    info.push_back({ instrumentation::profiling_stage::submission, period_submit });
-
-    return true;
+    return timestamp;
 }
 
-void ze_events::wait_impl() {
-    if (_last_ze_event != nullptr) {
-        ZE_CHECK(zeEventHostSynchronize(_last_ze_event, UINT32_MAX));
-    }
+ze_event_handle_t ze_event::get_handle() const {
+    return m_event;
 }
 
-void ze_events::set_impl() {
-    wait_impl();
-}
-
-bool ze_events::is_set_impl() {
-    if (_last_ze_event != nullptr) {
-        return zeEventQueryStatus(_last_ze_event) == ZE_RESULT_SUCCESS;
-    }
-    return true;
-}
-
-bool ze_events::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
-    // The goal is to sum up all disjoint durations of its projection on the time axis
-    std::vector<ze_kernel_timestamp_data_t> all_global_timestamps;
-    std::vector<ze_kernel_timestamp_data_t> all_context_timestamps;
-
-    auto add_or_merge = [](std::vector<ze_kernel_timestamp_data_t>& all_timestamps, const ze_kernel_timestamp_data_t& ts) {
-        auto it = all_timestamps.begin();
-        bool merged = false;
-        auto target_timestamp = ts;
-        while (it != all_timestamps.end()) {
-            auto& timestamp = *it;
-            bool disjoint = timestamp.kernelEnd < target_timestamp.kernelStart || timestamp.kernelStart > target_timestamp.kernelEnd;
-            bool equal = timestamp.kernelEnd == target_timestamp.kernelEnd && timestamp.kernelStart == target_timestamp.kernelStart;
-            if (!disjoint) {
-                if (equal) {
-                    if (!merged) {
-                        merged = true;
-                        break;
-                    } else {
-                        it = all_timestamps.erase(it);
-                    }
-                } else {
-                    if (!merged) {
-                        timestamp.kernelStart = std::min(timestamp.kernelStart, target_timestamp.kernelStart);
-                        timestamp.kernelEnd = std::max(timestamp.kernelEnd, target_timestamp.kernelEnd);
-                        target_timestamp = timestamp;
-                        merged = true;
-                        it++;
-                    } else {
-                        if (timestamp.kernelEnd > target_timestamp.kernelEnd) {
-                            it--;
-                            it->kernelEnd = target_timestamp.kernelEnd;
-                            it++;
-                        }
-                        it = all_timestamps.erase(it);
-                    }
-                }
-            } else {
-                it++;
-            }
-        }
-
-        if (!merged) {
-            all_timestamps.push_back(target_timestamp);
-        }
-    };
-
-    if (_events.empty())
-        return false;
-
-    const auto& engine = downcast<ze_event>(_events.front().get())->m_event_manager->get_engine();
-    auto device_info = engine.get_device_info();
-
-    auto get_total_exec_time = [&device_info](std::vector<ze_kernel_timestamp_data_t>& all_timestamps) {
-        std::chrono::nanoseconds total_time{0};
-        for (const auto& ts : all_timestamps) {
-            total_time += timestamp_to_duration(device_info, ts);
-        }
-
-        return total_time;
-    };
-
-    // Submission time is calculated as difference between merged context and wallclock intervals
-    // May probably be more accurate if we sum all sub-intervals of wallclock timestamps not covered by execution intervals
-    using intervals_t = std::vector<ze_kernel_timestamp_data_t>;
-    auto get_submission_time = [&device_info](const intervals_t& s_timestamps,
-                                              const intervals_t& e_timestamps) {
-        auto get_minmax = [](const intervals_t& timestamps) {
-            uint64_t min_val = std::min(timestamps.begin(), timestamps.end(),
-                [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) {
-                    return lhs->kernelStart < rhs->kernelStart;
-            })->kernelStart;
-            uint64_t max_val = std::max(timestamps.begin(), timestamps.end(),
-                [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) {
-                    return lhs->kernelEnd < rhs->kernelEnd;
-            })->kernelEnd;
-
-            return ze_kernel_timestamp_data_t{min_val, max_val};
-        };
-
-        auto submission_interval = get_minmax(s_timestamps);
-        auto exec_interval = get_minmax(e_timestamps);
-
-        auto wallclock_time = timestamp_to_duration(device_info, submission_interval);
-        auto exec_time = timestamp_to_duration(device_info, exec_interval);
-
-        return wallclock_time - exec_time;
-    };
-
-    for (size_t i = 0; i < _events.size(); i++) {
-        auto be = downcast<ze_event>(_events[i].get());
-        if (!be->is_profiled()) {
-            continue;
-        }
-        ze_kernel_timestamp_result_t timestamp{};
-        ZE_CHECK(zeEventQueryKernelTimestamp(be->get(), &timestamp));
-
-        add_or_merge(all_global_timestamps, timestamp.global);
-        add_or_merge(all_context_timestamps, timestamp.context);
+bool ze_event::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
+    auto opt_timestamp = query_timestamp();
+    if (!opt_timestamp.has_value()) {
+        return true;
     }
+    ze_kernel_timestamp_result_t timestamp = opt_timestamp.value();
+    auto &dev_info = m_factory.get_engine().get_device_info();
+    auto wallclock_time = timestamp_to_duration(dev_info, timestamp.global);
+    auto exec_time = timestamp_to_duration(dev_info, timestamp.context);
 
-    auto submit_time = get_submission_time(all_global_timestamps, all_context_timestamps);
-    auto exec_time = get_total_exec_time(all_context_timestamps);
-
-    auto period_exec = std::make_shared<instrumentation::profiling_period_basic>(exec_time);
-    auto period_submit = std::make_shared<instrumentation::profiling_period_basic>(submit_time);
+    auto period_exec = std::make_shared<instrumentation::profiling_period_basic>(timestamp_to_duration(dev_info, timestamp.context));
+    auto period_submit = std::make_shared<instrumentation::profiling_period_basic>(wallclock_time - exec_time);
 
     info.push_back({ instrumentation::profiling_stage::executing, period_exec });
     info.push_back({ instrumentation::profiling_stage::submission, period_submit });
@@ -211,6 +73,5 @@ bool ze_events::get_profiling_info_impl(std::list<instrumentation::profiling_int
 }
 
 ze_event::~ze_event() {
-    if (m_event_manager != nullptr)
-        m_event_manager->destroy_event(this);
+    ZE_WARN(zeEventDestroy(m_event));
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
index b10d2808698775..4c7e02ed32ee08 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
@@ -5,92 +5,44 @@
 #pragma once
 
 #include "ze_base_event.hpp"
-#include "ze_event_manager.hpp"
 #include "ze_event_pool.hpp"
 
-#include <vector>
-#include <list>
-
 namespace cldnn {
 namespace ze {
 
+// L0 event. Can be either in signaled state or not signaled state.
 struct ze_event : public ze_base_event {
 public:
-    ze_event(ze_event_manager *ev_manager, ze_event_handle_t ev, uint64_t queue_stamp = 0, std::shared_ptr<ze_event_pool> event_pool = nullptr)
+    // Take ownership of event handle
+    ze_event(uint64_t queue_stamp, const ze_base_event_factory& factory, ze_event_handle_t ev, std::shared_ptr<ze_event_pool> event_pool)
         : ze_base_event(queue_stamp)
-        , m_event_manager(ev_manager)
         , m_event_pool(event_pool)
-        , m_event(ev) {}
-
-    ze_event_handle_t get() override { return m_event; }
-    bool is_profiled() const;
-
+        , m_factory(factory)
+        , m_event(ev) {
+            // Ensure event handle is not null
+            if (ev == nullptr) {
+                OPENVINO_THROW("[GPU] Trying to create event with null handle");
+            }
+        }
+    ze_event(const ze_event &) = delete;
+    ze_event& operator=(const ze_event &) = delete;
     ~ze_event();
+    void reset() override;
 
-private:
-    void wait_impl() override;
-    void set_impl() override;
-    bool is_set_impl() override;
+    std::optional<ze_kernel_timestamp_result_t> query_timestamp() override;
+    ze_event_handle_t get_handle() const override;
     bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
 
-    friend struct ze_events;
-
 protected:
-    ze_event_manager *m_event_manager;
-    std::shared_ptr<ze_event_pool> m_event_pool = nullptr;
-    ze_event_handle_t m_event;
-};
-
-struct ze_events : public ze_base_event {
-public:
-    ze_events(std::vector<event::ptr> const& ev)
-        : ze_base_event(0) {
-        process_events(ev);
-    }
-
-    ze_event_handle_t get() override { return _last_ze_event; }
-
-    void reset() override {
-        event::reset();
-        _events.clear();
-    }
-
-private:
     void wait_impl() override;
     void set_impl() override;
     bool is_set_impl() override;
+    // TODO: Implement add_event_handler_impl
+    // bool add_event_handler_impl(event_handler, void*) override;
 
-    void process_events(const std::vector<event::ptr>& ev) {
-        for (size_t i = 0; i < ev.size(); i++) {
-            auto multiple_events = dynamic_cast<ze_events*>(ev[i].get());
-            if (multiple_events) {
-                for (size_t j = 0; j < multiple_events->_events.size(); j++) {
-                    if (auto base_ev = dynamic_cast<ze_event*>(multiple_events->_events[j].get())) {
-                        auto current_ev_queue_stamp = base_ev->get_queue_stamp();
-                        if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) {
-                            _queue_stamp = current_ev_queue_stamp;
-                            _last_ze_event = base_ev->get();
-                        }
-                    }
-                    _events.push_back(multiple_events->_events[j]);
-                }
-            } else {
-                if (auto base_ev = dynamic_cast<ze_event*>(ev[i].get())) {
-                    auto current_ev_queue_stamp = base_ev->get_queue_stamp();
-                    if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) {
-                        _queue_stamp = current_ev_queue_stamp;
-                        _last_ze_event = base_ev->get();
-                    }
-                }
-                _events.push_back(ev[i]);
-            }
-        }
-    }
-
-    bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
-
-    ze_event_handle_t _last_ze_event;
-    std::vector<event::ptr> _events;
+    std::shared_ptr<ze_event_pool> m_event_pool;
+    const ze_base_event_factory& m_factory;
+    ze_event_handle_t m_event;
 };
 
 }  // namespace ze
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp
similarity index 52%
rename from src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp
rename to src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp
index a437eb69d54c2a..6ec7fc4e8ab3c4 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include "ze_event_pool_manager.hpp"
+#include "ze_event_factory.hpp"
 #include "ze_common.hpp"
 #include "ze_event.hpp"
 
@@ -11,18 +11,16 @@
 using namespace cldnn;
 using namespace ze;
 
-ze_event_pool_manager::ze_event_pool_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling, uint32_t capacity)
-: ze_event_manager(engine, cmd_list, enable_profiling)
+ze_event_factory::ze_event_factory(const ze_engine &engine, bool enable_profiling, uint32_t capacity)
+: ze_base_event_factory(engine, enable_profiling)
 , m_current_pool(nullptr)
 , m_capacity(capacity)
-, m_num_used(0) {}
+, m_num_used(0) { }
 
-ze_event_pool_manager::~ze_event_pool_manager() {}
-
-std::shared_ptr<ze_event> ze_event_pool_manager::create_event(uint64_t queue_stamp) {
+event::ptr ze_event_factory::create_event(uint64_t queue_stamp) {
     if (m_num_used >= m_capacity || !m_current_pool) {
         m_num_used = 0;
-        ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0;
+        ze_event_pool_flags_t flags = is_profiling_enabled() ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0;
         flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
         m_current_pool = std::make_shared<ze_event_pool>(m_engine, m_capacity, flags);
     }
@@ -37,9 +35,5 @@ std::shared_ptr<ze_event> ze_event_pool_manager::create_event(uint64_t queue_sta
     };
     ZE_CHECK(zeEventCreate(m_current_pool->m_handle, &event_desc, &event));
 
-    return std::make_shared<ze_event>(this, event, queue_stamp, m_current_pool);
-}
-
-void ze_event_pool_manager::destroy_event(ze_event *event) {
-    zeEventDestroy(event->get());
+    return std::make_shared<ze_event>(queue_stamp, *this, event, m_current_pool);
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp
new file mode 100644
index 00000000000000..9ed0244446a6ee
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_base_event_factory.hpp"
+#include "ze_event_pool.hpp"
+
+namespace cldnn {
+namespace ze {
+
+// Interface for creating l0 events using event pools
+struct ze_event_factory : public ze_base_event_factory {
+public:
+    ze_event_factory(const ze_engine &engine, bool enable_profiling, uint32_t capacity = 255);
+    event::ptr create_event(uint64_t queue_stamp) override;
+protected:
+    std::shared_ptr<ze_event_pool> m_current_pool;
+    const uint32_t m_capacity;
+    uint32_t m_num_used;
+};
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp
deleted file mode 100644
index c852fc295b885c..00000000000000
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (C) 2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <memory>
-
-#include "ze_engine.hpp"
-
-namespace cldnn {
-namespace ze {
-
-struct ze_event;
-
-// Interface for creating and destroying Level Zero events
-struct ze_event_manager {
-public:
-    using ptr = std::shared_ptr<ze_event_manager>;
-    ze_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling)
-    : m_engine(engine), m_cmd_list(cmd_list), m_enable_profiling(enable_profiling) {}
-    const ze_engine& get_engine() const { return m_engine; }
-    bool is_profiling_enabled() const { return m_enable_profiling; }
-    ze_command_list_handle_t get_cmd_list() { return m_cmd_list; }
-
-    virtual ~ze_event_manager() {}
-    virtual std::shared_ptr<ze_event> create_event(uint64_t queue_stamp) = 0;
-    virtual void destroy_event(ze_event *event) = 0;
-protected:
-    const ze_engine& m_engine;
-    ze_command_list_handle_t m_cmd_list;
-    bool m_enable_profiling;
-};
-}  // namespace ze
-}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp
index cec1448cf8be8d..98d5f485b645cb 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp
@@ -9,14 +9,13 @@
 namespace cldnn {
 namespace ze {
 
-struct ze_event;
-
-// Wrapper for ze events pool which is needed to track lifetime of the pool.
-// I.e. the object is destoyed if no ze_events alive which refer to this pool
-// and ze_events_pool doesn't refer to it as well
+// RAII wrapper for Level Zero event pool
 struct ze_event_pool {
     ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_event_pool_flags_t flags);
     ~ze_event_pool();
+    ze_event_pool(const ze_event_pool&) = delete;
+    ze_event_pool& operator=(const ze_event_pool&) = delete;
+
     using ptr = std::shared_ptr<ze_event_pool>;
 
     ze_event_pool_handle_t m_handle;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp
deleted file mode 100644
index 1cac8585e2ea19..00000000000000
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright (C) 2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include "ze_event_manager.hpp"
-#include "ze_event_pool.hpp"
-
-namespace cldnn {
-namespace ze {
-
-// Interface for creating and destroying l0 events using event pools
-struct ze_event_pool_manager : public ze_event_manager {
-public:
-    ze_event_pool_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling, uint32_t capacity = 255);
-    ~ze_event_pool_manager();
-    std::shared_ptr<ze_event> create_event(uint64_t queue_stamp) override;
-    void destroy_event(ze_event *event) override;
-protected:
-    std::shared_ptr<ze_event_pool> m_current_pool;
-    const uint32_t m_capacity;
-    uint32_t m_num_used;
-};
-}  // namespace ze
-}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
new file mode 100644
index 00000000000000..1ac2d04a69757d
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
@@ -0,0 +1,155 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ze_events.hpp"
+#include "ze_common.hpp"
+
+#include <cassert>
+#include <chrono>
+#include <list>
+
+using namespace cldnn;
+using namespace ze;
+
+void ze_events::wait_impl() {
+    if (m_last_event) {
+        ZE_CHECK(zeEventHostSynchronize(m_last_event, default_timeout));
+    }
+}
+
+void ze_events::set_impl() {
+    // Call wait_impl to be in line with ocl_events
+    wait_impl();
+}
+
+bool ze_events::is_set_impl() {
+    if (!m_last_event) {
+        return true;
+    }
+
+    auto ret = zeEventQueryStatus(m_last_event);
+    switch (ret) {
+    case ZE_RESULT_SUCCESS:
+        return true;
+        break;
+    case ZE_RESULT_NOT_READY:
+        return false;
+        break;
+    default:
+        OPENVINO_THROW("[GPU] Query event returned unexpected value: ", std::to_string(ret));
+        break;
+    }
+}
+
+bool ze_events::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
+    // The goal is to sum up all disjoint durations of its projection on the time axis
+    std::vector<ze_kernel_timestamp_data_t> all_global_timestamps;
+    std::vector<ze_kernel_timestamp_data_t> all_context_timestamps;
+
+    auto add_or_merge = [](std::vector<ze_kernel_timestamp_data_t>& all_timestamps, const ze_kernel_timestamp_data_t& ts) {
+        auto it = all_timestamps.begin();
+        bool merged = false;
+        auto target_timestamp = ts;
+        while (it != all_timestamps.end()) {
+            auto& timestamp = *it;
+            bool disjoint = timestamp.kernelEnd < target_timestamp.kernelStart || timestamp.kernelStart > target_timestamp.kernelEnd;
+            bool equal = timestamp.kernelEnd == target_timestamp.kernelEnd && timestamp.kernelStart == target_timestamp.kernelStart;
+            if (!disjoint) {
+                if (equal) {
+                    if (!merged) {
+                        merged = true;
+                        break;
+                    } else {
+                        it = all_timestamps.erase(it);
+                    }
+                } else {
+                    if (!merged) {
+                        timestamp.kernelStart = std::min(timestamp.kernelStart, target_timestamp.kernelStart);
+                        timestamp.kernelEnd = std::max(timestamp.kernelEnd, target_timestamp.kernelEnd);
+                        target_timestamp = timestamp;
+                        merged = true;
+                        it++;
+                    } else {
+                        if (timestamp.kernelEnd > target_timestamp.kernelEnd) {
+                            it--;
+                            it->kernelEnd = target_timestamp.kernelEnd;
+                            it++;
+                        }
+                        it = all_timestamps.erase(it);
+                    }
+                }
+            } else {
+                it++;
+            }
+        }
+
+        if (!merged) {
+            all_timestamps.push_back(target_timestamp);
+        }
+    };
+
+    if (m_events.empty())
+        return false;
+
+    auto device_info = m_engine.get_device_info();
+
+    auto get_total_exec_time = [&device_info](std::vector<ze_kernel_timestamp_data_t>& all_timestamps) {
+        std::chrono::nanoseconds total_time{0};
+        for (const auto& ts : all_timestamps) {
+            total_time += timestamp_to_duration(device_info, ts);
+        }
+
+        return total_time;
+    };
+
+    // Submission time is calculated as difference between merged context and wallclock intervals
+    // May probably be more accurate if we sum all sub-intervals of wallclock timestamps not covered by execution intervals
+    using intervals_t = std::vector<ze_kernel_timestamp_data_t>;
+    auto get_submission_time = [&device_info](const intervals_t& s_timestamps,
+                                              const intervals_t& e_timestamps) {
+        auto get_minmax = [](const intervals_t& timestamps) {
+            uint64_t min_val = std::min(timestamps.begin(), timestamps.end(),
+                [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) {
+                    return lhs->kernelStart < rhs->kernelStart;
+            })->kernelStart;
+            uint64_t max_val = std::max(timestamps.begin(), timestamps.end(),
+                [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) {
+                    return lhs->kernelEnd < rhs->kernelEnd;
+            })->kernelEnd;
+
+            return ze_kernel_timestamp_data_t{min_val, max_val};
+        };
+
+        auto submission_interval = get_minmax(s_timestamps);
+        auto exec_interval = get_minmax(e_timestamps);
+
+        auto wallclock_time = timestamp_to_duration(device_info, submission_interval);
+        auto exec_time = timestamp_to_duration(device_info, exec_interval);
+
+        return wallclock_time - exec_time;
+    };
+
+    for (size_t i = 0; i < m_events.size(); i++) {
+        auto be = downcast<ze_base_event>(m_events[i].get());
+        auto opt_timestamp = be->query_timestamp();
+        if (!opt_timestamp.has_value()) {
+            continue;
+        }
+        ze_kernel_timestamp_result_t timestamp = opt_timestamp.value();
+
+        add_or_merge(all_global_timestamps, timestamp.global);
+        add_or_merge(all_context_timestamps, timestamp.context);
+    }
+
+    auto submit_time = get_submission_time(all_global_timestamps, all_context_timestamps);
+    auto exec_time = get_total_exec_time(all_context_timestamps);
+
+    auto period_exec = std::make_shared<instrumentation::profiling_period_basic>(exec_time);
+    auto period_submit = std::make_shared<instrumentation::profiling_period_basic>(submit_time);
+
+    info.push_back({ instrumentation::profiling_stage::executing, period_exec });
+    info.push_back({ instrumentation::profiling_stage::submission, period_submit });
+
+    return true;
+}
\ No newline at end of file
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp
new file mode 100644
index 00000000000000..680753d6409084
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp
@@ -0,0 +1,70 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_base_event.hpp"
+#include "ze_engine.hpp"
+
+#include <vector>
+
+namespace cldnn {
+namespace ze {
+
+struct ze_events : public ze_base_event {
+public:
+    ze_events(std::vector<event::ptr> const& ev, const ze_engine &engine)
+        : ze_base_event(0)
+        , m_engine(engine) {
+        process_events(ev);
+    }
+
+    void reset() override {
+        event::reset();
+        m_events.clear();
+    }
+
+    std::optional<ze_kernel_timestamp_result_t> query_timestamp() override { return std::nullopt; }
+    ze_event_handle_t get_handle() const { return m_last_event; }
+    bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
+
+protected:
+    void wait_impl() override;
+    void set_impl() override;
+    bool is_set_impl() override;
+
+    void process_events(const std::vector<event::ptr>& ev) {
+        for (size_t i = 0; i < ev.size(); i++) {
+            auto multiple_events = dynamic_cast<ze_events*>(ev[i].get());
+            if (multiple_events) {
+                for (size_t j = 0; j < multiple_events->m_events.size(); j++) {
+                    if (auto base_ev = dynamic_cast<ze_base_event*>(multiple_events->m_events[j].get())) {
+                        auto current_ev_queue_stamp = base_ev->get_queue_stamp();
+                        if ((m_queue_stamp == 0) || (current_ev_queue_stamp > m_queue_stamp)) {
+                            m_queue_stamp = current_ev_queue_stamp;
+                            m_last_event = base_ev->get_handle();
+                        }
+                    }
+                    m_events.push_back(multiple_events->m_events[j]);
+                }
+            } else {
+                if (auto base_ev = dynamic_cast<ze_base_event*>(ev[i].get())) {
+                    auto current_ev_queue_stamp = base_ev->get_queue_stamp();
+                    if ((m_queue_stamp == 0) || (current_ev_queue_stamp > m_queue_stamp)) {
+                        m_queue_stamp = current_ev_queue_stamp;
+                        m_last_event = base_ev->get_handle();
+                    }
+                }
+                m_events.push_back(ev[i]);
+            }
+        }
+    }
+
+    ze_event_handle_t m_last_event = nullptr;
+    std::vector<event::ptr> m_events;
+    const ze_engine &m_engine;
+};
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
index 218c3390a5c566..55903ad3104b48 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
@@ -31,7 +31,7 @@ std::vector<ze_event_handle_t> get_ze_events(const std::vector<event::ptr>& even
     std::vector<ze_event_handle_t> ze_events;
     ze_events.reserve(events.size());
      for (const auto& ev : events) {
-        auto ze_event = downcast<ze::ze_base_event>(ev.get())->get();
+        auto ze_event = downcast<ze::ze_base_event>(ev.get())->get_handle();
         if (ze_event != nullptr) {
             ze_events.push_back(ze_event);
         }
@@ -142,7 +142,7 @@ void gpu_usm::unlock(const stream& /* stream */) {
 event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vector<event::ptr>& dep_events, bool blocking) {
     auto& _ze_stream = downcast<ze_stream>(stream);
     auto ev = _ze_stream.create_base_event();
-    auto ev_ze = downcast<ze::ze_base_event>(ev.get())->get();
+    auto ev_ze = downcast<ze::ze_base_event>(ev.get())->get_handle();
     std::vector<unsigned char> temp_buffer(_bytes_count, pattern);
     auto ze_dep_events = get_ze_events(dep_events);
     ZE_CHECK(zeCommandListAppendMemoryFill(_ze_stream.get_queue(),
@@ -170,7 +170,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_o
         return result_event;
 
     auto _ze_stream = downcast<ze_stream>(&stream);
-    auto _ze_event = downcast<ze_base_event>(result_event.get())->get();
+    auto _ze_event = downcast<ze_base_event>(result_event.get())->get_handle();
     auto src_ptr = reinterpret_cast<const char*>(data_ptr) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(buffer_ptr()) + dst_offset;
 
@@ -195,7 +195,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_
         return result_event;
 
     auto _ze_stream = downcast<ze_stream>(&stream);
-    auto _ze_event = downcast<ze_base_event>(result_event.get())->get();
+    auto _ze_event = downcast<ze_base_event>(result_event.get())->get_handle();
     OPENVINO_ASSERT(memory_capabilities::is_usm_type(src_mem.get_allocation_type()));
 
     auto usm_mem = downcast<const gpu_usm>(&src_mem);
@@ -222,7 +222,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s
         return result_event;
 
     auto _ze_stream = downcast<ze_stream>(&stream);
-    auto _ze_event = downcast<ze_base_event>(result_event.get())->get();
+    auto _ze_event = downcast<ze_base_event>(result_event.get())->get_handle();
     auto src_ptr = reinterpret_cast<const char*>(buffer_ptr()) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(data_ptr) + dst_offset;
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index 374f5dbb342985..76968653ea7e81 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -9,9 +9,10 @@
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/runtime/properties.hpp"
 
-#include "ze_event_manager.hpp"
-#include "ze_cb_event_manager.hpp"
-#include "ze_event_pool_manager.hpp"
+#include "ze_cb_event_factory.hpp"
+#include "ze_event_factory.hpp"
+#include "ze_events.hpp"
+#include "ze_empty_event.hpp"
 
 #include "ze_event.hpp"
 #include "ze_kernel.hpp"
@@ -204,16 +205,18 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
     command_queue_desc.ordinal = info.copy_queue_group_ordinal;
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list));
-    if (m_queue_type == QueueTypes::in_order) {
-        m_ev_manager = std::make_unique<ze_cb_event_manager>(engine, m_command_list, config.get_enable_profiling());
+    if (false/*m_queue_type == QueueTypes::in_order && info.supports_cb_events*/) {
+        m_ev_factory = std::make_unique<ze_cb_event_factory>(engine, config.get_enable_profiling());
     } else {
-        m_ev_manager = std::make_unique<ze_event_pool_manager>(engine, m_command_list, config.get_enable_profiling());
+        m_ev_factory = std::make_unique<ze_event_factory>(engine, config.get_enable_profiling());
     }
 }
 
 ze_stream::~ze_stream() {
+#ifdef ENABLE_ONEDNN_FOR_GPU
     // Destroy OneDNN stream before destroying command list
     _onednn_stream.reset();
+#endif
     if (m_command_list != nullptr)
         zeCommandListDestroy(m_command_list);
     if (m_copy_command_list != nullptr)
@@ -243,8 +246,8 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel,
     if (m_sync_method == SyncMethods::events) {
         for (auto& dep : deps) {
             if (auto ze_base_ev = std::dynamic_pointer_cast<ze_base_event>(dep)) {
-                if (ze_base_ev->get() != nullptr)
-                    dep_events.push_back(ze_base_ev->get());
+                if (ze_base_ev->get_handle() != nullptr)
+                    dep_events.push_back(ze_base_ev->get_handle());
             }
         }
         dep_events_ptr = &dep_events;
@@ -253,7 +256,7 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel,
     }
     bool set_output_event = m_sync_method == SyncMethods::events || is_output;
 
-    auto ev = set_output_event ? create_base_event() : std::make_shared<ze_event>(nullptr, nullptr, ++m_queue_counter);
+    auto ev = set_output_event ? create_base_event() : std::make_shared<ze_empty_event>(++m_queue_counter);
     auto global = to_group_count(args_desc.workGroups.global);
     auto local = to_group_count(args_desc.workGroups.local);
     ze_group_count_t args = { global.groupCountX / local.groupCountX, global.groupCountY / local.groupCountY, global.groupCountZ / local.groupCountZ };
@@ -261,7 +264,7 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel,
     ZE_CHECK(zeCommandListAppendLaunchKernel(m_command_list,
                                              kern,
                                              &args,
-                                             set_output_event ? std::dynamic_pointer_cast<ze_base_event>(ev)->get() : nullptr,
+                                             set_output_event ? std::dynamic_pointer_cast<ze_base_event>(ev)->get_handle() : nullptr,
                                              dep_events_ptr == nullptr ? 0 : static_cast<uint32_t>(dep_events_ptr->size()),
                                              dep_events_ptr == nullptr ? 0 : &dep_events_ptr->front()));
 
@@ -275,7 +278,7 @@ void ze_stream::enqueue_barrier() {
 event::ptr ze_stream::enqueue_marker(std::vector<ze_event::ptr> const& deps, bool is_output) {
     if (deps.empty()) {
         auto ev = create_base_event();
-        ZE_CHECK(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast<ze_base_event>(ev)->get(), 0, nullptr));
+        ZE_CHECK(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast<ze_base_event>(ev)->get_handle(), 0, nullptr));
         return ev;
     }
 
@@ -283,8 +286,8 @@ event::ptr ze_stream::enqueue_marker(std::vector<ze_event::ptr> const& deps, boo
         std::vector<ze_event_handle_t> dep_events;
         for (auto& dep : deps) {
             if (auto ze_base_ev = std::dynamic_pointer_cast<ze_base_event>(dep)) {
-                if (ze_base_ev->get() != nullptr)
-                    dep_events.push_back(ze_base_ev->get());
+                if (ze_base_ev->get_handle() != nullptr)
+                    dep_events.push_back(ze_base_ev->get_handle());
             }
         }
         if (dep_events.empty())
@@ -292,7 +295,7 @@ event::ptr ze_stream::enqueue_marker(std::vector<ze_event::ptr> const& deps, boo
 
         auto ev = create_base_event();
         ZE_CHECK(zeCommandListAppendBarrier(m_command_list,
-                                            std::dynamic_pointer_cast<ze_base_event>(ev)->get(),
+                                            std::dynamic_pointer_cast<ze_base_event>(ev)->get_handle(),
                                             static_cast<uint32_t>(dep_events.size()),
                                             &dep_events.front()));
         return ev;
@@ -305,8 +308,8 @@ event::ptr ze_stream::enqueue_marker(std::vector<ze_event::ptr> const& deps, boo
     }
 }
 
-ze_event::ptr ze_stream::group_events(std::vector<ze_events::ptr> const& deps) {
-    return std::make_shared<ze_events>(deps);
+ze_event::ptr ze_stream::group_events(std::vector<ze_event::ptr> const& deps) {
+    return std::make_shared<ze_events>(deps, _engine);
 }
 
 void ze_stream::wait() {
@@ -314,8 +317,7 @@ void ze_stream::wait() {
 }
 
 event::ptr ze_stream::create_user_event(bool set) {
-    // user event should use different api
-    auto ev = m_ev_manager->create_event(++m_queue_counter);
+    auto ev = m_ev_factory->create_event(++m_queue_counter);
     if (set)
         ev->set();
 
@@ -323,7 +325,7 @@ event::ptr ze_stream::create_user_event(bool set) {
 }
 
 event::ptr ze_stream::create_base_event() {
-    return m_ev_manager->create_event(++m_queue_counter);
+    return m_ev_factory->create_event(++m_queue_counter);
 }
 
 void ze_stream::flush() const {
@@ -338,7 +340,7 @@ void ze_stream::wait_for_events(const std::vector<event::ptr>& events) {
     bool needs_sync = false;
     for (auto& ev : events) {
         auto* ze_base_ev = dynamic_cast<ze_base_event*>(ev.get());
-        if (ze_base_ev->get() != nullptr) {
+        if (ze_base_ev->get_handle() != nullptr) {
             ze_base_ev->wait();
         } else {
             needs_sync = true;
@@ -366,7 +368,7 @@ void ze_stream::sync_events(std::vector<event::ptr> const& deps, bool is_output)
         if (is_output) {
             m_last_barrier_ev = std::dynamic_pointer_cast<ze_event>(create_base_event());
             m_last_barrier_ev->set_queue_stamp(m_queue_counter.load());
-            ZE_CHECK(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get(), 0, nullptr));
+            ZE_CHECK(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get_handle(), 0, nullptr));
         } else {
             ZE_CHECK(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr));
         }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
index 2888ab696cec0d..68e481c5bf3018 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
@@ -9,7 +9,7 @@
 #include "ze_common.hpp"
 #include "ze_engine.hpp"
 #include "ze_event.hpp"
-#include "ze_event_manager.hpp"
+#include "ze_base_event_factory.hpp"
 
 namespace cldnn {
 namespace ze {
@@ -28,7 +28,7 @@ class ze_stream : public stream {
         , m_queue_counter(other.m_queue_counter.load())
         , m_last_barrier(other.m_last_barrier.load())
         , m_last_barrier_ev(other.m_last_barrier_ev)
-        , m_ev_manager(other.m_ev_manager.release()) {
+        , m_ev_factory(other.m_ev_factory.release()) {
             other.m_command_list = nullptr;
             other.m_copy_command_list = nullptr;
         }
@@ -65,7 +65,7 @@ class ze_stream : public stream {
     mutable std::atomic<uint64_t> m_queue_counter{0};
     std::atomic<uint64_t> m_last_barrier{0};
     std::shared_ptr<ze_event> m_last_barrier_ev = nullptr;
-    std::unique_ptr<ze_event_manager> m_ev_manager;
+    std::unique_ptr<ze_base_event_factory> m_ev_factory;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     std::shared_ptr<dnnl::stream> _onednn_stream = nullptr;

From e8dce1572c7b5fedd5d4b9ce3f6dc5f08a82729d Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 9 Oct 2025 12:05:12 +0000
Subject: [PATCH 28/74] Enable cb events for in-order queue type

---
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index 76968653ea7e81..1194ef8565781b 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -205,7 +205,7 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
     command_queue_desc.ordinal = info.copy_queue_group_ordinal;
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list));
-    if (false/*m_queue_type == QueueTypes::in_order && info.supports_cb_events*/) {
+    if (m_queue_type == QueueTypes::in_order && info.supports_cb_events) {
         m_ev_factory = std::make_unique<ze_cb_event_factory>(engine, config.get_enable_profiling());
     } else {
         m_ev_factory = std::make_unique<ze_event_factory>(engine, config.get_enable_profiling());

From ae161777576be8bc3dc35212cc22724cbc380428 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 9 Oct 2025 15:41:15 +0000
Subject: [PATCH 29/74] Restore pooling impl and convolution ref impl

---
 src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp   | 5 ++---
 .../kernels/convolution/convolution_kernel_ref.cpp           | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp b/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp
index 81c4c27189cb83..8a308d505cf4b1 100644
--- a/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp
+++ b/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp
@@ -16,8 +16,7 @@ using namespace cldnn;
 
 const std::vector<std::shared_ptr<cldnn::ImplementationManager>>& Registry<pooling>::get_implementations() {
     static const std::vector<std::shared_ptr<ImplementationManager>> impls = {
-        //FIXME: Disable for now as there is some issue when creating OneDNN descriptor - returns unimplemented
-        /*OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::PoolingImplementationManager, shape_types::static_shape, [](const program_node& node) {
+        OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::PoolingImplementationManager, shape_types::static_shape, [](const program_node& node) {
             const auto& in_layout = node.get_input_layout(0);
             const auto& out_layout = node.get_output_layout(0);
             // Disable this case due to sporadic hang for the following case:
@@ -28,7 +27,7 @@ const std::vector<std::shared_ptr<cldnn::ImplementationManager>>& Registry<pooli
             if (in_layout.format == format::byxf && out_layout.format == format::bfyx && ov::element::Type(in_layout.data_type).is_integral_number())
                 return false;
             return true;
-        })*/
+        })
         OV_GPU_GET_INSTANCE_OCL(pooling, shape_types::static_shape)
     };
 
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp
index 3f963f6fa8c3df..319b733f016a0e 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp
@@ -109,7 +109,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_Ref::SetDefault(const conv
 }
 
 KernelsPriority ConvolutionKernel_Ref::GetKernelsPriority(const Params& /*params*/) const {
-    return FORCE_PRIORITY_1;
+    return DONT_USE_IF_HAVE_SOMETHING_ELSE;
 }
 
 bool ConvolutionKernel_Ref::Validate(const Params& params) const {

From 8a7b56659325a59cd272c27d0c862840e1b99a64 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 10 Oct 2025 10:05:22 +0000
Subject: [PATCH 30/74] Fix OneDNN include paths

---
 src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
index 0f0a77d83b24e1..0faac9003405b6 100644
--- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
@@ -170,7 +170,6 @@ if(ENABLE_ONEDNN_FOR_GPU)
                              "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/ngen"
                              "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/config"
                              "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include"
-                             "${ONEDNN_GPU_DIR}/third_party"
                              "${ONEDNN_GPU_DIR}/third_party/ngen")
         set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU
                             DNNL_GPU_RUNTIME={$DNNL_GPU_RUNTIME_VALUE}

From 0e7c3638e653e67460727e69fab5f8ef08e9f0a3 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 10 Oct 2025 13:34:28 +0000
Subject: [PATCH 31/74] Move compute runtime headers to separate target

---
 src/plugins/intel_gpu/cmake/utils.cmake                    | 2 +-
 thirdparty/level_zero/CMakeLists.txt                       | 5 ++++-
 thirdparty/level_zero/{ => compute-runtime}/ze_intel_gpu.h | 0
 thirdparty/level_zero/{ => compute-runtime}/ze_stypes.h    | 0
 thirdparty/level_zero/{ => compute-runtime}/zex_common.h   | 0
 thirdparty/level_zero/{ => compute-runtime}/zex_event.h    | 0
 6 files changed, 5 insertions(+), 2 deletions(-)
 rename thirdparty/level_zero/{ => compute-runtime}/ze_intel_gpu.h (100%)
 rename thirdparty/level_zero/{ => compute-runtime}/ze_stypes.h (100%)
 rename thirdparty/level_zero/{ => compute-runtime}/zex_common.h (100%)
 rename thirdparty/level_zero/{ => compute-runtime}/zex_event.h (100%)

diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake
index 5b61368a5d4742..e407fd9b3f04fb 100644
--- a/src/plugins/intel_gpu/cmake/utils.cmake
+++ b/src/plugins/intel_gpu/cmake/utils.cmake
@@ -5,7 +5,7 @@
 function(ov_gpu_set_runtime_interface_for TARGET_NAME)
     if(GPU_RT_TYPE STREQUAL "L0")
         target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_ZE_RT=1)
-        target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero)
+        target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero ze_compute_runtime_headers)
     elseif(GPU_RT_TYPE STREQUAL "OCL")
         target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1)
         # Do not link OpenCL as It is already linked to the targets that require it
diff --git a/thirdparty/level_zero/CMakeLists.txt b/thirdparty/level_zero/CMakeLists.txt
index 87b76c339a5233..416e7f48005400 100644
--- a/thirdparty/level_zero/CMakeLists.txt
+++ b/thirdparty/level_zero/CMakeLists.txt
@@ -30,7 +30,10 @@ endif()
 set(CMAKE_COMPILE_WARNING_AS_ERROR OFF)
 add_subdirectory(level-zero EXCLUDE_FROM_ALL)
 
-set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include ${CMAKE_CURRENT_SOURCE_DIR}>)
+set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include>)
+
+add_library(ze_compute_runtime_headers INTERFACE)
+target_include_directories(ze_compute_runtime_headers INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/compute-runtime>)
 
 # This VERSION file created by L0 may cause compilation issue of oneTBB headers, so remove it
 file(REMOVE "${CMAKE_BINARY_DIR}/VERSION")
diff --git a/thirdparty/level_zero/ze_intel_gpu.h b/thirdparty/level_zero/compute-runtime/ze_intel_gpu.h
similarity index 100%
rename from thirdparty/level_zero/ze_intel_gpu.h
rename to thirdparty/level_zero/compute-runtime/ze_intel_gpu.h
diff --git a/thirdparty/level_zero/ze_stypes.h b/thirdparty/level_zero/compute-runtime/ze_stypes.h
similarity index 100%
rename from thirdparty/level_zero/ze_stypes.h
rename to thirdparty/level_zero/compute-runtime/ze_stypes.h
diff --git a/thirdparty/level_zero/zex_common.h b/thirdparty/level_zero/compute-runtime/zex_common.h
similarity index 100%
rename from thirdparty/level_zero/zex_common.h
rename to thirdparty/level_zero/compute-runtime/zex_common.h
diff --git a/thirdparty/level_zero/zex_event.h b/thirdparty/level_zero/compute-runtime/zex_event.h
similarity index 100%
rename from thirdparty/level_zero/zex_event.h
rename to thirdparty/level_zero/compute-runtime/zex_event.h

From 7b5829fe93e64a3d89ae1c27bb74d237afc5ef1f Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Mon, 13 Oct 2025 15:27:14 +0000
Subject: [PATCH 32/74] Remove redundant DNNL macro

---
 src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
index 0faac9003405b6..d876d52f97baed 100644
--- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
@@ -160,11 +160,6 @@ if(ENABLE_ONEDNN_FOR_GPU)
                 DEPENDEES install  # Ensures this runs after install
             )
         endif()
-        if(GPU_RUNTIME STREQUAL "L0")
-            set(DNNL_GPU_RUNTIME_VALUE DNNL_RUNTIME_L0)
-        elseif(GPU_RUNTIME STREQUAL "OCL")
-            set(DNNL_GPU_RUNTIME_VALUE  DNNL_RUNTIME_OCL)
-        endif()
         set(LIB_INCLUDE_DIRS "${ONEDNN_INSTALL_DIR}/include"
                              "${ONEDNN_GPU_DIR}/src"
                              "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/ngen"
@@ -172,7 +167,6 @@ if(ENABLE_ONEDNN_FOR_GPU)
                              "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include"
                              "${ONEDNN_GPU_DIR}/third_party/ngen")
         set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU
-                            DNNL_GPU_RUNTIME={$DNNL_GPU_RUNTIME_VALUE}
                             DNNL_DLL
                             DNNL_DLL_EXPORTS
                             DNNL_ENABLE_CPU_ISA_HINTS

From debf47f265e0e44ede1b988fce55be80baeee02b Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 15 Oct 2025 11:31:26 +0000
Subject: [PATCH 33/74] Add copy offload

---
 .../include/intel_gpu/runtime/device_info.hpp        |  1 -
 src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp   |  5 -----
 src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp   |  8 ++++----
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp   | 12 ++++++++----
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp   |  4 ----
 5 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
index 48ca01cb64f1cf..bf6388e9730898 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
@@ -147,7 +147,6 @@ struct device_info {
     uint64_t timer_resolution;                  ///< [L0] Resolution of device timer used for profiling in cycles/sec
     uint32_t kernel_timestamp_valid_bits;       ///< [L0] Number of valid bits in the kernel timestamp values
     uint32_t compute_queue_group_ordinal;       ///< [L0] Ordinal of the command queue group to use for compute
-    uint32_t copy_queue_group_ordinal;          ///< [L0] Ordinal of the command queue group to use for copy
     uint32_t device_memory_ordinal;             ///< [L0] Ordinal of the selected global device memory
 
     ov::device::UUID uuid;                      ///< UUID of the gpu device
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index 0300ba9ff93f5b..d889371b8ecfb1 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -103,12 +103,8 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     auto compute_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) {
         return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0;
     });
-    auto copy_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) {
-        return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0 && (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0;
-    });
 
     OPENVINO_ASSERT(compute_queue_props != queue_properties.end());
-    OPENVINO_ASSERT(copy_queue_props != queue_properties.end());
 
     uint32_t memory_properties_count = 0;
     ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr));
@@ -200,7 +196,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.kernel_timestamp_valid_bits  = device_properties.kernelTimestampValidBits;
     info.timer_resolution  = device_properties.timerResolution;
     info.compute_queue_group_ordinal = std::distance(queue_properties.begin(), compute_queue_props);
-    info.copy_queue_group_ordinal = std::distance(queue_properties.begin(), copy_queue_props);
 
     static_assert(ZE_MAX_DEVICE_UUID_SIZE == ov::device::UUID::MAX_UUID_SIZE, "");
     static_assert(ZE_MAX_DEVICE_LUID_SIZE_EXT == ov::device::LUID::MAX_LUID_SIZE, "");
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
index 55903ad3104b48..3411c5489361a8 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
@@ -111,7 +111,7 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::re
             }
             GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl;
             _host_buffer.allocateHost(_bytes_count);
-            ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_copy_queue(),
+            ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(),
                                     _host_buffer.get(),
                                     _buffer.get(),
                                     _bytes_count,
@@ -174,7 +174,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_o
     auto src_ptr = reinterpret_cast<const char*>(data_ptr) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(buffer_ptr()) + dst_offset;
 
-    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(),
+    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
                                            dst_ptr,
                                            src_ptr,
                                            _bytes_count,
@@ -202,7 +202,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_
     auto src_ptr = reinterpret_cast<const char*>(usm_mem->buffer_ptr()) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(buffer_ptr()) + dst_offset;
 
-    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(),
+    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
                                            dst_ptr,
                                            src_ptr,
                                            _bytes_count,
@@ -226,7 +226,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s
     auto src_ptr = reinterpret_cast<const char*>(buffer_ptr()) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(data_ptr) + dst_offset;
 
-    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(),
+    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
                                            dst_ptr,
                                            src_ptr,
                                            _bytes_count,
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index 1194ef8565781b..aa3f90dbf169dd 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -202,9 +202,15 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
     command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
 
+    zex_intel_queue_copy_operations_offload_hint_exp_desc_t cp_offload_desc = {};
+    cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES;
+    cp_offload_desc.copyOffloadEnabled = true;
+    cp_offload_desc.pNext = nullptr;
+    if (info.supports_cp_offload) {
+        command_queue_desc.pNext = &cp_offload_desc;
+    }
+
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
-    command_queue_desc.ordinal = info.copy_queue_group_ordinal;
-    ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list));
     if (m_queue_type == QueueTypes::in_order && info.supports_cb_events) {
         m_ev_factory = std::make_unique<ze_cb_event_factory>(engine, config.get_enable_profiling());
     } else {
@@ -219,8 +225,6 @@ ze_stream::~ze_stream() {
 #endif
     if (m_command_list != nullptr)
         zeCommandListDestroy(m_command_list);
-    if (m_copy_command_list != nullptr)
-        zeCommandListDestroy(m_copy_command_list);
 }
 
 void ze_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
index 68e481c5bf3018..03e9c12a578a9a 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
@@ -17,20 +17,17 @@ namespace ze {
 class ze_stream : public stream {
 public:
     ze_command_list_handle_t get_queue() const { return m_command_list; }
-    ze_command_list_handle_t get_copy_queue() const { return m_copy_command_list; }
 
     ze_stream(const ze_engine& engine, const ExecutionConfig& config);
     ze_stream(ze_stream&& other)
         : stream(other.m_queue_type, other.m_sync_method)
         , _engine(other._engine)
         , m_command_list(other.m_command_list)
-        , m_copy_command_list(other.m_copy_command_list)
         , m_queue_counter(other.m_queue_counter.load())
         , m_last_barrier(other.m_last_barrier.load())
         , m_last_barrier_ev(other.m_last_barrier_ev)
         , m_ev_factory(other.m_ev_factory.release()) {
             other.m_command_list = nullptr;
-            other.m_copy_command_list = nullptr;
         }
 
     ~ze_stream();
@@ -61,7 +58,6 @@ class ze_stream : public stream {
 
     const ze_engine& _engine;
     mutable ze_command_list_handle_t m_command_list = 0;
-    mutable ze_command_list_handle_t m_copy_command_list = 0;
     mutable std::atomic<uint64_t> m_queue_counter{0};
     std::atomic<uint64_t> m_last_barrier{0};
     std::shared_ptr<ze_event> m_last_barrier_ev = nullptr;

From 33a173683e8565be5a01d4a67a74edcb60c2c26f Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 15 Oct 2025 11:49:42 +0000
Subject: [PATCH 34/74] warn if copy offload is not supported

---
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index aa3f90dbf169dd..08bfea5e15f343 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -208,6 +208,8 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     cp_offload_desc.pNext = nullptr;
     if (info.supports_cp_offload) {
         command_queue_desc.pNext = &cp_offload_desc;
+    } else {
+        GPU_DEBUG_INFO << "Copy offload hint is not supported" << std::endl;
     }
 
     ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));

From 37cfa3975aaeb7f2c15ddcd1dd72b893de4b5908 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 15 Oct 2025 13:46:34 +0000
Subject: [PATCH 35/74] Copy level zero headers to build dir

---
 src/plugins/intel_gpu/cmake/utils.cmake |  2 +-
 thirdparty/level_zero/CMakeLists.txt    | 20 +++++++++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake
index e407fd9b3f04fb..5b61368a5d4742 100644
--- a/src/plugins/intel_gpu/cmake/utils.cmake
+++ b/src/plugins/intel_gpu/cmake/utils.cmake
@@ -5,7 +5,7 @@
 function(ov_gpu_set_runtime_interface_for TARGET_NAME)
     if(GPU_RT_TYPE STREQUAL "L0")
         target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_ZE_RT=1)
-        target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero ze_compute_runtime_headers)
+        target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero)
     elseif(GPU_RT_TYPE STREQUAL "OCL")
         target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1)
         # Do not link OpenCL as It is already linked to the targets that require it
diff --git a/thirdparty/level_zero/CMakeLists.txt b/thirdparty/level_zero/CMakeLists.txt
index 416e7f48005400..0fbbd2ad303106 100644
--- a/thirdparty/level_zero/CMakeLists.txt
+++ b/thirdparty/level_zero/CMakeLists.txt
@@ -30,10 +30,24 @@ endif()
 set(CMAKE_COMPILE_WARNING_AS_ERROR OFF)
 add_subdirectory(level-zero EXCLUDE_FROM_ALL)
 
-set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include>)
+set(ZE_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/include/")
+file(GLOB_RECURSE COMPUTE_RUNTIME_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/compute-runtime/*.h")
+file(GLOB_RECURSE LEVEL_ZERO_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include")
+add_custom_command(OUTPUT "${ZE_INCLUDE_DIR}"
+    COMMAND "${CMAKE_COMMAND}" -E copy_directory "${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include" "${ZE_INCLUDE_DIR}/level_zero"
+    COMMAND "${CMAKE_COMMAND}" -E copy_directory "${CMAKE_CURRENT_SOURCE_DIR}/compute-runtime" "${ZE_INCLUDE_DIR}/level_zero"
+    DEPENDS "${COMPUTE_RUNTIME_HEADERS}" "${LEVEL_ZERO_HEADERS}"
+    COMMENT "Copying Level Zero and compute-runtime headers..."
+)
+add_custom_target(prepare_ze_headers ALL DEPENDS "${ZE_INCLUDE_DIR}")
+add_dependencies(ze_loader prepare_ze_headers)
+
+# Allow include patterns with and without level-zero/ prefix
+set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+    $<BUILD_INTERFACE:${ZE_INCLUDE_DIR}>
+    $<BUILD_INTERFACE:${ZE_INCLUDE_DIR}/level_zero>
+)
 
-add_library(ze_compute_runtime_headers INTERFACE)
-target_include_directories(ze_compute_runtime_headers INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/compute-runtime>)
 
 # This VERSION file created by L0 may cause compilation issue of oneTBB headers, so remove it
 file(REMOVE "${CMAKE_BINARY_DIR}/VERSION")

From 55cb68e936c11f2dca1a5bd9b8c4e7bfc2e40639 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 17 Oct 2025 15:34:50 +0000
Subject: [PATCH 36/74] Update l0 onednn submodule

---
 src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
index ce0e98bf72a6c7..633a03d736a265 160000
--- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
@@ -1 +1 @@
-Subproject commit ce0e98bf72a6c79642424e4d097b09a096b3b37f
+Subproject commit 633a03d736a2656eab1de8d7e7032711bdc4a30b

From 4893a284dfba619a81d6f554bf5b4484c3e5e0b3 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 17 Oct 2025 15:54:19 +0000
Subject: [PATCH 37/74] Change L0 macro check names

---
 .../intel_gpu/src/runtime/ze/ze_cb_event.cpp  |  6 ++--
 .../src/runtime/ze/ze_cb_event_factory.cpp    |  4 +--
 .../intel_gpu/src/runtime/ze/ze_common.hpp    |  6 ++--
 .../intel_gpu/src/runtime/ze/ze_device.cpp    | 30 +++++++++----------
 .../src/runtime/ze/ze_device_detector.cpp     | 16 +++++-----
 .../intel_gpu/src/runtime/ze/ze_event.cpp     | 10 +++----
 .../src/runtime/ze/ze_event_factory.cpp       |  2 +-
 .../src/runtime/ze/ze_event_pool.cpp          |  2 +-
 .../intel_gpu/src/runtime/ze/ze_events.cpp    |  2 +-
 .../intel_gpu/src/runtime/ze/ze_kernel.hpp    |  6 ++--
 .../intel_gpu/src/runtime/ze/ze_memory.cpp    | 14 ++++-----
 .../intel_gpu/src/runtime/ze/ze_memory.hpp    |  6 ++--
 .../intel_gpu/src/runtime/ze/ze_stream.cpp    | 18 +++++------
 13 files changed, 62 insertions(+), 60 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp
index c7f71f00a394dc..3746c92fe00bbc 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp
@@ -13,7 +13,7 @@ using namespace cldnn;
 using namespace ze;
 
 void ze_cb_event::wait_impl() {
-    ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout));
+    OV_ZE_EXPECT(zeEventHostSynchronize(m_event, default_timeout));
 }
 
 void ze_cb_event::set_impl() {
@@ -44,7 +44,7 @@ std::optional<ze_kernel_timestamp_result_t> ze_cb_event::query_timestamp() {
         return std::nullopt;
     }
     ze_kernel_timestamp_result_t timestamp{};
-    ZE_CHECK(zeEventQueryKernelTimestamp(m_event, &timestamp));
+    OV_ZE_EXPECT(zeEventQueryKernelTimestamp(m_event, &timestamp));
     return timestamp;
 }
 
@@ -68,5 +68,5 @@ bool ze_cb_event::get_profiling_info_impl(std::list<instrumentation::profiling_i
 }
 
 ze_cb_event::~ze_cb_event() {
-    ZE_WARN(zeEventDestroy(m_event));
+    OV_ZE_WARN(zeEventDestroy(m_event));
 }
\ No newline at end of file
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp
index d42727f6584a9d..1a7eaa4445a6fe 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp
@@ -13,7 +13,7 @@ using namespace ze;
 namespace {
     decltype(zexCounterBasedEventCreate2) *func_zexCounterBasedEventCreate2 = nullptr;
     void find_function_address(ze_driver_handle_t driver) {
-        ZE_CHECK(zeDriverGetExtensionFunctionAddress(driver,
+        OV_ZE_EXPECT(zeDriverGetExtensionFunctionAddress(driver,
                                                 "zexCounterBasedEventCreate2",
                                                 reinterpret_cast<void **>(&func_zexCounterBasedEventCreate2)));
     }
@@ -32,7 +32,7 @@ event::ptr ze_cb_event_factory::create_event(uint64_t queue_stamp) {
     if (is_profiling_enabled()) {
         desc.flags |= ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP;
     }
-    ZE_CHECK(func_zexCounterBasedEventCreate2(m_engine.get_context(), m_engine.get_device(), &desc, &event));
+    OV_ZE_EXPECT(func_zexCounterBasedEventCreate2(m_engine.get_context(), m_engine.get_device(), &desc, &event));
     auto cb_event = std::make_shared<ze_cb_event>(queue_stamp, *this, event);
     return cb_event;
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
index 36d820139be816..3961bd5776a11e 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
@@ -7,7 +7,8 @@
 
 #include <limits>
 
-#define ZE_CHECK(f) \
+// Expect success of level zero command, throw runtime error otherwise
+#define OV_ZE_EXPECT(f) \
     do { \
         ze_result_t res_ = (f); \
         if (res_ != ZE_RESULT_SUCCESS) { \
@@ -15,7 +16,8 @@
         } \
     } while (false)
 
-#define ZE_WARN(f) \
+// Prints warning if level zero command does not return success result
+#define OV_ZE_WARN(f) \
     do { \
         ze_result_t res_ = (f); \
         if (res_ != ZE_RESULT_SUCCESS) { \
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index d889371b8ecfb1..d3557f9f30a914 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -65,13 +65,13 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     device_info info;
 
     uint32_t num_ext = 0;
-    ZE_CHECK(zeDriverGetExtensionProperties(driver, &num_ext, nullptr));
+    OV_ZE_EXPECT(zeDriverGetExtensionProperties(driver, &num_ext, nullptr));
 
     std::vector<ze_driver_extension_properties_t> extensions(num_ext);
-    ZE_CHECK(zeDriverGetExtensionProperties(driver, &num_ext, &extensions[0]));
+    OV_ZE_EXPECT(zeDriverGetExtensionProperties(driver, &num_ext, &extensions[0]));
 
     ze_driver_properties_t driver_properties{ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES};
-    ZE_CHECK(zeDriverGetProperties(driver, &driver_properties));
+    OV_ZE_EXPECT(zeDriverGetProperties(driver, &driver_properties));
 
     bool supports_luid = supports_extension(extensions, ZE_DEVICE_LUID_EXT_NAME, ZE_DEVICE_LUID_EXT_VERSION_1_0);
     bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0);
@@ -85,20 +85,20 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
 
     ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, nullptr, 0};
     ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, supports_ip_version ? &ip_version_properties : nullptr};
-    ZE_CHECK(zeDeviceGetProperties(device, &device_properties));
+    OV_ZE_EXPECT(zeDeviceGetProperties(device, &device_properties));
 
     ze_device_compute_properties_t device_compute_properties{ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES};
-    ZE_CHECK(zeDeviceGetComputeProperties(device, &device_compute_properties));
+    OV_ZE_EXPECT(zeDeviceGetComputeProperties(device, &device_compute_properties));
 
     uint32_t queue_properties_count = 0;
-    ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, nullptr));
+    OV_ZE_EXPECT(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, nullptr));
 
     std::vector<ze_command_queue_group_properties_t> queue_properties(queue_properties_count);
     for (auto& mp : queue_properties) {
         mp.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
     }
 
-    ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, &queue_properties[0]));
+    OV_ZE_EXPECT(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, &queue_properties[0]));
 
     auto compute_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) {
         return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0;
@@ -107,16 +107,16 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     OPENVINO_ASSERT(compute_queue_props != queue_properties.end());
 
     uint32_t memory_properties_count = 0;
-    ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr));
+    OV_ZE_EXPECT(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr));
 
     std::vector<ze_device_memory_properties_t> device_memory_properties(memory_properties_count);
     for (auto& mp : device_memory_properties) {
         mp.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
     }
-    ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, &device_memory_properties[0]));
+    OV_ZE_EXPECT(zeDeviceGetMemoryProperties(device, &memory_properties_count, &device_memory_properties[0]));
 
     ze_device_memory_access_properties_t device_memory_access_properties{ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES};
-    ZE_CHECK(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties));
+    OV_ZE_EXPECT(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties));
 
     auto mem_properties = std::find_if(device_memory_properties.begin(), device_memory_properties.end(), [](const ze_device_memory_properties_t& p) {
         auto name = std::string(p.name);
@@ -128,10 +128,10 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     if (supports_dp_properties) {
         device_module_properties.pNext = &dp_properties;
     }
-    ZE_CHECK(zeDeviceGetModuleProperties(device, &device_module_properties));
+    OV_ZE_EXPECT(zeDeviceGetModuleProperties(device, &device_module_properties));
 
     ze_device_image_properties_t device_image_properties{ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES};
-    ZE_CHECK(zeDeviceGetImageProperties(device, &device_image_properties));
+    OV_ZE_EXPECT(zeDeviceGetImageProperties(device, &device_image_properties));
 
     info.vendor_id = device_properties.vendorId;
     info.dev_name = device_properties.name;
@@ -240,7 +240,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     // Create temporary context just for OneDNN HW detection
     ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 };
     ze_context_handle_t context;
-    ZE_CHECK(zeContextCreate(driver, &context_desc, &context));
+    OV_ZE_EXPECT(zeContextCreate(driver, &context_desc, &context));
     ngen::Product product = ngen::LevelZeroCodeGenerator<ngen::HW::Unknown>::detectHWInfo(context, device);
     zeContextDestroy(context);
     info.arch = convert_ngen_arch(ngen::getCore(product.family));
@@ -259,7 +259,7 @@ memory_capabilities init_memory_caps(ze_device_handle_t device, const device_inf
     std::vector<allocation_type> memory_caps;
 
     ze_device_memory_access_properties_t device_memory_access_properties{ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES};
-    ZE_CHECK(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties));
+    OV_ZE_EXPECT(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties));
 
     if (info.supports_usm) {
         if (device_memory_access_properties.hostAllocCapabilities) {
@@ -294,7 +294,7 @@ void ze_device::initialize() {
         return;
 
     ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 };
-    ZE_CHECK(zeContextCreate(_driver, &context_desc, &_context));
+    OV_ZE_EXPECT(zeContextCreate(_driver, &context_desc, &_context));
     _is_initialized = true;
 }
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp
index 4fede32a322a73..a40f4a7b690185 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp
@@ -16,13 +16,13 @@ namespace ze {
 
 static std::vector<ze_device_handle_t> get_sub_devices(ze_device_handle_t root_device) {
     uint32_t n_subdevices = 0;
-    ZE_CHECK(zeDeviceGetSubDevices(root_device, &n_subdevices, nullptr));
+    OV_ZE_EXPECT(zeDeviceGetSubDevices(root_device, &n_subdevices, nullptr));
     if (n_subdevices == 0)
         return {};
 
     std::vector<ze_device_handle_t> subdevices(n_subdevices);
 
-    ZE_CHECK(zeDeviceGetSubDevices(root_device, &n_subdevices, &subdevices[0]));
+    OV_ZE_EXPECT(zeDeviceGetSubDevices(root_device, &n_subdevices, &subdevices[0]));
 
     return subdevices;
 }
@@ -72,25 +72,25 @@ std::map<std::string, device::ptr> ze_device_detector::get_available_devices(voi
 std::vector<device::ptr> ze_device_detector::create_device_list(bool initialize_devices) const {
     std::vector<device::ptr> ret;
 
-    ZE_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY));
+    OV_ZE_EXPECT(zeInit(ZE_INIT_FLAG_GPU_ONLY));
 
     uint32_t driver_count = 0;
-    ZE_CHECK(zeDriverGet(&driver_count, nullptr));
+    OV_ZE_EXPECT(zeDriverGet(&driver_count, nullptr));
 
     std::vector<ze_driver_handle_t> all_drivers(driver_count);
-    ZE_CHECK(zeDriverGet(&driver_count, &all_drivers[0]));
+    OV_ZE_EXPECT(zeDriverGet(&driver_count, &all_drivers[0]));
 
     for (uint32_t i = 0; i < driver_count; ++i) {
         uint32_t device_count = 0;
-        ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, nullptr));
+        OV_ZE_EXPECT(zeDeviceGet(all_drivers[i], &device_count, nullptr));
 
         std::vector<ze_device_handle_t> all_devices(device_count);
-        ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, &all_devices[0]));
+        OV_ZE_EXPECT(zeDeviceGet(all_drivers[i], &device_count, &all_devices[0]));
 
         for (uint32_t d = 0; d < device_count; ++d) {
             try {
                 ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
-                ZE_CHECK(zeDeviceGetProperties(all_devices[d], &device_properties));
+                OV_ZE_EXPECT(zeDeviceGetProperties(all_devices[d], &device_properties));
 
                 if (ZE_DEVICE_TYPE_GPU == device_properties.type) {
                     ret.emplace_back(std::make_shared<ze_device>(all_drivers[i], all_devices[d], initialize_devices));
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
index 35069b0692904f..7d99b06aa619b6 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
@@ -14,15 +14,15 @@ using namespace ze;
 
 void ze_event::reset() {
     event::reset();
-    ZE_CHECK(zeEventHostReset(m_event));
+    OV_ZE_EXPECT(zeEventHostReset(m_event));
 }
 
 void ze_event::wait_impl() {
-    ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout));
+    OV_ZE_EXPECT(zeEventHostSynchronize(m_event, default_timeout));
 }
 
 void ze_event::set_impl() {
-    ZE_CHECK(zeEventHostSignal(m_event));
+    OV_ZE_EXPECT(zeEventHostSignal(m_event));
 }
 
 bool ze_event::is_set_impl() {
@@ -45,7 +45,7 @@ std::optional<ze_kernel_timestamp_result_t> ze_event::query_timestamp() {
         return std::nullopt;
     }
     ze_kernel_timestamp_result_t timestamp{};
-    ZE_CHECK(zeEventQueryKernelTimestamp(m_event, &timestamp));
+    OV_ZE_EXPECT(zeEventQueryKernelTimestamp(m_event, &timestamp));
     return timestamp;
 }
 
@@ -73,5 +73,5 @@ bool ze_event::get_profiling_info_impl(std::list<instrumentation::profiling_inte
 }
 
 ze_event::~ze_event() {
-    ZE_WARN(zeEventDestroy(m_event));
+    OV_ZE_WARN(zeEventDestroy(m_event));
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp
index 6ec7fc4e8ab3c4..797b8ef9679651 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp
@@ -33,7 +33,7 @@ event::ptr ze_event_factory::create_event(uint64_t queue_stamp) {
         ZE_EVENT_SCOPE_FLAG_HOST,
         0
     };
-    ZE_CHECK(zeEventCreate(m_current_pool->m_handle, &event_desc, &event));
+    OV_ZE_EXPECT(zeEventCreate(m_current_pool->m_handle, &event_desc, &event));
 
     return std::make_shared<ze_event>(queue_stamp, *this, event, m_current_pool);
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp
index c83179bf6d720d..74e040c6831dda 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp
@@ -18,7 +18,7 @@ ze_event_pool::ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_even
         capacity
     };
     auto device = engine.get_device();
-    ZE_CHECK(zeEventPoolCreate(engine.get_context(), &event_pool_desc, 1, &device, &m_handle));
+    OV_ZE_EXPECT(zeEventPoolCreate(engine.get_context(), &event_pool_desc, 1, &device, &m_handle));
 }
 
 ze_event_pool::~ze_event_pool() {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
index 1ac2d04a69757d..e54348e7b12c25 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
@@ -14,7 +14,7 @@ using namespace ze;
 
 void ze_events::wait_impl() {
     if (m_last_event) {
-        ZE_CHECK(zeEventHostSynchronize(m_last_event, default_timeout));
+        OV_ZE_EXPECT(zeEventHostSynchronize(m_last_event, default_timeout));
     }
 }
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
index 1d9118c4d12d7d..6e046ab3a61ccf 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
@@ -40,7 +40,7 @@ class ze_kernel : public kernel {
             descriptor.pNext = nullptr;
             descriptor.flags = 0;
             descriptor.pKernelName = _kernel_id.c_str();
-            ZE_CHECK(zeKernelCreate(_module, &descriptor, &cloned_handle));
+            OV_ZE_EXPECT(zeKernelCreate(_module, &descriptor, &cloned_handle));
             return std::make_shared<ze_kernel>(cloned_handle, _module, _kernel_id);
         }
     }
@@ -49,10 +49,10 @@ class ze_kernel : public kernel {
 
     std::vector<uint8_t> get_binary() const override {
         size_t binary_size = 0;
-        ZE_CHECK(zeModuleGetNativeBinary(_module, &binary_size, nullptr));
+        OV_ZE_EXPECT(zeModuleGetNativeBinary(_module, &binary_size, nullptr));
 
         std::vector<uint8_t> binary(binary_size);
-        ZE_CHECK(zeModuleGetNativeBinary(_module, &binary_size, &binary[0]));
+        OV_ZE_EXPECT(zeModuleGetNativeBinary(_module, &binary_size, &binary[0]));
 
         return binary;
     }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
index 3411c5489361a8..54482661e52705 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
@@ -44,7 +44,7 @@ std::vector<ze_event_handle_t> get_ze_events(const std::vector<event::ptr>& even
 allocation_type gpu_usm::detect_allocation_type(const ze_engine* engine, const void* mem_ptr) {
     ze_memory_allocation_properties_t props{ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES};
     ze_device_handle_t device = nullptr;
-    ZE_CHECK(zeMemGetAllocProperties(engine->get_context(), mem_ptr, &props, &device));
+    OV_ZE_EXPECT(zeMemGetAllocProperties(engine->get_context(), mem_ptr, &props, &device));
 
     switch (props.type) {
         case ZE_MEMORY_TYPE_DEVICE: return allocation_type::usm_device;
@@ -111,14 +111,14 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::re
             }
             GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl;
             _host_buffer.allocateHost(_bytes_count);
-            ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(),
+            OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(),
                                     _host_buffer.get(),
                                     _buffer.get(),
                                     _bytes_count,
                                     nullptr,
                                     0,
                                     nullptr));
-            ZE_CHECK(zeCommandListHostSynchronize(_ze_stream.get_queue(), default_timeout));
+            OV_ZE_EXPECT(zeCommandListHostSynchronize(_ze_stream.get_queue(), default_timeout));
             _mapped_ptr = _host_buffer.get();
         } else {
             _mapped_ptr = _buffer.get();
@@ -145,7 +145,7 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vecto
     auto ev_ze = downcast<ze::ze_base_event>(ev.get())->get_handle();
     std::vector<unsigned char> temp_buffer(_bytes_count, pattern);
     auto ze_dep_events = get_ze_events(dep_events);
-    ZE_CHECK(zeCommandListAppendMemoryFill(_ze_stream.get_queue(),
+    OV_ZE_EXPECT(zeCommandListAppendMemoryFill(_ze_stream.get_queue(),
         _buffer.get(),
         temp_buffer.data(),
         1,
@@ -174,7 +174,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_o
     auto src_ptr = reinterpret_cast<const char*>(data_ptr) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(buffer_ptr()) + dst_offset;
 
-    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
+    OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
                                            dst_ptr,
                                            src_ptr,
                                            _bytes_count,
@@ -202,7 +202,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_
     auto src_ptr = reinterpret_cast<const char*>(usm_mem->buffer_ptr()) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(buffer_ptr()) + dst_offset;
 
-    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
+    OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
                                            dst_ptr,
                                            src_ptr,
                                            _bytes_count,
@@ -226,7 +226,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s
     auto src_ptr = reinterpret_cast<const char*>(buffer_ptr()) + src_offset;
     auto dst_ptr = reinterpret_cast<char*>(data_ptr) + dst_offset;
 
-    ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
+    OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(),
                                            dst_ptr,
                                            src_ptr,
                                            _bytes_count,
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
index b4368dc43b34e0..9089e8a711758b 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
@@ -68,7 +68,7 @@ class UsmMemory {
         host_desc.pNext = nullptr;
 
         void* memory = nullptr;
-        ZE_CHECK(zeMemAllocHost(_context, &host_desc, size, 1, &memory));
+        OV_ZE_EXPECT(zeMemAllocHost(_context, &host_desc, size, 1, &memory));
         _allocate(memory);
     }
 
@@ -85,7 +85,7 @@ class UsmMemory {
         host_desc.pNext = nullptr;
 
         void* memory = nullptr;
-        ZE_CHECK(zeMemAllocShared(_context, &device_desc, &host_desc, size, 1, _device, &memory));
+        OV_ZE_EXPECT(zeMemAllocShared(_context, &device_desc, &host_desc, size, 1, _device, &memory));
         _allocate(memory);
     }
 
@@ -97,7 +97,7 @@ class UsmMemory {
         device_desc.pNext = nullptr;
 
         void* memory = nullptr;
-        ZE_CHECK(zeMemAllocDevice(_context, &device_desc, size, 4096, _device, &memory));
+        OV_ZE_EXPECT(zeMemAllocDevice(_context, &device_desc, size, 4096, _device, &memory));
         _allocate(memory);
     }
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index 08bfea5e15f343..a16ceb097a46e7 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -212,7 +212,7 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
         GPU_DEBUG_INFO << "Copy offload hint is not supported" << std::endl;
     }
 
-    ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
+    OV_ZE_EXPECT(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
     if (m_queue_type == QueueTypes::in_order && info.supports_cb_events) {
         m_ev_factory = std::make_unique<ze_cb_event_factory>(engine, config.get_enable_profiling());
     } else {
@@ -266,8 +266,8 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel,
     auto global = to_group_count(args_desc.workGroups.global);
     auto local = to_group_count(args_desc.workGroups.local);
     ze_group_count_t args = { global.groupCountX / local.groupCountX, global.groupCountY / local.groupCountY, global.groupCountZ / local.groupCountZ };
-    ZE_CHECK(zeKernelSetGroupSize(kern, local.groupCountX, local.groupCountY, local.groupCountZ));
-    ZE_CHECK(zeCommandListAppendLaunchKernel(m_command_list,
+    OV_ZE_EXPECT(zeKernelSetGroupSize(kern, local.groupCountX, local.groupCountY, local.groupCountZ));
+    OV_ZE_EXPECT(zeCommandListAppendLaunchKernel(m_command_list,
                                              kern,
                                              &args,
                                              set_output_event ? std::dynamic_pointer_cast<ze_base_event>(ev)->get_handle() : nullptr,
@@ -278,13 +278,13 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel,
 }
 
 void ze_stream::enqueue_barrier() {
-    ZE_CHECK(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr));
+    OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr));
 }
 
 event::ptr ze_stream::enqueue_marker(std::vector<ze_event::ptr> const& deps, bool is_output) {
     if (deps.empty()) {
         auto ev = create_base_event();
-        ZE_CHECK(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast<ze_base_event>(ev)->get_handle(), 0, nullptr));
+        OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast<ze_base_event>(ev)->get_handle(), 0, nullptr));
         return ev;
     }
 
@@ -300,7 +300,7 @@ event::ptr ze_stream::enqueue_marker(std::vector<ze_event::ptr> const& deps, boo
             return create_user_event(true);
 
         auto ev = create_base_event();
-        ZE_CHECK(zeCommandListAppendBarrier(m_command_list,
+        OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list,
                                             std::dynamic_pointer_cast<ze_base_event>(ev)->get_handle(),
                                             static_cast<uint32_t>(dep_events.size()),
                                             &dep_events.front()));
@@ -339,7 +339,7 @@ void ze_stream::flush() const {
 }
 
 void ze_stream::finish() const {
-    ZE_CHECK(zeCommandListHostSynchronize(m_command_list, default_timeout));
+    OV_ZE_EXPECT(zeCommandListHostSynchronize(m_command_list, default_timeout));
 }
 
 void ze_stream::wait_for_events(const std::vector<event::ptr>& events) {
@@ -374,9 +374,9 @@ void ze_stream::sync_events(std::vector<event::ptr> const& deps, bool is_output)
         if (is_output) {
             m_last_barrier_ev = std::dynamic_pointer_cast<ze_event>(create_base_event());
             m_last_barrier_ev->set_queue_stamp(m_queue_counter.load());
-            ZE_CHECK(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get_handle(), 0, nullptr));
+            OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get_handle(), 0, nullptr));
         } else {
-            ZE_CHECK(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr));
+            OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr));
         }
         m_last_barrier = ++m_queue_counter;
     }

From 3007f7b6f250a793983df193bbd6ba65edc0190a Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 24 Oct 2025 13:44:00 +0000
Subject: [PATCH 38/74] Add default supported simd sizes for L0

---
 .../intel_gpu/src/runtime/ze/ze_device.cpp       | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index d3557f9f30a914..37613e8280d547 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -83,8 +83,12 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     bool supports_dp_properties =
         supports_extension(extensions, ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME, ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_1_0);
 
-    ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, nullptr, 0};
-    ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, supports_ip_version ? &ip_version_properties : nullptr};
+    void *device_properties_next = nullptr;
+    ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, device_properties_next, 0};
+    if (supports_ip_version) {
+        device_properties_next = &ip_version_properties;
+    }
+    ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, device_properties_next};
     OV_ZE_EXPECT(zeDeviceGetProperties(device, &device_properties));
 
     ze_device_compute_properties_t device_compute_properties{ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES};
@@ -142,7 +146,9 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
 
     info.gpu_frequency = device_properties.coreClockRate;
 
-    info.supported_simd_sizes = {};
+    // Set SIMD values as reasonable default for most of the supported platforms
+    // Could not find how to retrieve all supported SIMD sizes from L0
+    info.supported_simd_sizes = {8, 16, 32};
     info.has_separate_cache = true;
 
     info.max_work_group_size = device_compute_properties.maxTotalGroupSize;
@@ -180,7 +186,8 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
 
     info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities;
 
-    info.gfx_ver = {0, 0, 0}; // could find how to retrieve this from L0 so far
+    // Could not find how to retrieve gfx_ver from L0
+    info.gfx_ver = {0, 0, 0};
     info.ip_version = ip_version_properties.ipVersion;
     info.sub_device_idx = (std::numeric_limits<uint32_t>::max)();
 
@@ -209,7 +216,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     }
 
     info.supports_mutable_command_list = false;
-
     if (supports_mutable_list) {
         ze_mutable_command_list_exp_properties_t mutable_list_props = { ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_LIST_EXP_PROPERTIES,  nullptr, 0, 0 };
         ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, &mutable_list_props};

From ddbe71b6b58855aac15aaf7e24d7888b5a4c5ae1 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Tue, 28 Oct 2025 14:21:51 +0000
Subject: [PATCH 39/74] Add L0 supported simd sizes query

---
 src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index 37613e8280d547..5fbbe8e4737a9b 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -146,9 +146,8 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
 
     info.gpu_frequency = device_properties.coreClockRate;
 
-    // Set SIMD values as reasonable default for most of the supported platforms
-    // Could not find how to retrieve all supported SIMD sizes from L0
-    info.supported_simd_sizes = {8, 16, 32};
+    info.supported_simd_sizes.resize(device_compute_properties.numSubGroupSizes);
+    std::copy_n(device_compute_properties.subGroupSizes, device_compute_properties.numSubGroupSizes, info.supported_simd_sizes.begin());
     info.has_separate_cache = true;
 
     info.max_work_group_size = device_compute_properties.maxTotalGroupSize;

From 7c01937934215fffb6bfa4f131c68caa5ec12f69 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 29 Oct 2025 15:36:36 +0000
Subject: [PATCH 40/74] Add L0 interface when building kernel selector

---
 src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
index 34b62d232c7e73..c5311f5be101bc 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
+++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
@@ -75,6 +75,7 @@ if(COMMAND add_cpplint_target)
 endif()
 
 target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::OpenCL openvino::runtime PRIVATE openvino::runtime::dev)
+ov_gpu_set_runtime_interface_for(${TARGET_NAME})
 
 target_include_directories(${TARGET_NAME} PRIVATE $<TARGET_PROPERTY:rapidjson,INTERFACE_INCLUDE_DIRECTORIES>)
 

From 707a23868d02f0be5750a1491798732c9abdedb0 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 30 Oct 2025 15:31:38 +0000
Subject: [PATCH 41/74] Update L0 OneDNN submodule

---
 src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
index 633a03d736a265..bbba69f0a58439 160000
--- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
@@ -1 +1 @@
-Subproject commit 633a03d736a2656eab1de8d7e7032711bdc4a30b
+Subproject commit bbba69f0a584391d0ab25b548ec76d8c62aa11f5

From 7b2d8ae8be840d07e1436c88471225e5e13e5df2 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Mon, 3 Nov 2025 16:06:08 +0000
Subject: [PATCH 42/74] L0 retrieve global cache size

---
 .../intel_gpu/src/runtime/ze/ze_device.cpp    | 28 +++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index 5fbbe8e4737a9b..ad37f85a4c440b 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -139,6 +139,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
 
     info.vendor_id = device_properties.vendorId;
     info.dev_name = device_properties.name;
+    // L0 returns drivers version in different format than OCL
     info.driver_version = std::to_string(driver_properties.driverVersion);
     info.dev_type = (device_properties.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) ? device_type::integrated_gpu : device_type::discrete_gpu;
 
@@ -152,6 +153,15 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
 
     info.max_work_group_size = device_compute_properties.maxTotalGroupSize;
     info.max_local_mem_size = device_compute_properties.maxSharedLocalMemory;
+    uint32_t cache_properties_count = 0;
+    OV_ZE_EXPECT(zeDeviceGetCacheProperties(device, &cache_properties_count, nullptr));
+    info.max_global_cache_size = 0;
+    if (cache_properties_count > 0) {
+        std::vector<ze_device_cache_properties_t> cache_properties(cache_properties_count);
+        OV_ZE_EXPECT(zeDeviceGetCacheProperties(device, &cache_properties_count, cache_properties.data()));
+        // Assume first property is L3 cache
+        info.max_global_cache_size = cache_properties[0].cacheSize;
+    }
 
     if (mem_properties != device_memory_properties.end()) {
         info.max_global_mem_size = mem_properties->totalSize;
@@ -164,7 +174,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.max_alloc_mem_size = device_properties.maxMemAllocSize;
 
     info.supports_image = device_image_properties.maxSamplers > 0;
-    info.supports_intel_planar_yuv = false;
     info.max_image2d_width = device_image_properties.maxImageDims2D;
     info.max_image2d_height = device_image_properties.maxImageDims2D;
 
@@ -172,11 +181,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.supports_fp64 = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_FP64) != 0;
     info.supports_fp16_denorms = info.supports_fp16 && (device_module_properties.fp16flags & ZE_DEVICE_FP_FLAG_DENORM) != 0;
 
-    info.supports_khr_subgroups = true;
-    info.supports_intel_subgroups = true;
-    info.supports_intel_subgroups_short = true;
-    info.supports_intel_subgroups_char = true;
-    info.supports_intel_required_subgroup_size = true;
     info.supports_cp_offload = supports_cp_offload;
     info.supports_cb_events = supports_cb_events;
 
@@ -185,8 +189,17 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
 
     info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities;
 
-    // Could not find how to retrieve gfx_ver from L0
+    // FIXME: Could not find how to retrieve those from L0
     info.gfx_ver = {0, 0, 0};
+    info.supports_work_group_collective_functions = false;
+    info.supports_intel_planar_yuv = false;
+    info.supports_khr_subgroups = true;
+    info.supports_intel_subgroups = true;
+    info.supports_intel_subgroups_short = true;
+    info.supports_intel_subgroups_char = true;
+    info.supports_intel_required_subgroup_size = true;
+    info.supports_queue_families = true;
+
     info.ip_version = ip_version_properties.ipVersion;
     info.sub_device_idx = (std::numeric_limits<uint32_t>::max)();
 
@@ -197,7 +210,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.num_threads_per_eu = device_properties.numThreadsPerEU;
 
     info.num_ccs = compute_queue_props->numQueues;
-    info.supports_queue_families = true;
 
     info.kernel_timestamp_valid_bits  = device_properties.kernelTimestampValidBits;
     info.timer_resolution  = device_properties.timerResolution;

From cba0d29068c7ed29c7421f40a70d73736c08a2d8 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Tue, 4 Nov 2025 16:33:43 +0000
Subject: [PATCH 43/74] Add L0 build kernel API wip

---
 .../include/intel_gpu/runtime/engine.hpp      |  7 +-
 .../include/intel_gpu/runtime/kernel.hpp      |  6 ++
 .../intel_gpu/src/runtime/ocl/ocl_engine.cpp  |  4 ++
 .../intel_gpu/src/runtime/ocl/ocl_engine.hpp  |  2 +-
 .../intel_gpu/src/runtime/ze/ze_common.hpp    |  5 ++
 .../intel_gpu/src/runtime/ze/ze_engine.cpp    | 67 +++++++++++++++----
 .../intel_gpu/src/runtime/ze/ze_engine.hpp    |  2 +-
 .../intel_gpu/src/runtime/ze/ze_kernel.hpp    | 39 +++++------
 .../src/runtime/ze/ze_kernel_holder.hpp       | 33 +++++++++
 .../src/runtime/ze/ze_module_holder.hpp       | 29 ++++++++
 10 files changed, 155 insertions(+), 39 deletions(-)
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
index 67648d15fe9994..657526d6174348 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -145,6 +145,9 @@ class engine {
 
     virtual allocation_type detect_usm_allocation_type(const void* memory) const = 0;
 
+    // Build kernels for current engine and append them to output vector.
+    virtual void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector<kernel::ptr> out) const = 0;
+
     void set_enable_large_allocations(bool enable_large_allocations);
 
     bool get_enable_large_allocations() const;
@@ -157,10 +160,6 @@ class engine {
     virtual dnnl::engine& get_onednn_engine() const = 0;
 #endif
 
-    /// This method is intended to create kernel handle for current engine from handle from arbitrary engine
-    /// For instance, source kernel can be compiled using ocl engine, and then we can build L0 kernel object based on that
-    virtual kernel::ptr prepare_kernel(const kernel::ptr kernel) const = 0;
-
     /// Factory method which creates engine object with impl configured by @p engine_type
     /// @param engine_type requested engine type
     /// @param runtime_type requested execution runtime for the engine. @note some runtime/engine types configurations might be unsupported
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
index 6c55df8507c812..b6a1a2dc817209 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
@@ -11,6 +11,12 @@ namespace cldnn {
 
 using kernel_id = std::string;
 
+enum class KernelFormat {
+    SOURCE,
+    IL,
+    NATIVE_BIN,
+};
+
 class kernel {
 public:
     using ptr = std::shared_ptr<kernel>;
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
index 5e7b1c63dddc86..9917ddfe6712a1 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -307,6 +307,10 @@ void* ocl_engine::get_user_context() const {
     return static_cast<void*>(cl_device.get_context().get());
 }
 
+std::vector<kernel::ptr> ocl_engine::build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options) const {
+    cl::Program()
+}
+
 kernel::ptr ocl_engine::prepare_kernel(const kernel::ptr kernel) const {
     OPENVINO_ASSERT(downcast<const ocl::ocl_kernel>(kernel.get()) != nullptr);
     return kernel;
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
index 0fa77922f03442..ca505d1ce16812 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
@@ -46,7 +46,7 @@ class ocl_engine : public engine {
     stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override;
     stream& get_service_stream() const override;
 
-    kernel::ptr prepare_kernel(const kernel::ptr kernel) const override;
+    void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector<kernel::ptr> out) const override;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     void create_onednn_engine(const ExecutionConfig& config) override;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
index 3961bd5776a11e..4ff1100a7dd9e9 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
@@ -3,9 +3,13 @@
 //
 #pragma once
 
+#include "intel_gpu/runtime/debug_configuration.hpp"
+
 #include <ze_api.h>
 
 #include <limits>
+#include <string>
+
 
 // Expect success of level zero command, throw runtime error otherwise
 #define OV_ZE_EXPECT(f) \
@@ -29,6 +33,7 @@ namespace cldnn {
 namespace ze {
 
 static constexpr uint64_t default_timeout = std::numeric_limits<uint64_t>::max();
+static constexpr ze_module_format_t ze_module_format_oclc = (ze_module_format_t) 3U;
 
 }  // namespace ze
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
index f9e1a4874324ab..48fdade2cdbab9 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -12,6 +12,9 @@
 #include "ze_memory.hpp"
 #include "ze_stream.hpp"
 #include "ze_device.hpp"
+#include "ze_kernel.hpp"
+#include "ze_module_holder.hpp"
+#include "ze_kernel_holder.hpp"
 #include <exception>
 #include <vector>
 #include <memory>
@@ -38,7 +41,7 @@ ze_module_handle_t ze_create_module_with_level_zero(const cldnn::ze::ze_engine&
 
     auto ze_device = engine.get_device();
     auto ze_ctx = engine.get_context();
-    zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr);
+    OV_ZE_EXPECT(zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr));
     return ze_module;
 }
 
@@ -219,17 +222,57 @@ bool ze_engine::is_the_same_buffer(const memory& mem1, const memory& mem2) {
     return (reinterpret_cast<const ze::gpu_usm&>(mem1).get_buffer().get() == reinterpret_cast<const ze::gpu_usm&>(mem2).get_buffer().get());
 }
 
-kernel::ptr ze_engine::prepare_kernel(const kernel::ptr kernel) const {
-    if (std::dynamic_pointer_cast<const ze_kernel>(kernel)) {
-        return kernel;
-    } else {
-        auto binary = kernel->get_binary();
-        ze_module_handle_t ze_module = ze_create_module_with_level_zero(*this, binary);
-        ze_kernel_handle_t ze_kernel;
-        auto entry_point = kernel->get_id();
-        ze_kernel_desc_t desc = {ZE_STRUCTURE_TYPE_KERNEL_DESC , nullptr, 0, entry_point.c_str()};
-        zeKernelCreate(ze_module, &desc, &ze_kernel);
-        return std::make_shared<cldnn::ze::ze_kernel>(ze_kernel, ze_module, entry_point);
+void ze_engine::build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector<kernel::ptr> out) const {
+    ze_module_desc_t module_desc = {
+        ZE_STRUCTURE_TYPE_MODULE_DESC,
+        nullptr,
+        ZE_MODULE_FORMAT_NATIVE,
+        src_bytes,
+        reinterpret_cast<const uint8_t *>(src),
+        options.c_str(),
+        nullptr // specialization constants
+    };
+    switch (src_format)
+    {
+    case KernelFormat::SOURCE:
+        module_desc.format = ze_module_format_oclc;
+        break;
+    case KernelFormat::IL:
+        module_desc.format = ZE_MODULE_FORMAT_IL_SPIRV;
+        break;
+    case KernelFormat::NATIVE_BIN:
+        module_desc.format = ZE_MODULE_FORMAT_NATIVE;
+        break;
+    default:
+        OPENVINO_THROW("[GPU] Trying to build kernel from unexpected format");
+        break;
+    }
+    ze_module_handle_t module_handle;
+    ze_module_build_log_handle_t log_handle;
+    ze_result_t build_result = zeModuleCreate(get_context(), get_device(), &module_desc, &module_handle, &log_handle);
+    if (build_result != ZE_RESULT_SUCCESS) {
+        size_t log_size = 0;
+        OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, nullptr));
+        std::string log(log_size, ' ');
+        OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, log.data()));
+        OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle));
+        OPENVINO_THROW(log);
+    }
+    auto module_holder = std::make_shared<ze_module_holder>(module_handle);
+    OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle));
+    uint32_t kernel_count = 0;
+    OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, nullptr));
+    std::vector<const char*> kernel_names(kernel_count);
+    OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, kernel_names.data()));
+    ze_kernel_flags_t flags = 0;
+    ze_kernel_desc_t kernel_desc = {
+        ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, flags, nullptr};
+    for (auto name : kernel_names) {
+        kernel_desc.pKernelName = name;
+        ze_kernel_handle_t kernel_handle;
+        OV_ZE_EXPECT(zeKernelCreate(module_handle, &kernel_desc, &kernel_handle));
+        auto kernel_holder = std::make_shared<ze_kernel_holder>(kernel_handle, module_holder);
+        out.push_back(std::make_shared<ze_kernel>(kernel_holder, std::string(name)));
     }
 }
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
index b75d2ae0ca67eb..fb86467fb9cc07 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
@@ -41,7 +41,7 @@ class ze_engine : public engine {
     stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override;
     stream& get_service_stream() const override;
 
-    kernel::ptr prepare_kernel(const kernel::ptr kernel) const override;
+    void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector<kernel::ptr> out) const override;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     void create_onednn_engine(const ExecutionConfig& config) override;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
index 6e046ab3a61ccf..61bc03b46f85e2 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
@@ -7,6 +7,7 @@
 #include "intel_gpu/runtime/kernel.hpp"
 #include "openvino/core/except.hpp"
 #include "ze_common.hpp"
+#include "ze_kernel_holder.hpp"
 
 #include <memory>
 
@@ -14,48 +15,44 @@ namespace cldnn {
 namespace ze {
 
 class ze_kernel : public kernel {
-    ze_kernel_handle_t _compiled_kernel;
-    ze_module_handle_t _module;
-    std::string _kernel_id;
-
 public:
-    ze_kernel(ze_kernel_handle_t compiled_kernel, ze_module_handle_t module, const std::string& kernel_id)
-        : _compiled_kernel(compiled_kernel)
-        , _module(module)
-        , _kernel_id(kernel_id) { }
+    ze_kernel(std::shared_ptr<ze_kernel_holder> kernel, const std::string& kernel_id)
+        : m_kernel(kernel)
+        , m_kernel_id(kernel_id) { }
 
-    ~ze_kernel() {
-        zeKernelDestroy(_compiled_kernel);
-    }
+    ze_kernel_handle_t get_kernel() { return m_kernel->get_kernel(); }
+    ze_module_handle_t get_module() { return m_kernel->get_module(); }
+    std::string get_id() const override { return m_kernel_id; }
 
-    const ze_kernel_handle_t& get_handle() const { return _compiled_kernel; }
-    ze_kernel_handle_t& get_handle() { return _compiled_kernel; }
     std::shared_ptr<kernel> clone(bool reuse_kernel_handle = false) const override {
         if (reuse_kernel_handle) {
-            return std::make_shared<ze_kernel>(_compiled_kernel, _module, _kernel_id);
+            return std::make_shared<ze_kernel>(m_kernel, m_kernel_id);
         } else {
             ze_kernel_handle_t cloned_handle;
+            ze_module_handle_t module_handle = m_kernel->get_module();
             ze_kernel_desc_t descriptor;
             descriptor.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC;
             descriptor.pNext = nullptr;
             descriptor.flags = 0;
-            descriptor.pKernelName = _kernel_id.c_str();
-            OV_ZE_EXPECT(zeKernelCreate(_module, &descriptor, &cloned_handle));
-            return std::make_shared<ze_kernel>(cloned_handle, _module, _kernel_id);
+            descriptor.pKernelName = m_kernel_id.c_str();
+            OV_ZE_EXPECT(zeKernelCreate(module_handle, &descriptor, &cloned_handle));
+            return std::make_shared<ze_kernel>(cloned_handle, module_handle, m_kernel_id);
         }
     }
 
-    std::string get_id() const override { return _kernel_id; }
-
     std::vector<uint8_t> get_binary() const override {
         size_t binary_size = 0;
-        OV_ZE_EXPECT(zeModuleGetNativeBinary(_module, &binary_size, nullptr));
+        ze_module_handle_t module_handle = m_kernel->get_module();
+        OV_ZE_EXPECT(zeModuleGetNativeBinary(module_handle, &binary_size, nullptr));
 
         std::vector<uint8_t> binary(binary_size);
-        OV_ZE_EXPECT(zeModuleGetNativeBinary(_module, &binary_size, &binary[0]));
+        OV_ZE_EXPECT(zeModuleGetNativeBinary(module_handle, &binary_size, binary.data()));
 
         return binary;
     }
+private:
+    std::shared_ptr<ze_kernel_holder> m_kernel;
+    std::string m_kernel_id;
 };
 
 }  // namespace ze
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp
new file mode 100644
index 00000000000000..72c98c4b9283e7
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp
@@ -0,0 +1,33 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_common.hpp"
+#include "ze_module_holder.hpp"
+
+#include <memory>
+
+namespace cldnn {
+namespace ze {
+
+// RAII wrapper for Level Zero kernel
+class ze_kernel_holder {
+public:
+    // Take ownership of existing kernel handle
+    explicit ze_kernel_holder(ze_kernel_handle_t kernel, std::shared_ptr<ze_module_holder> module) : m_kernel(kernel), m_module(module) {}
+    ze_kernel_holder(const ze_kernel_holder& other) = delete;
+    ze_kernel_holder& operator=(const ze_kernel_holder& other) = delete;
+    ~ze_kernel_holder() {
+        OV_ZE_WARN(zeKernelDestroy(m_kernel));
+    }
+    ze_kernel_handle_t get_kernel() { return m_kernel; }
+    ze_module_handle_t get_module() { return m_module->get_module(); }
+private:
+    ze_kernel_handle_t m_kernel;
+    std::shared_ptr<ze_module_holder> m_module;
+}
+
+}  // namespace ze
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp
new file mode 100644
index 00000000000000..9288584fc5c681
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ze_common.hpp"
+
+namespace cldnn {
+namespace ze {
+
+// RAII wrapper for Level Zero module
+class ze_module_holder {
+public:
+    // Take ownership of existing module handle
+    explicit ze_module_holder(ze_module_handle_t handle) : m_handle(handle) {}
+
+    ze_module_holder(const ze_module_holder& other) = delete;
+    ze_module_holder& operator=(const ze_module_holder& other) = delete;
+    ~ze_module_holder() {
+        OV_ZE_WARN(zeModuleDestroy(m_handle));
+    }
+    ze_module_handle_t get_module() { return m_handle; }
+private:
+    ze_module_handle_t m_handle;
+
+};
+}  // namespace ze
+}  // namespace cldnn

From 2f855dafa4a69027b23446c70ec5e8f19e784541 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 6 Nov 2025 16:29:30 +0000
Subject: [PATCH 44/74] Add L0 kernel build logic

---
 .../util/include/openvino/util/file_util.hpp  |   2 +-
 src/common/util/src/file_util.cpp             |   2 +-
 .../include/intel_gpu/runtime/engine.hpp      |   6 +-
 .../include/intel_gpu/runtime/file_util.hpp   |   2 +-
 .../include/intel_gpu/runtime/kernel.hpp      |   8 +-
 .../intel_gpu/runtime/kernel_builder.hpp      |  25 ++
 .../src/graph/impls/ocl/kernels_cache.cpp     | 292 +++++-------------
 .../src/graph/impls/ocl/kernels_cache.hpp     |   4 +-
 .../graph/impls/ocl/multi_stage_primitive.hpp |   3 +-
 .../src/graph/impls/ocl/primitive_base.hpp    |   3 +-
 .../intel_gpu/src/runtime/file_util.cpp       |   2 +-
 .../intel_gpu/src/runtime/ocl/ocl_engine.cpp  |  14 +-
 .../intel_gpu/src/runtime/ocl/ocl_engine.hpp  |   2 +-
 .../intel_gpu/src/runtime/ocl/ocl_kernel.cpp  |  11 +
 .../intel_gpu/src/runtime/ocl/ocl_kernel.hpp  |   1 +
 .../src/runtime/ocl/ocl_kernel_builder.hpp    |  78 +++++
 .../intel_gpu/src/runtime/ze/ze_engine.cpp    |  79 +----
 .../intel_gpu/src/runtime/ze/ze_engine.hpp    |   2 +-
 .../intel_gpu/src/runtime/ze/ze_kernel.hpp    |  43 ++-
 .../src/runtime/ze/ze_kernel_builder.hpp      |  67 ++++
 .../src/runtime/ze/ze_kernel_holder.hpp       |   7 +-
 .../src/runtime/ze/ze_module_holder.hpp       |  17 +-
 .../intel_gpu/src/runtime/ze/ze_stream.cpp    |   4 +-
 23 files changed, 331 insertions(+), 343 deletions(-)
 create mode 100644 src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp
 create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp

diff --git a/src/common/util/include/openvino/util/file_util.hpp b/src/common/util/include/openvino/util/file_util.hpp
index 6eee9c3da0c102..88407538a36cc8 100644
--- a/src/common/util/include/openvino/util/file_util.hpp
+++ b/src/common/util/include/openvino/util/file_util.hpp
@@ -316,7 +316,7 @@ std::vector<uint8_t> load_binary(const std::string& path);
  * @brief save binary data to file
  * @param path - binary file path to store
  */
-void save_binary(const std::string& path, std::vector<uint8_t> binary);
+void save_binary(const std::string& path, const std::vector<uint8_t> &binary);
 void save_binary(const std::string& path, const char* binary, size_t bin_size);
 
 /**
diff --git a/src/common/util/src/file_util.cpp b/src/common/util/src/file_util.cpp
index 5436f26f03014a..db39f3a61749d2 100644
--- a/src/common/util/src/file_util.cpp
+++ b/src/common/util/src/file_util.cpp
@@ -515,7 +515,7 @@ std::vector<uint8_t> ov::util::load_binary(const std::string& path) {
     return {};
 }
 
-void ov::util::save_binary(const std::string& path, std::vector<uint8_t> binary) {
+void ov::util::save_binary(const std::string& path, const std::vector<uint8_t> &binary) {
     save_binary(path, reinterpret_cast<const char*>(&binary[0]), binary.size());
     return;
 }
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
index 657526d6174348..1d99a1ae430f80 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -12,6 +12,7 @@
 #include "layout.hpp"
 #include "execution_config.hpp"
 #include "engine_configuration.hpp"
+#include "kernel_builder.hpp"
 
 #include <memory>
 #include <set>
@@ -143,10 +144,9 @@ class engine {
     /// Returns service stream which can be used during program build and optimizations
     virtual stream& get_service_stream() const = 0;
 
-    virtual allocation_type detect_usm_allocation_type(const void* memory) const = 0;
+    virtual std::shared_ptr<kernel_builder> create_kernel_builder() const = 0;
 
-    // Build kernels for current engine and append them to output vector.
-    virtual void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector<kernel::ptr> out) const = 0;
+    virtual allocation_type detect_usm_allocation_type(const void* memory) const = 0;
 
     void set_enable_large_allocations(bool enable_large_allocations);
 
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/file_util.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/file_util.hpp
index 934d9deecd2bd0..91fc07c2ab2555 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/file_util.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/file_util.hpp
@@ -12,6 +12,6 @@
 namespace ov::intel_gpu {
 
 // Version of save_binary that don't trow an exception if attempt to open file fails
-void save_binary(const std::string& path, std::vector<uint8_t> binary);
+void save_binary(const std::string& path, const std::vector<uint8_t> &binary);
 
 }  // namespace ov::intel_gpu
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
index b6a1a2dc817209..54bece374b99f2 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
@@ -6,17 +6,12 @@
 
 #include <memory>
 #include <vector>
+#include <string>
 
 namespace cldnn {
 
 using kernel_id = std::string;
 
-enum class KernelFormat {
-    SOURCE,
-    IL,
-    NATIVE_BIN,
-};
-
 class kernel {
 public:
     using ptr = std::shared_ptr<kernel>;
@@ -25,6 +20,7 @@ class kernel {
 
     virtual std::string get_id() const = 0;
     virtual std::vector<uint8_t> get_binary() const = 0;
+    virtual std::string get_build_log() const = 0;
 };
 
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp
new file mode 100644
index 00000000000000..90c17a77be03cf
--- /dev/null
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp
@@ -0,0 +1,25 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "kernel.hpp"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace cldnn {
+
+enum class KernelFormat {
+    SOURCE,
+    NATIVE_BIN,
+};
+
+class kernel_builder {
+public:
+    virtual void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector<kernel::ptr> &out) const = 0;
+};
+
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
index fc282231b22e28..94e4df64ea8090 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
@@ -20,10 +20,6 @@
 #include "intel_gpu/runtime/itt.hpp"
 #include "intel_gpu/runtime/file_util.hpp"
 
-#include "ocl/ocl_kernel.hpp"
-#include "ocl/ocl_common.hpp"
-#include "ocl/ocl_device.hpp"
-
 #ifdef WIN32
 #include <sdkddkver.h>
 #ifdef NTDDI_WIN10_RS5
@@ -54,41 +50,18 @@
 namespace {
 std::mutex cacheAccessMutex;
 
-static const cldnn::device::ptr get_target_device(const cldnn::engine& engine) {
-    using namespace cldnn;
-    if (engine.runtime_type() == runtime_types::ocl) {
-        return engine.get_device();
-    } else {
-        ocl::ocl_device_detector detector;
-        auto device_map = detector.get_available_devices(nullptr, nullptr);
-        auto original_device = engine.get_device();
-
-        for (auto& d : device_map) {
-            const auto& target_uuid = d.second->get_info().uuid;
-            const auto& original_uuid = original_device->get_info().uuid;
-            if (target_uuid.uuid == original_uuid.uuid)
-                return d.second;
-        }
+std::string join_strings(const std::vector<std::string> strings) {
+    size_t total_size = 0;
+    for (auto &str : strings) {
+        total_size += str.size();
     }
-
-    OPENVINO_THROW("[GPU] Couldn't find target device for kernels cache");
-}
-
-#ifdef ENABLE_ONEDNN_FOR_GPU
-cl::Program fuse_microkernels(const cl::Context& context, const cl::Device& device, cl::Program& program, const std::string& code) {
-    using namespace dnnl::impl::gpu::intel;
-    std::vector<std::vector<uint8_t>> binaries = program.getInfo<CL_PROGRAM_BINARIES>();
-    OPENVINO_ASSERT(binaries.size() == 1);
-    std::vector<uint8_t> binary = binaries[0];
-    micro::fuseMicrokernels(binary, code.c_str());
-
-    cl::Program::Binaries fused_binary = { binary };
-    cl::Program fused_program(context, {device}, fused_binary);
-    fused_program.build({device});
-
-    return fused_program;
+    std::string acc_str;
+    acc_str.reserve(total_size);
+    for (auto &str : strings) {
+        acc_str.append(str);
+    }
+    return acc_str;
 }
-#endif  // ENABLE_ONEDNN_FOR_GPU
 
 std::string reorder_options(const std::string& org_options) {
     std::stringstream ss(org_options);
@@ -307,42 +280,21 @@ kernels_cache::kernels_cache(engine& engine,
                              uint32_t prog_id,
                              std::shared_ptr<ov::threading::ITaskExecutor> task_executor,
                              const std::map<std::string, std::string>& batch_headers)
-    : _device(get_target_device(engine))
+    : _device(engine.get_device())
+    , _builder(engine.create_kernel_builder())
     , _task_executor(task_executor)
     , _config(config)
     , _prog_id(prog_id)
     , batch_headers(std::move(batch_headers)) { }
 
-static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
-    // Get the size of the program binary in bytes.
-    std::vector<size_t> binary_sizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
-
-    if (binary_sizes.size() != 1)
-        throw std::runtime_error("Invalid binaries count");
-
-    size_t binary_size = binary_sizes.front();
-    // Binary is not available for the device.
-    if (binary_size == 0)
-        throw std::runtime_error("Binary is not avaliable after program build");
-
-    // Get program binary.
-    return program.getInfo<CL_PROGRAM_BINARIES>().front();
-}
-
-// TODO: This build_batch method should be backend specific
 void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& compiled_kernels) {
     OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::build_batch");
-
-    auto& cl_build_device = dynamic_cast<const ocl::ocl_device&>(*_device);
-
     bool dump_sources = batch.dump_custom_program;
     std::string dump_sources_dir = GPU_DEBUG_VALUE_OR(_config.get_dump_sources_path(), "");
     GPU_DEBUG_IF(!dump_sources_dir.empty()) {
         dump_sources = true;
     }
 
-    std::string err_log;  // accumulated build log from all program's parts (only contains messages from parts which
-
     std::string current_dump_file_name = "";
     if (dump_sources) {
         current_dump_file_name = std::move(dump_sources_dir);
@@ -361,128 +313,71 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co
                 dump_file << s;
         }
     }
-
     std::string cached_bin_name = get_cache_path() + std::to_string(batch.hash_value) + ".cl_cache";
-    cl::Program::Binaries precompiled_kernels = {};
-
+    ///////////////////////////////////////////////////////////////////////////////////
+    std::vector<uint8_t> precompiled;
     if (is_cache_enabled()) {
-        // Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket
-        // If read is successful, then remove kernels from compilation bucket
-        std::vector<uint8_t> bin;
-        {
-            std::lock_guard<std::mutex> lock(cacheAccessMutex);
-            bin = ov::util::load_binary(cached_bin_name);
-        }
-        if (!bin.empty()) {
-            precompiled_kernels.push_back(bin);
-        }
+        std::lock_guard<std::mutex> lock(cacheAccessMutex);
+        precompiled = ov::util::load_binary(cached_bin_name);
     }
-    try {
-        cl::vector<cl::Kernel> kernels;
-
-        // Run compilation
-        if (precompiled_kernels.empty()) {
-            cl::Program program(cl_build_device.get_context(), batch.source);
-            {
-                OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildProgram::RunCompilation");
-                if (program.build({cl_build_device.get_device()}, batch.options.c_str()) != CL_SUCCESS)
-                    throw std::runtime_error("Failed in building program.");
-            }
-
-            if (dump_sources && dump_file.good()) {
-                dump_file << "\n/* Build Log:\n";
-                for (auto& p : program.getBuildInfo<CL_PROGRAM_BUILD_LOG>())
-                    dump_file << p.second << "\n";
-
-                dump_file << "*/\n";
+    std::vector<kernel::ptr> kernels;
+    if (!precompiled.empty()) {
+        _builder->build_kernels(precompiled.data(), precompiled.size(), KernelFormat::NATIVE_BIN, "", kernels);
+    } else {
+        auto combined_source = join_strings(batch.source);
+        _builder->build_kernels(combined_source.data(), combined_source.size(), KernelFormat::SOURCE, batch.options, kernels);
+        if (dump_sources && dump_file.good()) {
+            dump_file << "\n/* Build Log:\n";
+            // Retreive build log from the first kernel only
+            // It should be the same for all kernels in batch
+            if (kernels.size() > 1) {
+                dump_file << kernels[0]->get_build_log();
             }
-
-            if (batch.has_microkernels) {
+            dump_file << "\n*/\n";
+        }
+        if (batch.has_microkernels) {
 #ifdef ENABLE_ONEDNN_FOR_GPU
-                OPENVINO_ASSERT(batch.kernels_counter == 1);
-                // Do we need full source code here (with batch headers)?
-                program = fuse_microkernels(cl_build_device.get_context(), cl_build_device.get_device(), program, batch.source.back());
+            using namespace dnnl::impl::gpu::intel;
+            OPENVINO_ASSERT(batch.kernels_counter == 1 && kernels.size() == 1);
+            std::vector<uint8_t> binary = kernels[0]->get_binary();
+            kernels.clear();
+            // Update binary and rebuild kernel
+            micro::fuseMicrokernels(binary, combined_source.c_str());
+            _builder->build_kernels(binary.data(), binary.size(), KernelFormat::NATIVE_BIN, "", kernels);
 #else  // ENABLE_ONEDNN_FOR_GPU
-                OPENVINO_THROW("[GPU] Can't compile kernel w/ microkernels as onednn is not available");
+            OPENVINO_THROW("[GPU] Can't compile kernel w/ microkernels as onednn is not available");
 #endif  // ENABLE_ONEDNN_FOR_GPU
-            }
-
-
-            program.createKernels(&kernels);
-
-            if (is_cache_enabled()) {
+        }
+        if (is_cache_enabled()) {
                 // If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache
                 // Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited
                 // Bucket size can be changed by max_kernels_per_batch config option, but forcing it to 1 will lead to much longer
                 // compile time.
+                std::vector<uint8_t> binary = kernels[0]->get_binary();
                 std::lock_guard<std::mutex> lock(cacheAccessMutex);
-                ov::intel_gpu::save_binary(cached_bin_name, getProgramBinaries(std::move(program)));
-            }
-        } else {
-            cl::Program program(cl_build_device.get_context(), {cl_build_device.get_device()}, precompiled_kernels);
-            if (program.build({cl_build_device.get_device()}, batch.options.c_str()) != CL_SUCCESS)
-                throw std::runtime_error("Failed in building program with a precompiled kernel.");
-
-            program.createKernels(&kernels);
+                ov::intel_gpu::save_binary(cached_bin_name, binary);
         }
-
-        {
-            std::lock_guard<std::mutex> lock(_mutex);
-            for (auto& k : kernels) {
-                const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
-                const auto& iter = batch.entry_point_to_id.find(entry_point);
-                if (iter != batch.entry_point_to_id.end()) {
-                    kernel::ptr kernel = std::make_shared<ocl::ocl_kernel>(ocl::ocl_kernel_type(k, cl_build_device.get_usm_helper()), entry_point);
-
-                    auto& params = iter->second.first;
-                    auto kernel_part_idx = iter->second.second;
-                    if (compiled_kernels.find(params) != compiled_kernels.end()) {
-                        compiled_kernels[params].push_back(std::make_pair(kernel, kernel_part_idx));
-                    } else {
-                        compiled_kernels[params] = { std::make_pair(kernel, kernel_part_idx) };
-                    }
-                    if (_kernel_batch_hash.find(params) == _kernel_batch_hash.end()) {
-                       _kernel_batch_hash[params] = batch.hash_value;
-                    }
+    }
+    {
+        std::lock_guard<std::mutex> lock(_mutex);
+        for (auto& k : kernels) {
+            auto entry_point = k->get_id();
+            const auto& iter = batch.entry_point_to_id.find(entry_point);
+            if (iter != batch.entry_point_to_id.end()) {
+                auto& params = iter->second.first;
+                auto kernel_part_idx = iter->second.second;
+                if (compiled_kernels.find(params) != compiled_kernels.end()) {
+                    compiled_kernels[params].push_back(std::make_pair(k, kernel_part_idx));
                 } else {
-                    throw std::runtime_error("Could not find entry point");
+                    compiled_kernels[params] = { std::make_pair(k, kernel_part_idx) };
                 }
+                if (_kernel_batch_hash.find(params) == _kernel_batch_hash.end()) {
+                   _kernel_batch_hash[params] = batch.hash_value;
+                }
+            } else {
+                throw std::runtime_error("Could not find entry point");
             }
         }
-    } catch (const cl::BuildError& err) {
-        if (dump_sources && dump_file.good())
-            dump_file << "\n/* Build Log:\n";
-
-        for (auto& p : err.getBuildLog()) {
-            if (dump_sources && dump_file.good())
-                dump_file << p.second << "\n";
-            err_log += p.second + '\n';
-        }
-        if (dump_sources && dump_file.good())
-            dump_file << "*/\n";
-    }
-    if (!err_log.empty()) {
-        GPU_DEBUG_INFO << "-------- OpenCL build error" << std::endl;
-        GPU_DEBUG_INFO << err_log << std::endl;
-        GPU_DEBUG_INFO << "-------- End of OpenCL build error" << std::endl;
-        std::stringstream err_ss(err_log);
-        std::string line;
-        std::stringstream err;
-        int cnt = 0;
-
-        while (std::getline(err_ss, line, '\n')) {
-            if (line.find("error") != std::string::npos)
-                cnt = 5;
-            cnt--;
-            if (cnt > 0)
-                err << line << std::endl;
-            else if (cnt == 0)
-                err << "...." << std::endl;
-        }
-
-        throw std::runtime_error("Program build failed(" + std::to_string(batch.bucket_id) + + "_part_"
-                                 + std::to_string(batch.batch_id)
-                                 + "):\n" + err.str());
     }
 }
 
@@ -490,7 +385,7 @@ kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id, const
     auto res = _cached_kernels.find(id);
     OPENVINO_ASSERT(_cached_kernels.end() != res, "[GPU] Kernel " + id + " not found in the cached kernel cache!");
 
-    return e.prepare_kernel(res->second->clone(_reuse_kernels));
+    return res->second->clone(_reuse_kernels);
 }
 
 std::vector<kernel::ptr> kernels_cache::get_kernels(const kernel_impl_params& params) const {
@@ -504,53 +399,15 @@ std::vector<kernel::ptr> kernels_cache::get_kernels(const kernel_impl_params& pa
     OPENVINO_ASSERT(_kernels.end() != res, "Kernel for {" + current_node_id + "} is not found in the kernel cache!");
     OPENVINO_ASSERT(res->second.size() != 0, "Number of kernels should not be zero for " + current_node_id);
 
-    auto& engine = params.get_program().get_engine();
-
     std::vector<kernel::ptr> kernels(res->second.size());
     for (auto& k : res->second) {
         auto& kernel_ptr = k.first;
         auto kernel_part_idx = k.second;
-        kernels[kernel_part_idx] = engine.prepare_kernel(kernel_ptr->clone(_reuse_kernels));
+        kernels[kernel_part_idx] = kernel_ptr->clone(_reuse_kernels);
     }
     return kernels;
 }
 
-bool kernels_cache::validate_simple_kernel_execution(kernel::ptr krl) {
-    auto casted = downcast<ocl::ocl_kernel>(krl.get());
-    auto kernel = casted->get_handle();
-    try {
-        auto casted_dev = dynamic_cast<ocl::ocl_device*>(_device.get());
-        OPENVINO_ASSERT(casted_dev != nullptr, "device is nullptr");
-
-        auto device = casted_dev->get_device();
-        cl::Context ctx(device);
-
-        cl::Buffer buffer(ctx, CL_MEM_READ_WRITE, sizeof(uint8_t) * 8);
-        if (kernel.setArg(0, buffer) != CL_SUCCESS)
-            return false;
-
-        cl::Event ev;
-        cl::CommandQueue queue(ctx, device);
-        if (queue.enqueueNDRangeKernel(kernel, cl::NDRange(), cl::NDRange(8), cl::NDRange(8), nullptr, &ev) != CL_SUCCESS)
-            return false;
-
-        uint8_t result[8];
-        uint8_t expected[8] = { 1, 3, 5, 7, 9, 11, 13, 15 };
-        if (queue.enqueueReadBuffer(buffer, CL_TRUE, 0, sizeof(uint8_t) * 8, &result) != CL_SUCCESS)
-            return false;
-
-        for (int i = 0; i < 8; ++i) {
-            if (result[i] != expected[i])
-                return false;
-        }
-
-        ev.wait();
-        return true;
-    } catch (...) {
-        return false;
-    }
-}
-
 void kernels_cache::build_all() {
     OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildAll");
     if (!_pending_compilation)
@@ -693,7 +550,7 @@ void kernels_cache::save(BinaryOutputBuffer& ob) const {
         ob << cached_binary.first;
         ob << is_zebin_binary;
         if (!is_zebin_binary) {
-            auto driver_version = downcast<ocl::ocl_device>(*_device).get_info().driver_version;
+            auto driver_version = _device->get_info().driver_version;
             ob << driver_version;
         }
     }
@@ -702,8 +559,6 @@ void kernels_cache::save(BinaryOutputBuffer& ob) const {
 void kernels_cache::load(BinaryInputBuffer& ib) {
     std::unordered_map<uint32_t, std::vector<unsigned char>> precompiled_kernels;
 
-    const auto& build_device = downcast<ocl::ocl_device>(*_device);
-
     size_t num_cached_binaries;
     ib >> num_cached_binaries;
     for (size_t i = 0; i < num_cached_binaries; ++i) {
@@ -717,7 +572,7 @@ void kernels_cache::load(BinaryInputBuffer& ib) {
             // Legacy patchtoken path
             std::string driver_version, current_driver_version;
             ib >> driver_version;
-            current_driver_version = build_device.get_info().driver_version;
+            current_driver_version = _device->get_info().driver_version;
 
             if (driver_version != current_driver_version) {
                 OPENVINO_THROW("Driver version mismatch in cached patchtoken kernels");
@@ -725,31 +580,22 @@ void kernels_cache::load(BinaryInputBuffer& ib) {
         }
     }
 
-    try {
+    {
         std::lock_guard<std::mutex> lock(_mutex);
         _cached_kernels.clear();
 
         for (auto& precompiled_kernel : precompiled_kernels) {
-            cl::vector<cl::Kernel> kernels;
-            cl::Program program(build_device.get_context(), {build_device.get_device()}, {precompiled_kernel.second});
-            program.build({build_device.get_device()});
-            program.createKernels(&kernels);
-
+            std::vector<kernel::ptr> kernels;
+            _builder->build_kernels(precompiled_kernel.second.data(), precompiled_kernel.second.size(), KernelFormat::NATIVE_BIN, "", kernels);
             for (auto& k : kernels) {
-                const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
+                const auto& entry_point = k->get_id();
                 std::string cached_kernel_id = entry_point + "@" + std::to_string(precompiled_kernel.first);
                 const auto& iter = _cached_kernels.find(cached_kernel_id);
                 if (iter == _cached_kernels.end()) {
-                    _cached_kernels[cached_kernel_id] = std::make_shared<ocl::ocl_kernel>(ocl::ocl_kernel_type(k, build_device.get_usm_helper()), entry_point);
+                    _cached_kernels[cached_kernel_id] = k;
                 }
             }
         }
-    } catch (const cl::BuildError& err) {
-        std::string err_log = "";
-        for (auto& p : err.getBuildLog()) {
-            err_log += p.second + '\n';
-        }
-        OPENVINO_THROW(err_log);
     }
 }
 
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp
index e623c547bf691d..ee65b68be0138c 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp
@@ -7,6 +7,7 @@
 #include "intel_gpu/graph/serialization/binary_buffer.hpp"
 #include "intel_gpu/runtime/device.hpp"
 #include "intel_gpu/runtime/kernel.hpp"
+#include "intel_gpu/runtime/kernel_builder.hpp"
 #include "intel_gpu/runtime/execution_config.hpp"
 #include "intel_gpu/graph/kernel_impl_params.hpp"
 
@@ -97,6 +98,7 @@ class kernels_cache {
 private:
     static std::mutex _mutex;
     const device::ptr _device;
+    std::shared_ptr<kernel_builder> _builder;
     std::shared_ptr<ov::threading::ITaskExecutor> _task_executor;
     ExecutionConfig _config;
     uint32_t _prog_id = 0;
@@ -127,8 +129,6 @@ class kernels_cache {
     void set_kernels_reuse(bool reuse_kernels) { _reuse_kernels = reuse_kernels; }
     bool get_kernels_reuse() const { return _reuse_kernels; }
 
-    bool validate_simple_kernel_execution(kernel::ptr kernel);
-
     // forces compilation of all pending kernels/programs
     void build_all();
     void reset();
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
index 482184340171c5..3da69b2aa48bfc 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
@@ -221,12 +221,11 @@ struct multi_stage_primitive : public typed_primitive_impl<PType> {
     void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
         OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed.");
         auto& kernel_vec = kernels.begin()->second;
-        auto& engine = kernels.begin()->first.get_program().get_engine();
         _kernels.clear();
         _kernels.resize(kernel_vec.size());
         for (auto& k : kernel_vec) {
             auto sub_kernel_idx = k.second;
-            _kernels[sub_kernel_idx] = engine.prepare_kernel(k.first);
+            _kernels[sub_kernel_idx] = k.first;
         }
     }
 
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
index 40f04650658264..b897bbac550196 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -297,12 +297,11 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
         OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed.");
         auto& kernel_vec = kernels.begin()->second;
-        auto& engine = kernels.begin()->first.get_program().get_engine();
         _kernels.clear();
         _kernels.resize(kernel_vec.size());
         for (auto& k : kernel_vec) {
             auto sub_kernel_idx = k.second;
-            _kernels[sub_kernel_idx] = engine.prepare_kernel(k.first);
+            _kernels[sub_kernel_idx] = k.first;
         }
     }
 
diff --git a/src/plugins/intel_gpu/src/runtime/file_util.cpp b/src/plugins/intel_gpu/src/runtime/file_util.cpp
index e73397f1e51844..fc64b8c8af200c 100644
--- a/src/plugins/intel_gpu/src/runtime/file_util.cpp
+++ b/src/plugins/intel_gpu/src/runtime/file_util.cpp
@@ -7,7 +7,7 @@
 
 namespace ov::intel_gpu {
 
-void save_binary(const std::string &path, std::vector<uint8_t> binary) {
+void save_binary(const std::string &path, const std::vector<uint8_t> &binary) {
     try {
         ov::util::save_binary(path, binary);
     } catch (std::runtime_error&) {}
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
index 9917ddfe6712a1..b3a5c2266f165e 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -4,7 +4,8 @@
 
 #include "ocl_engine.hpp"
 #include "intel_gpu/runtime/utils.hpp"
-#include "ocl/ocl_kernel.hpp"
+#include "ocl_kernel.hpp"
+#include "ocl_kernel_builder.hpp"
 #include "ocl_common.hpp"
 #include "ocl_memory.hpp"
 #include "ocl_stream.hpp"
@@ -307,13 +308,10 @@ void* ocl_engine::get_user_context() const {
     return static_cast<void*>(cl_device.get_context().get());
 }
 
-std::vector<kernel::ptr> ocl_engine::build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options) const {
-    cl::Program()
-}
-
-kernel::ptr ocl_engine::prepare_kernel(const kernel::ptr kernel) const {
-    OPENVINO_ASSERT(downcast<const ocl::ocl_kernel>(kernel.get()) != nullptr);
-    return kernel;
+std::shared_ptr<kernel_builder> ocl_engine::create_kernel_builder() const {
+    auto cl_device = std::dynamic_pointer_cast<ocl_device>(_device);
+    OPENVINO_ASSERT(cl_device, "[GPU] Invalid device type for ocl_engine");
+    return std::make_shared<ocl_kernel_builder>(*cl_device);
 }
 
 bool ocl_engine::extension_supported(std::string extension) const {
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
index ca505d1ce16812..84908c7addc969 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
@@ -46,7 +46,7 @@ class ocl_engine : public engine {
     stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override;
     stream& get_service_stream() const override;
 
-    void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector<kernel::ptr> out) const override;
+    std::shared_ptr<kernel_builder> create_kernel_builder() const override;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     void create_onednn_engine(const ExecutionConfig& config) override;
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp
index 417a0286df7252..7a59b3c4169b3b 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp
@@ -38,5 +38,16 @@ std::vector<uint8_t> ocl_kernel::get_binary() const {
     return binary;
 }
 
+std::string ocl_kernel::get_build_log() const {
+    auto program = _compiled_kernel.getInfo<CL_KERNEL_PROGRAM>();
+    auto log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>();
+    // Assume program was build for only 1 device
+    // Return first log
+    if (log.size() > 0) {
+        return log[0].second;
+    }
+    OPENVINO_THROW("[GPU] Failed to retrieve kernel build log");
+}
+
 }  // namespace ocl
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp
index 035182f664df4d..d46fda1ee02f15 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp
@@ -35,6 +35,7 @@ class ocl_kernel : public kernel {
     }
 
     std::vector<uint8_t> get_binary() const override;
+    std::string get_build_log() const override;
 };
 
 }  // namespace ocl
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp
new file mode 100644
index 00000000000000..7ded6cf5b8e837
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp
@@ -0,0 +1,78 @@
+// Copyright (C) 2016-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "intel_gpu/runtime/kernel_builder.hpp"
+#include "intel_gpu/runtime/device.hpp"
+
+#include "ocl_device.hpp"
+#include "ocl_kernel.hpp"
+
+
+namespace cldnn {
+namespace ocl {
+
+class ocl_kernel_builder : public kernel_builder{
+    public:
+        ocl_kernel_builder(const ocl_device &device) : m_device(device) {}
+
+        void build_kernels(const void *src,
+            size_t src_bytes,
+            KernelFormat src_format,
+            const std::string &options,
+            std::vector<kernel::ptr> &out) const override {
+            auto context = m_device.get_context().get();
+
+            cl_program program_handle;
+            cl_int err = CL_INVALID_VALUE;
+            switch (src_format) {
+            case KernelFormat::SOURCE: {
+                const char **strings = reinterpret_cast<const char**>(&src);
+                const size_t *lenghts = &src_bytes;
+                const cl_uint count = 1;
+                program_handle = clCreateProgramWithSource(context, count, strings, lenghts, &err);
+                break;
+            }
+            case KernelFormat::NATIVE_BIN: {
+                const unsigned char **binaries = reinterpret_cast<const unsigned char**>(&src);
+                const size_t *lenghts = &src_bytes;
+                const cl_device_id device_id = m_device.get_device().get();
+                const cl_uint count = 1;
+                program_handle = clCreateProgramWithBinary(context, count, &device_id, lenghts, binaries, nullptr, &err);
+                break;
+            }
+            default:
+                OPENVINO_THROW("[GPU] Trying to build kernel from unexpected format");
+                break;
+            }
+            if (err != CL_SUCCESS) {
+                OPENVINO_THROW("[GPU] Failed to create program during kernel build process");
+            }
+            cl::Program program(program_handle);
+            if (program.build(m_device.get_device(), options.c_str()) != CL_SUCCESS) {
+                GPU_DEBUG_INFO << "-------- Kernel build error" << std::endl;
+                auto log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>();
+                for (auto &e : log) {
+                    GPU_DEBUG_INFO << e.second;
+                }
+                GPU_DEBUG_INFO << "-------- End of Kernel build error" << std::endl;
+                OPENVINO_THROW("[GPU] Failed to build program");
+            }
+            cl::vector<cl::Kernel> kernels;
+            if (program.createKernels(&kernels) != CL_SUCCESS) {
+                OPENVINO_THROW("[GPU] Failed to create kernels");
+            }
+            for (auto& k : kernels) {
+                const auto &entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
+                out.push_back(std::make_shared<ocl::ocl_kernel>(ocl::ocl_kernel_type(k, m_device.get_usm_helper()), entry_point));
+            }
+    }
+
+    private:
+        const ocl_device &m_device;
+};
+}  // namespace ocl
+}  // namespace cldnn
+
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
index 48fdade2cdbab9..45c453009454e9 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -5,7 +5,7 @@
 #include "ze_engine.hpp"
 #include "intel_gpu/runtime/utils.hpp"
 #include "openvino/core/except.hpp"
-#include "ze/ze_kernel.hpp"
+#include "ze_kernel_builder.hpp"
 #include "ze_api.h"
 #include "ze_engine_factory.hpp"
 #include "ze_common.hpp"
@@ -26,27 +26,6 @@
 namespace cldnn {
 namespace ze {
 
-namespace {
-
-ze_module_handle_t ze_create_module_with_level_zero(const cldnn::ze::ze_engine& engine, std::vector<uint8_t> binary) {
-    auto desc = ze_module_desc_t();
-    desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
-    desc.format = ZE_MODULE_FORMAT_NATIVE;
-    desc.inputSize = binary.size();
-    desc.pInputModule = binary.data();
-    desc.pBuildFlags = "";
-    desc.pConstants = nullptr;
-
-    ze_module_handle_t ze_module;
-
-    auto ze_device = engine.get_device();
-    auto ze_ctx = engine.get_context();
-    OV_ZE_EXPECT(zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr));
-    return ze_module;
-}
-
-}  // namespace
-
 ze_engine::ze_engine(const device::ptr dev, runtime_types runtime_type)
     : engine(dev) {
     OPENVINO_ASSERT(runtime_type == runtime_types::ze, "[GPU] Invalid runtime type specified for ZE engine. Only ZE runtime is supported");
@@ -222,58 +201,10 @@ bool ze_engine::is_the_same_buffer(const memory& mem1, const memory& mem2) {
     return (reinterpret_cast<const ze::gpu_usm&>(mem1).get_buffer().get() == reinterpret_cast<const ze::gpu_usm&>(mem2).get_buffer().get());
 }
 
-void ze_engine::build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector<kernel::ptr> out) const {
-    ze_module_desc_t module_desc = {
-        ZE_STRUCTURE_TYPE_MODULE_DESC,
-        nullptr,
-        ZE_MODULE_FORMAT_NATIVE,
-        src_bytes,
-        reinterpret_cast<const uint8_t *>(src),
-        options.c_str(),
-        nullptr // specialization constants
-    };
-    switch (src_format)
-    {
-    case KernelFormat::SOURCE:
-        module_desc.format = ze_module_format_oclc;
-        break;
-    case KernelFormat::IL:
-        module_desc.format = ZE_MODULE_FORMAT_IL_SPIRV;
-        break;
-    case KernelFormat::NATIVE_BIN:
-        module_desc.format = ZE_MODULE_FORMAT_NATIVE;
-        break;
-    default:
-        OPENVINO_THROW("[GPU] Trying to build kernel from unexpected format");
-        break;
-    }
-    ze_module_handle_t module_handle;
-    ze_module_build_log_handle_t log_handle;
-    ze_result_t build_result = zeModuleCreate(get_context(), get_device(), &module_desc, &module_handle, &log_handle);
-    if (build_result != ZE_RESULT_SUCCESS) {
-        size_t log_size = 0;
-        OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, nullptr));
-        std::string log(log_size, ' ');
-        OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, log.data()));
-        OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle));
-        OPENVINO_THROW(log);
-    }
-    auto module_holder = std::make_shared<ze_module_holder>(module_handle);
-    OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle));
-    uint32_t kernel_count = 0;
-    OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, nullptr));
-    std::vector<const char*> kernel_names(kernel_count);
-    OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, kernel_names.data()));
-    ze_kernel_flags_t flags = 0;
-    ze_kernel_desc_t kernel_desc = {
-        ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, flags, nullptr};
-    for (auto name : kernel_names) {
-        kernel_desc.pKernelName = name;
-        ze_kernel_handle_t kernel_handle;
-        OV_ZE_EXPECT(zeKernelCreate(module_handle, &kernel_desc, &kernel_handle));
-        auto kernel_holder = std::make_shared<ze_kernel_holder>(kernel_handle, module_holder);
-        out.push_back(std::make_shared<ze_kernel>(kernel_holder, std::string(name)));
-    }
+std::shared_ptr<kernel_builder> ze_engine::create_kernel_builder() const {
+    auto casted = std::dynamic_pointer_cast<ze_device>(_device);
+    OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine");
+    return std::make_shared<ze_kernel_builder>(*casted);
 }
 
 void* ze_engine::get_user_context() const {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
index fb86467fb9cc07..586b830a2de93e 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
@@ -41,7 +41,7 @@ class ze_engine : public engine {
     stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override;
     stream& get_service_stream() const override;
 
-    void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector<kernel::ptr> out) const override;
+    std::shared_ptr<kernel_builder> create_kernel_builder() const override;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     void create_onednn_engine(const ExecutionConfig& config) override;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
index 61bc03b46f85e2..4944942ab6c201 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
@@ -16,12 +16,33 @@ namespace ze {
 
 class ze_kernel : public kernel {
 public:
+    static void create_kernels_from_module(std::shared_ptr<ze_module_holder> module, std::vector<kernel::ptr> &out) {
+        ze_module_handle_t module_handle = module->get_module_handle();
+        uint32_t kernel_count = 0;
+        OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, nullptr));
+        std::vector<const char*> kernel_names(kernel_count);
+        // Specification does not mention who is responsible for the returned pointers
+        // Assume Level Zero owns the pointers and they will remain valid as long as the module resource
+        OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, kernel_names.data()));
+
+        ze_kernel_flags_t flags = 0;
+        ze_kernel_desc_t kernel_desc = {
+            ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, flags, nullptr};
+        for (auto name : kernel_names) {
+            kernel_desc.pKernelName = name;
+            ze_kernel_handle_t kernel_handle;
+            OV_ZE_EXPECT(zeKernelCreate(module_handle, &kernel_desc, &kernel_handle));
+            auto kernel_holder = std::make_shared<ze_kernel_holder>(kernel_handle, module);
+            out.push_back(std::make_shared<ze_kernel>(kernel_holder, std::string(name)));
+        }
+    }
+
     ze_kernel(std::shared_ptr<ze_kernel_holder> kernel, const std::string& kernel_id)
         : m_kernel(kernel)
         , m_kernel_id(kernel_id) { }
 
-    ze_kernel_handle_t get_kernel() { return m_kernel->get_kernel(); }
-    ze_module_handle_t get_module() { return m_kernel->get_module(); }
+    ze_kernel_handle_t get_kernel_handle() const { return m_kernel->get_kernel_handle(); }
+    ze_module_handle_t get_module_handle() const { return m_kernel->get_module()->get_module_handle(); }
     std::string get_id() const override { return m_kernel_id; }
 
     std::shared_ptr<kernel> clone(bool reuse_kernel_handle = false) const override {
@@ -29,20 +50,21 @@ class ze_kernel : public kernel {
             return std::make_shared<ze_kernel>(m_kernel, m_kernel_id);
         } else {
             ze_kernel_handle_t cloned_handle;
-            ze_module_handle_t module_handle = m_kernel->get_module();
+            ze_module_handle_t module_handle = get_module_handle();
             ze_kernel_desc_t descriptor;
             descriptor.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC;
             descriptor.pNext = nullptr;
             descriptor.flags = 0;
             descriptor.pKernelName = m_kernel_id.c_str();
             OV_ZE_EXPECT(zeKernelCreate(module_handle, &descriptor, &cloned_handle));
-            return std::make_shared<ze_kernel>(cloned_handle, module_handle, m_kernel_id);
+            auto kernel_holder = std::make_shared<ze_kernel_holder>(cloned_handle, m_kernel->get_module());
+            return std::make_shared<ze_kernel>(kernel_holder, m_kernel_id);
         }
     }
 
     std::vector<uint8_t> get_binary() const override {
         size_t binary_size = 0;
-        ze_module_handle_t module_handle = m_kernel->get_module();
+        ze_module_handle_t module_handle = get_module_handle();
         OV_ZE_EXPECT(zeModuleGetNativeBinary(module_handle, &binary_size, nullptr));
 
         std::vector<uint8_t> binary(binary_size);
@@ -50,6 +72,17 @@ class ze_kernel : public kernel {
 
         return binary;
     }
+
+    std::string get_build_log() const override {
+        ze_module_build_log_handle_t build_log_handle = m_kernel->get_module()->get_build_log_handle();
+        size_t log_size = 0;
+        OV_ZE_EXPECT(zeModuleBuildLogGetString(build_log_handle, &log_size, nullptr));
+
+        std::string log(log_size, ' ');
+        OV_ZE_EXPECT(zeModuleBuildLogGetString(build_log_handle, &log_size, log.data()));
+        return log;
+    }
+
 private:
     std::shared_ptr<ze_kernel_holder> m_kernel;
     std::string m_kernel_id;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp
new file mode 100644
index 00000000000000..7aae55d6a7f990
--- /dev/null
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp
@@ -0,0 +1,67 @@
+// Copyright (C) 2016-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "intel_gpu/runtime/kernel_builder.hpp"
+#include "intel_gpu/runtime/device.hpp"
+
+#include "ze_device.hpp"
+#include "ze_kernel.hpp"
+#include "ze_common.hpp"
+
+namespace cldnn {
+namespace ze {
+
+class ze_kernel_builder : public kernel_builder{
+    public:
+        ze_kernel_builder(const ze_device &device) : m_device(device) {}
+
+    void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector<kernel::ptr> &out) const override {
+        ze_module_desc_t module_desc = {
+            ZE_STRUCTURE_TYPE_MODULE_DESC,
+            nullptr,
+            ZE_MODULE_FORMAT_NATIVE,
+            src_bytes,
+            reinterpret_cast<const uint8_t *>(src),
+            options.c_str(),
+            nullptr // specialization constants
+        };
+        switch (src_format) {
+        case KernelFormat::SOURCE: {
+            module_desc.format = ze_module_format_oclc;
+            break;
+        }
+        case KernelFormat::NATIVE_BIN: {
+            module_desc.format = ZE_MODULE_FORMAT_NATIVE;
+            break;
+        }
+        default:
+            OPENVINO_THROW("[GPU] Trying to build kernel from unexpected format");
+            break;
+        }
+        ze_module_handle_t module_handle;
+        ze_module_build_log_handle_t log_handle;
+        ze_result_t build_result = zeModuleCreate(m_device.get_context(), m_device.get_device(), &module_desc, &module_handle, &log_handle);
+        if (build_result != ZE_RESULT_SUCCESS) {
+            size_t log_size = 0;
+            OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, nullptr));
+            std::string log(log_size, ' ');
+            OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, log.data()));
+            OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle));
+            GPU_DEBUG_INFO << "-------- Kernel build error" << std::endl;
+            GPU_DEBUG_INFO << log << std::endl;
+            GPU_DEBUG_INFO << "-------- End of Kernel build error" << std::endl;
+            OPENVINO_THROW("[GPU] Failed to build module");
+        }
+        auto module_holder = std::make_shared<ze_module_holder>(module_handle, log_handle);
+        ze_kernel::create_kernels_from_module(module_holder, out);
+    }
+
+    private:
+        const ze_device &m_device;
+};
+}  // namespace ze
+}  // namespace cldnn
+
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp
index 72c98c4b9283e7..5d7ab17b6b8367 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp
@@ -22,12 +22,11 @@ class ze_kernel_holder {
     ~ze_kernel_holder() {
         OV_ZE_WARN(zeKernelDestroy(m_kernel));
     }
-    ze_kernel_handle_t get_kernel() { return m_kernel; }
-    ze_module_handle_t get_module() { return m_module->get_module(); }
+    ze_kernel_handle_t get_kernel_handle() { return m_kernel; }
+    std::shared_ptr<ze_module_holder> get_module() { return m_module; }
 private:
     ze_kernel_handle_t m_kernel;
     std::shared_ptr<ze_module_holder> m_module;
-}
-
+};
 }  // namespace ze
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp
index 9288584fc5c681..fcd5c5840b935f 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp
@@ -5,6 +5,8 @@
 #pragma once
 
 #include "ze_common.hpp"
+#include <vector>
+#include <string>
 
 namespace cldnn {
 namespace ze {
@@ -12,18 +14,21 @@ namespace ze {
 // RAII wrapper for Level Zero module
 class ze_module_holder {
 public:
-    // Take ownership of existing module handle
-    explicit ze_module_holder(ze_module_handle_t handle) : m_handle(handle) {}
+    // Take ownership of existing module and build log handles
+    explicit ze_module_holder(ze_module_handle_t module, ze_module_build_log_handle_t build_log) : m_module(module), m_build_log(build_log) {}
 
     ze_module_holder(const ze_module_holder& other) = delete;
     ze_module_holder& operator=(const ze_module_holder& other) = delete;
     ~ze_module_holder() {
-        OV_ZE_WARN(zeModuleDestroy(m_handle));
+        OV_ZE_WARN(zeModuleBuildLogDestroy(m_build_log));
+        OV_ZE_WARN(zeModuleDestroy(m_module));
     }
-    ze_module_handle_t get_module() { return m_handle; }
-private:
-    ze_module_handle_t m_handle;
+    ze_module_handle_t get_module_handle() const { return m_module; }
+    ze_module_build_log_handle_t get_build_log_handle() const { return m_build_log; }
 
+private:
+    ze_module_handle_t m_module;
+    ze_module_build_log_handle_t m_build_log;
 };
 }  // namespace ze
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index a16ceb097a46e7..ac081e262383a9 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -234,7 +234,7 @@ void ze_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args_
     std::lock_guard<std::mutex> guard(m);
 
     auto& ze_kernel = downcast<ze::ze_kernel>(kernel);
-    auto& kern = ze_kernel.get_handle();
+    auto kern = ze_kernel.get_kernel_handle();
     set_arguments_impl(kern, args_desc.arguments, args);
 }
 
@@ -245,7 +245,7 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel,
                                      bool is_output) {
     auto& ze_kernel = downcast<ze::ze_kernel>(kernel);
 
-    auto& kern = ze_kernel.get_handle();
+    auto kern = ze_kernel.get_kernel_handle();
 
     std::vector<ze_event_handle_t> dep_events;
     std::vector<ze_event_handle_t>* dep_events_ptr = nullptr;

From 9ba3c9cd28d9313d2bbff5e889623cc771b5238e Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 7 Nov 2025 09:40:03 +0000
Subject: [PATCH 45/74] Fix style

---
 src/common/util/include/openvino/util/file_util.hpp | 2 +-
 src/common/util/src/file_util.cpp                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common/util/include/openvino/util/file_util.hpp b/src/common/util/include/openvino/util/file_util.hpp
index 88407538a36cc8..bfb60e8a126b82 100644
--- a/src/common/util/include/openvino/util/file_util.hpp
+++ b/src/common/util/include/openvino/util/file_util.hpp
@@ -316,7 +316,7 @@ std::vector<uint8_t> load_binary(const std::string& path);
  * @brief save binary data to file
  * @param path - binary file path to store
  */
-void save_binary(const std::string& path, const std::vector<uint8_t> &binary);
+void save_binary(const std::string& path, const std::vector<uint8_t>& binary);
 void save_binary(const std::string& path, const char* binary, size_t bin_size);
 
 /**
diff --git a/src/common/util/src/file_util.cpp b/src/common/util/src/file_util.cpp
index db39f3a61749d2..22dd2daf3ca75f 100644
--- a/src/common/util/src/file_util.cpp
+++ b/src/common/util/src/file_util.cpp
@@ -515,7 +515,7 @@ std::vector<uint8_t> ov::util::load_binary(const std::string& path) {
     return {};
 }
 
-void ov::util::save_binary(const std::string& path, const std::vector<uint8_t> &binary) {
+void ov::util::save_binary(const std::string& path, const std::vector<uint8_t>& binary) {
     save_binary(path, reinterpret_cast<const char*>(&binary[0]), binary.size());
     return;
 }

From d0f157c9e54162b53e683229f653ad02426331ac Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 7 Nov 2025 16:05:19 +0000
Subject: [PATCH 46/74] Remove OpenCL dependency when running L0

* removed opencl from targets: openvino_intel_gpu_kernels, openvino_intel_gpu_runtime
* opencl will still be linked to openvino gpu plugin even with L0 rt
---
 src/plugins/intel_gpu/CMakeLists.txt                |  2 +-
 src/plugins/intel_gpu/cmake/utils.cmake             |  9 ++++-----
 .../intel_gpu/include/intel_gpu/runtime/event.hpp   | 10 ++++++++++
 .../intel_gpu/include/intel_gpu/runtime/memory.hpp  |  2 --
 .../intel_gpu/include/intel_gpu/runtime/stream.hpp  |  3 ++-
 .../src/graph/impls/onednn/primitive_onednn_base.h  | 11 ++++-------
 src/plugins/intel_gpu/src/graph/network.cpp         |  6 +++---
 .../intel_gpu/src/kernel_selector/CMakeLists.txt    |  2 +-
 src/plugins/intel_gpu/src/runtime/CMakeLists.txt    |  7 +++++--
 src/plugins/intel_gpu/src/runtime/device_query.cpp  |  2 ++
 src/plugins/intel_gpu/src/runtime/engine.cpp        |  2 ++
 src/plugins/intel_gpu/src/runtime/memory.cpp        | 13 -------------
 src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp | 10 ----------
 src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp |  5 -----
 .../intel_gpu/src/runtime/ocl/ocl_stream.cpp        |  5 +++++
 .../intel_gpu/src/runtime/ocl/ocl_stream.hpp        |  1 +
 src/plugins/intel_gpu/src/runtime/stream.cpp        | 11 -----------
 .../intel_gpu/src/runtime/ze/ze_engine_factory.hpp  |  2 +-
 src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp  |  2 +-
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp  |  7 ++++++-
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp  |  1 +
 21 files changed, 49 insertions(+), 64 deletions(-)

diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt
index 38e3dafbc3305f..531047df7dbf10 100644
--- a/src/plugins/intel_gpu/CMakeLists.txt
+++ b/src/plugins/intel_gpu/CMakeLists.txt
@@ -77,7 +77,7 @@ ov_add_plugin(NAME ${TARGET_NAME}
 target_compile_options(${TARGET_NAME} PRIVATE
     $<$<CONFIG:Release>:$<IF:$<CXX_COMPILER_ID:MSVC>,/Os,-Os>>)
 
-target_link_libraries(${TARGET_NAME} PRIVATE openvino_intel_gpu_graph openvino::pugixml)
+target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL openvino_intel_gpu_graph openvino::pugixml)
 
 target_include_directories(${TARGET_NAME} PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/include/)
diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake
index 5b61368a5d4742..f346f50f8792cf 100644
--- a/src/plugins/intel_gpu/cmake/utils.cmake
+++ b/src/plugins/intel_gpu/cmake/utils.cmake
@@ -4,12 +4,11 @@
 
 function(ov_gpu_set_runtime_interface_for TARGET_NAME)
     if(GPU_RT_TYPE STREQUAL "L0")
-        target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_ZE_RT=1)
-        target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero)
+        target_compile_definitions(${TARGET_NAME} PRIVATE OV_GPU_WITH_ZE_RT=1)
+        target_link_libraries(${TARGET_NAME} PRIVATE LevelZero::LevelZero)
     elseif(GPU_RT_TYPE STREQUAL "OCL")
-        target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1)
-        # Do not link OpenCL as It is already linked to the targets that require it
-        # target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::NewHeaders OpenCL::OpenCL)
+        target_compile_definitions(${TARGET_NAME} PRIVATE OV_GPU_WITH_OCL_RT=1)
+        target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL)
     else()
         message(FATAL_ERROR "Invalid GPU runtime type: `${GPU_RT_TYPE}` Only `L0` and `OCL` are supported")
     endif()
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp
index dffa64173fe72c..5451cd47c7daf2 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp
@@ -11,6 +11,7 @@
 #include <utility>
 #include <utility>
 #include <functional>
+#include <optional>
 
 namespace cldnn {
 struct user_event;
@@ -32,6 +33,15 @@ struct event {
         _profiling_captured = false;
         _profiling_info.clear();
     }
+    // Set event profiling data instead of retrieving it from event object
+    void set_profiling(uint64_t duration_nsec) {
+        auto stage = instrumentation::profiling_stage::executing;
+        auto duration = std::chrono::nanoseconds(duration_nsec);
+        auto period = std::make_shared<instrumentation::profiling_period_basic>(duration);
+
+        _profiling_info.push_back({ stage, period });
+        _profiling_captured = true;
+    }
 
     // returns true if handler has been successfully added
     bool add_event_handler(event_handler handler, void* data);
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
index 67843abbe86ec0..4e0a1191982c0b 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
@@ -223,8 +223,6 @@ struct surfaces_lock {
 
     surfaces_lock(const surfaces_lock& other) = delete;
     surfaces_lock& operator=(const surfaces_lock& other) = delete;
-
-    static std::unique_ptr<surfaces_lock> create(engine_types engine_type, std::vector<memory::ptr> mem, const stream& stream);
 };
 
 template<typename T>
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp
index 1e8300f92135e7..98f6d87066ac34 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp
@@ -17,6 +17,7 @@
 #endif
 
 namespace cldnn {
+struct surfaces_lock;
 
 // Possible sync methods for kernels in stream
 enum class SyncMethods {
@@ -66,12 +67,12 @@ class stream {
     virtual void wait_for_events(const std::vector<event::ptr>& events) = 0;
     virtual event::ptr create_user_event(bool set) = 0;
     virtual event::ptr create_base_event() = 0;
+    virtual std::unique_ptr<surfaces_lock> create_surfaces_lock(const std::vector<memory::ptr> &mem) const = 0;
     virtual event::ptr aggregate_events(const std::vector<event::ptr>& events, bool group = false, bool is_output = false);
 
     QueueTypes get_queue_type() const { return m_queue_type; }
     SyncMethods get_sync_method() const { return m_sync_method; }
 
-    static QueueTypes detect_queue_type(engine_types engine_type, void* queue_handle);
     static SyncMethods get_expected_sync_method(const ExecutionConfig& config);
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h
index 38034280659e0e..122cb04d8c7640 100644
--- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h
+++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h
@@ -12,7 +12,6 @@
 #include "intel_gpu/runtime/file_util.hpp"
 #include "to_string_utils.h"
 #include "utils.hpp"
-#include "runtime/ocl/ocl_event.hpp"
 
 #include "intel_gpu/primitives/reorder.hpp"
 
@@ -544,8 +543,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
             try {
                 _prim.execute(stream.get_onednn_stream(), _args[net_id]);
             } catch (dnnl::error& err) {
-                auto err_code = err.status == dnnl_status_t::dnnl_out_of_memory ? CL_OUT_OF_RESOURCES : CL_INVALID_OPERATION;
-                ocl::rethrow(err.what(), err_code, _engine->get_device_info());
+                OPENVINO_THROW(err.what());
             }
 
             if (_enable_profiling) {
@@ -554,12 +552,11 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
                 stream.wait();
 
                 std::vector<uint64_t> duration = dnnl::get_profiling_data(stream.get_onednn_stream(), dnnl::profiling_data_kind::time);
-                if (duration.empty()) {
-                    event = std::make_shared<ocl::ocl_event>(0);
-                } else {
+                event = stream.create_user_event(true);
+                if (!duration.empty()) {
                     OPENVINO_ASSERT(duration.size() == 1, "[GPU] oneDNN profiling data is expected to have info only for single primitive ",
                                                       "actual number is ", duration.size());
-                    event = std::make_shared<ocl::ocl_event>(duration[0]);
+                    event->set_profiling(duration[0]);
                 }
 
             } else {
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
index 72310ad7ba1c28..36fce1bc89b44a 100644
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -721,13 +721,13 @@ std::map<primitive_id, network_output> network::execute(const std::vector<event:
         }
     }
 
-    // We shouldn't call surfaces_lock::create() function constantly here, but due to
+    // We shouldn't call create_surfaces_lock function constantly here, but due to
     // some changes in assembler code, performance drops in case if we move it under
     // `shared_mem_found` condition (it somehow connected with get_cl_queue() - this function call
-    // makes asm faster for some reasons). So, as WA we keep this surfaces_lock::create() here
+    // makes asm faster for some reasons). So, as WA we keep this create_surfaces_lock here
     // with empty memory vector and do nothing inside this function for saving performance
     // in some cases.
-    auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream());
+    auto surf_lock = get_stream().create_surfaces_lock(in_out_mem);
 
     execute_impl(dependencies);
 
diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
index c5311f5be101bc..fe8582b1a0449c 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
+++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
@@ -74,7 +74,7 @@ if(COMMAND add_cpplint_target)
   add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
 endif()
 
-target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::OpenCL openvino::runtime PRIVATE openvino::runtime::dev)
+target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE openvino::runtime::dev)
 ov_gpu_set_runtime_interface_for(${TARGET_NAME})
 
 target_include_directories(${TARGET_NAME} PRIVATE $<TARGET_PROPERTY:rapidjson,INTERFACE_INCLUDE_DIRECTORIES>)
diff --git a/src/plugins/intel_gpu/src/runtime/CMakeLists.txt b/src/plugins/intel_gpu/src/runtime/CMakeLists.txt
index 384ee9c0bef7a5..997dcc617791f2 100644
--- a/src/plugins/intel_gpu/src/runtime/CMakeLists.txt
+++ b/src/plugins/intel_gpu/src/runtime/CMakeLists.txt
@@ -27,7 +27,11 @@ if(NOT OV_COMPILER_IS_INTEL_LLVM)
     list(REMOVE_ITEM LIBRARY_SOURCES_OCL ${SYCL_SOURCES})
 endif()
 
-set(LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_MAIN} ${LIBRARY_SOURCES_OCL})
+set(LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_MAIN})
+
+if(GPU_RT_TYPE STREQUAL "OCL")
+    list(APPEND LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_OCL})
+endif()
 
 if(GPU_RT_TYPE STREQUAL "L0")
     list(APPEND LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_ZE})
@@ -69,7 +73,6 @@ ov_set_threading_interface_for(${TARGET_NAME})
 ov_gpu_set_runtime_interface_for(${TARGET_NAME})
 
 target_link_libraries(${TARGET_NAME} PRIVATE
-    OpenCL::OpenCL
     openvino::itt
     openvino::runtime::dev
   )
diff --git a/src/plugins/intel_gpu/src/runtime/device_query.cpp b/src/plugins/intel_gpu/src/runtime/device_query.cpp
index 8a0ee71944483c..7ec5d6d90edabe 100644
--- a/src/plugins/intel_gpu/src/runtime/device_query.cpp
+++ b/src/plugins/intel_gpu/src/runtime/device_query.cpp
@@ -18,12 +18,14 @@ device_query::device_query(engine_types engine_type,
                            int target_tile_id,
                            bool initialize_devices) {
     switch (runtime_type) {
+#ifdef OV_GPU_WITH_OCL_RT
     case runtime_types::ocl: {
         OPENVINO_ASSERT(engine_type == engine_types::ocl || engine_type == engine_types::sycl);
         ocl::ocl_device_detector ocl_detector;
         _available_devices = ocl_detector.get_available_devices(user_context, user_device, ctx_device_id, target_tile_id, initialize_devices);
         break;
     }
+#endif
 #ifdef OV_GPU_WITH_ZE_RT
     case runtime_types::ze: {
         OPENVINO_ASSERT(engine_type == engine_types::ze);
diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp
index ea7820f57353be..f35d4d577230f4 100644
--- a/src/plugins/intel_gpu/src/runtime/engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/engine.cpp
@@ -263,9 +263,11 @@ std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type, runtime_
         ret = ocl::create_sycl_engine(device, runtime_type);
         break;
 #endif  // OV_GPU_WITH_SYCL
+#ifdef OV_GPU_WITH_OCL_RT
     case engine_types::ocl:
         ret = ocl::create_ocl_engine(device, runtime_type);
         break;
+#endif
 #ifdef OV_GPU_WITH_ZE_RT
     case engine_types::ze:
         ret = ze::create_ze_engine(device, runtime_type);
diff --git a/src/plugins/intel_gpu/src/runtime/memory.cpp b/src/plugins/intel_gpu/src/runtime/memory.cpp
index a720ee7ec4ea31..f61aa202b9b4bc 100644
--- a/src/plugins/intel_gpu/src/runtime/memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/memory.cpp
@@ -7,8 +7,6 @@
 #include "intel_gpu/runtime/stream.hpp"
 #include "intel_gpu/runtime/debug_configuration.hpp"
 
-#include "ocl/ocl_memory.hpp"
-
 #include <string>
 #include <vector>
 #include <memory>
@@ -45,15 +43,4 @@ memory::memory(engine* engine, const layout& layout, allocation_type type, std::
     : _engine(engine), _layout(layout), _bytes_count(_layout.bytes_count()), m_mem_tracker(mem_tracker), _type(type) {
 }
 
-std::unique_ptr<surfaces_lock> surfaces_lock::create(engine_types engine_type, std::vector<memory::ptr> mem, const stream& stream) {
-    switch (engine_type) {
-    case engine_types::sycl:
-    case engine_types::ocl:
-        return std::unique_ptr<ocl::ocl_surfaces_lock>(new ocl::ocl_surfaces_lock(mem, stream));
-    case engine_types::ze:
-        return nullptr; // TODO: implement once we have support for surface sharing
-    default: throw std::runtime_error("Unsupported engine type in surfaces_lock::create");
-    }
-}
-
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp
index 29a27e5ea6acee..26fbe6fd272487 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp
@@ -100,16 +100,6 @@ static const std::vector<profiling_period_ocl_start_stop> profiling_periods{
 };
 
 bool ocl_event::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
-    if (duration_nsec.has_value()) {
-        auto stage = instrumentation::profiling_stage::executing;
-        auto duration = std::chrono::nanoseconds(duration_nsec.value());
-        auto period = std::make_shared<instrumentation::profiling_period_basic>(duration);
-
-        info.push_back({ stage, period });
-
-        return true;
-    }
-
     if (!is_event_profiled(_event))
         return true;
 
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp
index d51b7de50167b1..7efb87d8775405 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp
@@ -24,10 +24,6 @@ struct ocl_event : public ocl_base_event {
         : ocl_base_event(queue_stamp)
         , _event(ev) {}
 
-    ocl_event(uint64_t duration_nsec, uint64_t queue_stamp = 0)
-        : ocl_base_event(queue_stamp)
-        , duration_nsec(duration_nsec) {}
-
     cl::Event& get() override { return _event; }
 
 private:
@@ -45,7 +41,6 @@ struct ocl_event : public ocl_base_event {
 
 protected:
     cl::Event _event;
-    std::optional<uint64_t> duration_nsec;
 };
 
 struct ocl_events : public ocl_base_event {
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
index a0cb17ba40f615..a7a02d69f205f7 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
@@ -11,6 +11,7 @@
 #include "intel_gpu/runtime/debug_configuration.hpp"
 #include "ocl_kernel.hpp"
 #include "ocl_common.hpp"
+#include "ocl_memory.hpp"
 
 #include <cassert>
 #include <string>
@@ -366,6 +367,10 @@ event::ptr ocl_stream::create_base_event() {
     return std::make_shared<ocl_event>(ret_ev, ++_queue_counter);
 }
 
+std::unique_ptr<surfaces_lock> ocl_stream::create_surfaces_lock(const std::vector<memory::ptr> &mem) const {
+    return std::unique_ptr<ocl::ocl_surfaces_lock>(new ocl::ocl_surfaces_lock(mem, *this));
+}
+
 void ocl_stream::flush() const {
     try {
         get_cl_queue().flush();
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp
index b9c51ccb046508..c86089bce08b46 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp
@@ -47,6 +47,7 @@ class ocl_stream : public stream {
     void enqueue_barrier() override;
     event::ptr create_user_event(bool set) override;
     event::ptr create_base_event() override;
+    std::unique_ptr<surfaces_lock> create_surfaces_lock(const std::vector<memory::ptr> &mem) const override;
 
     const cl::UsmHelper& get_usm_helper() const { return _engine.get_usm_helper(); }
 
diff --git a/src/plugins/intel_gpu/src/runtime/stream.cpp b/src/plugins/intel_gpu/src/runtime/stream.cpp
index 913d84d8f476f5..2bd8a74f857a4d 100644
--- a/src/plugins/intel_gpu/src/runtime/stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/stream.cpp
@@ -4,21 +4,10 @@
 
 #include "intel_gpu/runtime/stream.hpp"
 
-#include "ocl/ocl_stream.hpp"
-
 #include <stdexcept>
 
 namespace cldnn {
 
-QueueTypes stream::detect_queue_type(engine_types engine_type, void* queue_handle) {
-    switch (engine_type) {
-        case engine_types::sycl:
-        case engine_types::ocl:
-            return ocl::ocl_stream::detect_queue_type(queue_handle);
-        default: throw std::runtime_error("Invalid engine type");
-    }
-}
-
 SyncMethods stream::get_expected_sync_method(const ExecutionConfig& config) {
     auto profiling = config.get_enable_profiling();
     auto queue_type = config.get_queue_type();
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp
index 40c944a1ca7512..abd2946a2d8e56 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp
@@ -10,7 +10,7 @@
 namespace cldnn {
 namespace ze {
 
-// Factory for ze_engine creation. It's moved outside of ze_engine class to avoid possible CL includes conflict
+// Factory for ze_engine creation. It's moved outside of ze_engine class to avoid possible L0 includes conflict
 // between different engines in engine.cpp file
 std::shared_ptr<cldnn::engine> create_ze_engine(const device::ptr device, runtime_types runtime_type);
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp
index 680753d6409084..9fadd2c6f1a06b 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp
@@ -26,7 +26,7 @@ struct ze_events : public ze_base_event {
     }
 
     std::optional<ze_kernel_timestamp_result_t> query_timestamp() override { return std::nullopt; }
-    ze_event_handle_t get_handle() const { return m_last_event; }
+    ze_event_handle_t get_handle() const override { return m_last_event; }
     bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
 
 protected:
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index ac081e262383a9..920c0ac1aa0c83 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -334,8 +334,13 @@ event::ptr ze_stream::create_base_event() {
     return m_ev_factory->create_event(++m_queue_counter);
 }
 
+std::unique_ptr<surfaces_lock> ze_stream::create_surfaces_lock(const std::vector<memory::ptr> &mem) const {
+    // Level Zero egnine currently does not support surfaces lock
+    return nullptr;
+}
+
 void ze_stream::flush() const {
-    //Immediate Command List submits commands immediately - no flush impl
+    // Immediate Command List submits commands immediately - no flush impl
 }
 
 void ze_stream::finish() const {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
index 03e9c12a578a9a..06647fda325304 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp
@@ -48,6 +48,7 @@ class ze_stream : public stream {
     void enqueue_barrier() override;
     event::ptr create_user_event(bool set) override;
     event::ptr create_base_event() override;
+    std::unique_ptr<surfaces_lock> create_surfaces_lock(const std::vector<memory::ptr> &mem) const override;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     dnnl::stream& get_onednn_stream() override;

From 4ec431e29e25573f2cc5654301e5496ca0f83f1c Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 12 Nov 2025 09:58:48 +0000
Subject: [PATCH 47/74] Fix OCL program build

---
 src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp
index 7ded6cf5b8e837..97b624e4915b47 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp
@@ -51,7 +51,7 @@ class ocl_kernel_builder : public kernel_builder{
                 OPENVINO_THROW("[GPU] Failed to create program during kernel build process");
             }
             cl::Program program(program_handle);
-            if (program.build(m_device.get_device(), options.c_str()) != CL_SUCCESS) {
+            if (program.build({m_device.get_device()}, options.c_str()) != CL_SUCCESS) {
                 GPU_DEBUG_INFO << "-------- Kernel build error" << std::endl;
                 auto log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>();
                 for (auto &e : log) {

From 3e6b368b585c1087a23b1c1db75e12fae5245831 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Mon, 17 Nov 2025 15:33:16 +0000
Subject: [PATCH 48/74] Add gfx_ver parsing for L0

---
 .../intel_gpu/src/runtime/ze/ze_device.cpp    | 32 +++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index ad37f85a4c440b..ef09a71950d266 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -55,6 +55,32 @@ gpu_arch convert_ngen_arch(ngen::HW gpu_arch) {
 }
 #endif
 
+gfx_version parse_version(uint32_t gmdid) {
+    union GMDID {
+        uint32_t value;
+        struct {
+            uint32_t revision : 6;
+            uint32_t reserved : 8;
+            uint32_t release : 8;
+            uint32_t architecture : 10;
+        };
+    };
+
+    GMDID gmd_id = {gmdid};
+    if (gmd_id.architecture > 0 && gmd_id.architecture < 100) {
+        // New format
+        return { static_cast<uint16_t>(gmd_id.architecture), static_cast<uint8_t>(gmd_id.release), static_cast<uint8_t>(gmd_id.revision)};
+    } else {
+        // Old format
+        uint32_t ver = gmdid;
+        uint16_t major = ver >> 16;
+        uint8_t minor = (ver >> 8) & 0xFF;
+        uint8_t revision = ver & 0xFF;
+
+        return {major, minor, revision};
+    }
+}
+
 bool supports_extension(const std::vector<ze_driver_extension_properties_t>& extensions, const std::string& ext_name, uint32_t ext_ver) {
     return std::find_if(extensions.begin(), extensions.end(), [&ext_name, &ext_ver](const ze_driver_extension_properties_t& ep) {
         return std::string(ep.name) == ext_name && ep.version == ext_ver;
@@ -190,7 +216,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities;
 
     // FIXME: Could not find how to retrieve those from L0
-    info.gfx_ver = {0, 0, 0};
     info.supports_work_group_collective_functions = false;
     info.supports_intel_planar_yuv = false;
     info.supports_khr_subgroups = true;
@@ -200,7 +225,10 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.supports_intel_required_subgroup_size = true;
     info.supports_queue_families = true;
 
-    info.ip_version = ip_version_properties.ipVersion;
+    if (supports_ip_version) {
+        info.ip_version = ip_version_properties.ipVersion;
+        info.gfx_ver = parse_version(ip_version_properties.ipVersion);
+    }
     info.sub_device_idx = (std::numeric_limits<uint32_t>::max)();
 
     info.device_id = device_properties.deviceId;

From cfdbb0263b064dcdff1d24f33f730971c1960a6e Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Tue, 18 Nov 2025 16:38:43 +0000
Subject: [PATCH 49/74] Skip L0 symbol table kernel

---
 src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
index 4944942ab6c201..3a726e24930d24 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
@@ -28,12 +28,17 @@ class ze_kernel : public kernel {
         ze_kernel_flags_t flags = 0;
         ze_kernel_desc_t kernel_desc = {
             ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, flags, nullptr};
-        for (auto name : kernel_names) {
-            kernel_desc.pKernelName = name;
+        for (auto name_cstr : kernel_names) {
+            auto name = std::string(name_cstr);
+            // L0 returns Intel_Symbol_Table_Void_Program that does not correspond to actual kernel
+            if (name == "Intel_Symbol_Table_Void_Program") {
+                continue;
+            }
+            kernel_desc.pKernelName = name_cstr;
             ze_kernel_handle_t kernel_handle;
             OV_ZE_EXPECT(zeKernelCreate(module_handle, &kernel_desc, &kernel_handle));
             auto kernel_holder = std::make_shared<ze_kernel_holder>(kernel_handle, module);
-            out.push_back(std::make_shared<ze_kernel>(kernel_holder, std::string(name)));
+            out.push_back(std::make_shared<ze_kernel>(kernel_holder, name));
         }
     }
 

From 04b086a79ad4bbe03ecb15c7e74663455041079b Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 20 Nov 2025 13:44:53 +0000
Subject: [PATCH 50/74] Avoid cl_mem usage with L0 engine

---
 .../include/intel_gpu/runtime/memory_caps.hpp | 13 ++++++++
 .../intel_gpu/src/plugin/remote_tensor.cpp    |  9 ++++-
 .../intel_gpu/src/runtime/ze/ze_engine.cpp    |  4 +--
 .../intel_gpu/src/runtime/ze/ze_memory.hpp    | 33 ++++++++-----------
 4 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp
index 78d4b99b32f99a..4d04792bae0bb0 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp
@@ -81,6 +81,19 @@ enum class shared_mem_type {
     shared_mem_usm
 };
 
+inline std::ostream& operator<<(std::ostream& out, const shared_mem_type& mem_type) {
+    switch (mem_type) {
+        case shared_mem_type::shared_mem_empty:     out << "shared_mem_empty"; break;
+        case shared_mem_type::shared_mem_buffer:    out << "shared_mem_buffer"; break;
+        case shared_mem_type::shared_mem_image:     out << "shared_mem_image"; break;
+        case shared_mem_type::shared_mem_vasurface: out << "shared_mem_vasurface"; break;
+        case shared_mem_type::shared_mem_dxbuffer:  out << "shared_mem_dxbuffer"; break;
+        case shared_mem_type::shared_mem_usm:       out << "shared_mem_usm"; break;
+        default: out << "unknown"; break;
+    }
+    return out;
+}
+
 using shared_handle = void*;
 using shared_surface = uint32_t;
 
diff --git a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp
index c2524c8ab3d1a9..b62a95f5f2bd62 100644
--- a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp
+++ b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp
@@ -317,7 +317,14 @@ void RemoteTensorImpl::allocate() {
 
     switch (m_mem_type) {
     case TensorType::BT_BUF_INTERNAL: {
-        m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem, reset);
+        // BT_BUF_INTERNAL should map to cl_mem however L0 engine can not allocate cl_mem
+        if (engine.supports_allocation(cldnn::allocation_type::cl_mem)) {
+            m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem, reset);
+        } else {
+            // Fall back to usm_host and override memory type
+            m_mem_type = TensorType::BT_USM_HOST_INTERNAL;
+            m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_host, reset);
+        }
         break;
     }
     case TensorType::BT_USM_HOST_INTERNAL: {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
index 45c453009454e9..6b64c617560ce1 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -153,7 +153,7 @@ memory::ptr ze_engine::reinterpret_buffer(const memory& memory, const layout& ne
                                      memory.get_mem_tracker());
     }
 
-    return nullptr;
+    OPENVINO_THROW("[GPU] Trying to reinterpret non usm buffer");
 }
 
 memory::ptr ze_engine::reinterpret_handle(const layout& new_layout, shared_mem_params params) {
@@ -167,7 +167,7 @@ memory::ptr ze_engine::reinterpret_handle(const layout& new_layout, shared_mem_p
                             ") than specified layout (", requested_mem_size, ")");
         return std::make_shared<ze::gpu_usm>(this, new_layout, usm_buffer, nullptr);
     } else {
-        return nullptr;
+        OPENVINO_THROW("[GPU] Unsupported shared memory type: ", params.mem_type);
     }
 }
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
index 9089e8a711758b..c76d6c46cc31fe 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp
@@ -26,16 +26,19 @@ struct lockable_gpu_mem {
 
 class UsmHolder {
 public:
-    UsmHolder(ze_context_handle_t context, void* ptr, bool shared_memory = false) : _context(context), _ptr(ptr), _shared_memory(shared_memory) { }
+    UsmHolder(ze_context_handle_t context, void* ptr, bool shared_memory = false) : _context(context), _ptr(ptr), _shared_memory(shared_memory) {
+        if (ptr == nullptr)
+            OPENVINO_THROW("[GPU] Can not create UsmHolder with nullptr");
+    }
+    UsmHolder(const UsmHolder&) = delete;
+    UsmHolder& operator=(const UsmHolder&) = delete;
+
     void* ptr() { return _ptr; }
     void memFree() {
-        try {
-            if (!_shared_memory)
-                zeMemFree(_context, _ptr);
-        } catch (...) {
-            // Exception may happen only when clMemFreeINTEL function is unavailable, thus can't free memory properly
+        if (!_shared_memory && _ptr != nullptr) {
+            OV_ZE_WARN(zeMemFree(_context, _ptr));
+            _ptr = nullptr;
         }
-        _ptr = nullptr;
     }
 
     ~UsmHolder() {
@@ -58,7 +61,6 @@ class UsmMemory {
         , _device(device)
         , _usm_pointer(std::make_shared<UsmHolder>(_context, reinterpret_cast<uint8_t*>(usm_ptr) + offset, true)) {}
 
-    // Get methods returns original pointer allocated by openCL.
     void* get() const { return _usm_pointer->ptr(); }
 
     void allocateHost(size_t size) {
@@ -69,7 +71,7 @@ class UsmMemory {
 
         void* memory = nullptr;
         OV_ZE_EXPECT(zeMemAllocHost(_context, &host_desc, size, 1, &memory));
-        _allocate(memory);
+        _usm_pointer = std::make_shared<UsmHolder>(_context, memory);
     }
 
     void allocateShared(size_t size, uint32_t ordinal) {
@@ -86,7 +88,7 @@ class UsmMemory {
 
         void* memory = nullptr;
         OV_ZE_EXPECT(zeMemAllocShared(_context, &device_desc, &host_desc, size, 1, _device, &memory));
-        _allocate(memory);
+        _usm_pointer = std::make_shared<UsmHolder>(_context, memory);
     }
 
     void allocateDevice(size_t size, uint32_t ordinal) {
@@ -98,12 +100,12 @@ class UsmMemory {
 
         void* memory = nullptr;
         OV_ZE_EXPECT(zeMemAllocDevice(_context, &device_desc, size, 4096, _device, &memory));
-        _allocate(memory);
+        _usm_pointer = std::make_shared<UsmHolder>(_context, memory);
     }
 
     void freeMem() {
         if (!_usm_pointer)
-            throw std::runtime_error("[CL ext] Can not free memory of empty UsmHolder");
+            OPENVINO_THROW("[GPU] Can not free memory of empty UsmHolder");
         _usm_pointer->memFree();
     }
 
@@ -113,13 +115,6 @@ class UsmMemory {
     ze_context_handle_t _context;
     ze_device_handle_t _device;
     std::shared_ptr<UsmHolder> _usm_pointer = nullptr;
-
-private:
-    void _allocate(void* ptr) {
-        if (!ptr)
-            throw std::runtime_error("[CL ext] Can not allocate nullptr for USM type.");
-        _usm_pointer = std::make_shared<UsmHolder>(_context, ptr);
-    }
 };
 
 struct gpu_usm : public lockable_gpu_mem, public memory {

From 6c6664d6f0cbccb01a50988749e2eea12ca97dfa Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Mon, 24 Nov 2025 21:40:45 +0100
Subject: [PATCH 51/74] Fix windows build

---
 src/plugins/intel_gpu/CMakeLists.txt                        | 2 +-
 .../intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp    | 4 ++++
 src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp      | 1 -
 src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt    | 1 -
 src/plugins/intel_gpu/thirdparty/CMakeLists.txt             | 6 ++++++
 src/plugins/intel_gpu/thirdparty/l0_onednn_gpu              | 2 +-
 6 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt
index 531047df7dbf10..38e3dafbc3305f 100644
--- a/src/plugins/intel_gpu/CMakeLists.txt
+++ b/src/plugins/intel_gpu/CMakeLists.txt
@@ -77,7 +77,7 @@ ov_add_plugin(NAME ${TARGET_NAME}
 target_compile_options(${TARGET_NAME} PRIVATE
     $<$<CONFIG:Release>:$<IF:$<CXX_COMPILER_ID:MSVC>,/Os,-Os>>)
 
-target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL openvino_intel_gpu_graph openvino::pugixml)
+target_link_libraries(${TARGET_NAME} PRIVATE openvino_intel_gpu_graph openvino::pugixml)
 
 target_include_directories(${TARGET_NAME} PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/include/)
diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp
index 66b16345984bc9..9d8bd8aa410e00 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp
@@ -8,11 +8,15 @@
 # define NOMINMAX
 #endif
 
+
+// Do not include DirectX / VA wrappers when running with L0 runtime as they depend on OCL
+#ifndef OV_GPU_WITH_ZE_RT
 #ifdef _WIN32
 # include <openvino/runtime/intel_gpu/ocl/dx.hpp>
 #else
 # include <openvino/runtime/intel_gpu/ocl/va.hpp>
 #endif
+#endif
 #include "openvino/runtime/iremote_tensor.hpp"
 
 #include "intel_gpu/runtime/memory_caps.hpp"
diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp
index 4420f00deae4c9..cdb486c13cbb9d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp
@@ -5,7 +5,6 @@
 #include "utils.hpp"
 #include <oneapi/dnnl/dnnl_debug.h>
 #include <numeric>
-#include <oneapi/dnnl/dnnl_ocl.hpp>
 
 namespace cldnn {
 namespace onednn {
diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
index fe8582b1a0449c..76652d40e77c1d 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
+++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
@@ -75,7 +75,6 @@ if(COMMAND add_cpplint_target)
 endif()
 
 target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE openvino::runtime::dev)
-ov_gpu_set_runtime_interface_for(${TARGET_NAME})
 
 target_include_directories(${TARGET_NAME} PRIVATE $<TARGET_PROPERTY:rapidjson,INTERFACE_INCLUDE_DIRECTORIES>)
 
diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
index d876d52f97baed..916559e401be4c 100644
--- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
@@ -166,6 +166,12 @@ if(ENABLE_ONEDNN_FOR_GPU)
                              "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/config"
                              "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include"
                              "${ONEDNN_GPU_DIR}/third_party/ngen")
+        if(GPU_RUNTIME STREQUAL "L0")
+            # OneDNN LevelZero headers can be found at ${ONEDNN_GPU_DIR}\third_party\level_zero\
+            # However adding ${ONEDNN_GPU_DIR}\third_party to include dirs will clash with other dependencies like gtest
+            # Workaround: Use OpenVINO LevelZero target instead
+            list(APPEND LIB_INCLUDE_DIRS $<TARGET_PROPERTY:LevelZero::LevelZero,INTERFACE_INCLUDE_DIRECTORIES>)
+        endif()
         set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU
                             DNNL_DLL
                             DNNL_DLL_EXPORTS
diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
index bbba69f0a58439..5d3bb07d57b485 160000
--- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
@@ -1 +1 @@
-Subproject commit bbba69f0a584391d0ab25b548ec76d8c62aa11f5
+Subproject commit 5d3bb07d57b4853d3d9643a11117a4d7c5a380f2

From a3eac100163d7d98d879630cb640a08b75319096 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 27 Nov 2025 15:48:48 +0000
Subject: [PATCH 52/74] Remove unnecessary vector during L0 mem fill

---
 src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
index 54482661e52705..d86d1fe3db8fc2 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
@@ -143,17 +143,16 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vecto
     auto& _ze_stream = downcast<ze_stream>(stream);
     auto ev = _ze_stream.create_base_event();
     auto ev_ze = downcast<ze::ze_base_event>(ev.get())->get_handle();
-    std::vector<unsigned char> temp_buffer(_bytes_count, pattern);
     auto ze_dep_events = get_ze_events(dep_events);
     OV_ZE_EXPECT(zeCommandListAppendMemoryFill(_ze_stream.get_queue(),
         _buffer.get(),
-        temp_buffer.data(),
-        1,
+        &pattern,
+        sizeof(unsigned char),
         _bytes_count,
         ev_ze,
         ze_dep_events.size(),
         ze_dep_events.data()));
-
+    // FIXME: when not blocking pattern goes out of scope
     if (blocking) {
         ev->wait();
     }

From c76cdb2a741f6632237203f519ec0057729d67fb Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 28 Nov 2025 12:07:09 +0000
Subject: [PATCH 53/74] Fix memory

---
 .../intel_gpu/include/intel_gpu/runtime/memory.hpp       | 1 -
 src/plugins/intel_gpu/src/runtime/memory.cpp             | 9 ---------
 2 files changed, 10 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
index cdb15dbace42d7..9cd00b1c3065e5 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
@@ -224,7 +224,6 @@ struct surfaces_lock {
     surfaces_lock(const surfaces_lock& other) = delete;
     surfaces_lock& operator=(const surfaces_lock& other) = delete;
 
-    static std::unique_ptr<surfaces_lock> create(engine_types engine_type, std::vector<memory::ptr> mem, const stream& stream);
     static bool is_lock_needed(const shared_mem_type& mem_type);
 };
 
diff --git a/src/plugins/intel_gpu/src/runtime/memory.cpp b/src/plugins/intel_gpu/src/runtime/memory.cpp
index 9718a9b0e35d24..46904030b6ee52 100644
--- a/src/plugins/intel_gpu/src/runtime/memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/memory.cpp
@@ -43,15 +43,6 @@ memory::memory(engine* engine, const layout& layout, allocation_type type, std::
     : _engine(engine), _layout(layout), _bytes_count(_layout.bytes_count()), m_mem_tracker(mem_tracker), _type(type) {
 }
 
-std::unique_ptr<surfaces_lock> surfaces_lock::create(engine_types engine_type, std::vector<memory::ptr> mem, const stream& stream) {
-    switch (engine_type) {
-    case engine_types::sycl:
-    case engine_types::ocl:
-        return std::unique_ptr<ocl::ocl_surfaces_lock>(new ocl::ocl_surfaces_lock(mem, stream));
-    default: throw std::runtime_error("Unsupported engine type in surfaces_lock::create");
-    }
-}
-
 bool surfaces_lock::is_lock_needed(const shared_mem_type& mem_type) {
     return mem_type == shared_mem_type::shared_mem_vasurface ||
            mem_type == shared_mem_type::shared_mem_dxbuffer ||

From 6473aa0684ad7bf869b264212f3b9c4f1830a47e Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 28 Nov 2025 14:08:53 +0000
Subject: [PATCH 54/74] Set runtime interface for kernel_selector

---
 src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
index 196f56acd4cc4a..fb82eea70f9d23 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
+++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
@@ -71,6 +71,7 @@ if (ENABLE_ONEDNN_FOR_GPU)
 endif()
 
 target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE openvino::runtime::dev)
+ov_gpu_set_runtime_interface_for(${TARGET_NAME})
 
 target_include_directories(${TARGET_NAME} PRIVATE $<TARGET_PROPERTY:rapidjson,INTERFACE_INCLUDE_DIRECTORIES>)
 

From 93bb04b43d62bd788c47ba62f9eb4bf89710f5cd Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Mon, 1 Dec 2025 17:15:08 +0100
Subject: [PATCH 55/74] Remove onednn ocl include

---
 .../src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp          | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
index 34c9f2a51f29c1..19fed9fec49402 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
@@ -7,7 +7,6 @@
 #ifdef ENABLE_ONEDNN_FOR_GPU
 #    include <initializer_list>
 #    include <oneapi/dnnl/dnnl.hpp>
-#    include <oneapi/dnnl/dnnl_ocl.hpp>
 #    include <sstream>
 #    include <string_view>
 #    include <tuple>

From c621131e8a128ab7847b5128eaeb4b402a673176 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Tue, 2 Dec 2025 12:52:44 +0000
Subject: [PATCH 56/74] Fix backend compilation and remote tensor

---
 src/plugins/intel_gpu/src/graph/CMakeLists.txt       | 2 ++
 src/plugins/intel_gpu/src/plugin/remote_tensor.cpp   | 1 +
 src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 1 +
 src/plugins/intel_gpu/thirdparty/CMakeLists.txt      | 6 ------
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/CMakeLists.txt b/src/plugins/intel_gpu/src/graph/CMakeLists.txt
index 3cbc144a296238..9abdd2705f4abd 100644
--- a/src/plugins/intel_gpu/src/graph/CMakeLists.txt
+++ b/src/plugins/intel_gpu/src/graph/CMakeLists.txt
@@ -84,6 +84,8 @@ macro(ov_gpu_add_backend_target)
         target_include_directories(${ARG_NAME} SYSTEM BEFORE PRIVATE $<TARGET_PROPERTY:onednn_gpu_tgt,INTERFACE_INCLUDE_DIRECTORIES>)
         add_dependencies(openvino_intel_gpu_${IMPL_TYPE}_obj onednn_gpu_tgt)
     endif()
+    # Onednn headers use OCL/L0 headers
+    ov_gpu_set_runtime_interface_for(openvino_intel_gpu_${IMPL_TYPE}_obj)
 endmacro()
 
 set(CODEGEN_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/common_utils/kernels_db_gen.py")
diff --git a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp
index b62a95f5f2bd62..9faecbbb6842e4 100644
--- a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp
+++ b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp
@@ -322,6 +322,7 @@ void RemoteTensorImpl::allocate() {
             m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem, reset);
         } else {
             // Fall back to usm_host and override memory type
+            GPU_DEBUG_COUT << "[Warning] [GPU] Could not allocate cl_mem, using usm_host allocation instead\n";
             m_mem_type = TensorType::BT_USM_HOST_INTERNAL;
             m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_host, reset);
         }
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
index c7e1498a2e3d30..fcb6004458e940 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
@@ -386,6 +386,7 @@ bool does_device_support(int32_t param, const cl::Device& device) {
 
 memory_capabilities init_memory_caps(const cl::Device& device, const device_info& info) {
     std::vector<allocation_type> memory_caps;
+    memory_caps.push_back(allocation_type::cl_mem);
     if (info.supports_usm) {
         if (does_device_support(CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL, device)) {
             memory_caps.push_back(allocation_type::usm_host);
diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
index 916559e401be4c..d876d52f97baed 100644
--- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
+++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt
@@ -166,12 +166,6 @@ if(ENABLE_ONEDNN_FOR_GPU)
                              "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/config"
                              "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include"
                              "${ONEDNN_GPU_DIR}/third_party/ngen")
-        if(GPU_RUNTIME STREQUAL "L0")
-            # OneDNN LevelZero headers can be found at ${ONEDNN_GPU_DIR}\third_party\level_zero\
-            # However adding ${ONEDNN_GPU_DIR}\third_party to include dirs will clash with other dependencies like gtest
-            # Workaround: Use OpenVINO LevelZero target instead
-            list(APPEND LIB_INCLUDE_DIRS $<TARGET_PROPERTY:LevelZero::LevelZero,INTERFACE_INCLUDE_DIRECTORIES>)
-        endif()
         set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU
                             DNNL_DLL
                             DNNL_DLL_EXPORTS

From 8c9e85581cf09bba37011cf9d31f64d97c55ae09 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 7 Jan 2026 14:47:39 +0000
Subject: [PATCH 57/74] Rename counter based events

---
 .../include/intel_gpu/runtime/device_info.hpp    |  2 +-
 .../intel_gpu/src/runtime/ocl/ocl_device.cpp     |  2 +-
 ...e_cb_event.cpp => ze_counter_based_event.cpp} | 16 ++++++++--------
 ...e_cb_event.hpp => ze_counter_based_event.hpp} | 13 ++++++-------
 ...ry.cpp => ze_counter_based_event_factory.cpp} | 10 +++++-----
 ...ry.hpp => ze_counter_based_event_factory.hpp} |  4 ++--
 .../intel_gpu/src/runtime/ze/ze_device.cpp       |  4 ++--
 .../intel_gpu/src/runtime/ze/ze_stream.cpp       |  6 +++---
 8 files changed, 28 insertions(+), 29 deletions(-)
 rename src/plugins/intel_gpu/src/runtime/ze/{ze_cb_event.cpp => ze_counter_based_event.cpp} (78%)
 rename src/plugins/intel_gpu/src/runtime/ze/{ze_cb_event.hpp => ze_counter_based_event.hpp} (73%)
 rename src/plugins/intel_gpu/src/runtime/ze/{ze_cb_event_factory.cpp => ze_counter_based_event_factory.cpp} (75%)
 rename src/plugins/intel_gpu/src/runtime/ze/{ze_cb_event_factory.hpp => ze_counter_based_event_factory.hpp} (70%)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
index 6b6c0d3caf48f0..94e6ff2605a100 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
@@ -122,7 +122,7 @@ struct device_info {
     bool has_separate_cache;                    ///< Does the target hardware has separate cache for usm_device and usm_host
 
     bool supports_cp_offload;                   ///< [L0] Does the command queue support copy offload
-    bool supports_cb_events;                    ///< [L0] Does the target runtime support counter based events
+    bool supports_counter_based_events;                    ///< [L0] Does the target runtime support counter based events
 
     std::vector<size_t> supported_simd_sizes;   ///< List of SIMD sizes supported by current device and compiler
 
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
index fcb6004458e940..d935b6ad900e54 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
@@ -352,7 +352,7 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex
     info.compute_queue_group_ordinal = 0;
     info.device_memory_ordinal = 0;
     info.supports_cp_offload = false;
-    info.supports_cb_events = false;
+    info.supports_counter_based_events = false;
 
 #if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT)
     using namespace dnnl::impl::gpu::intel::jit;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp
similarity index 78%
rename from src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp
rename to src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp
index 3746c92fe00bbc..0ca4b0b243d2e0 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include "ze_cb_event.hpp"
+#include "ze_counter_based_event.hpp"
 #include "ze/ze_common.hpp"
 
 #include <cassert>
@@ -12,15 +12,15 @@
 using namespace cldnn;
 using namespace ze;
 
-void ze_cb_event::wait_impl() {
+void ze_counter_based_event::wait_impl() {
     OV_ZE_EXPECT(zeEventHostSynchronize(m_event, default_timeout));
 }
 
-void ze_cb_event::set_impl() {
+void ze_counter_based_event::set_impl() {
     // Counter based events start in signaled state and can not be signaled from host
 }
 
-bool ze_cb_event::is_set_impl() {
+bool ze_counter_based_event::is_set_impl() {
     auto ret = zeEventQueryStatus(m_event);
     switch (ret) {
     case ZE_RESULT_SUCCESS:
@@ -35,11 +35,11 @@ bool ze_cb_event::is_set_impl() {
     }
 }
 
-ze_event_handle_t ze_cb_event::get_handle() const {
+ze_event_handle_t ze_counter_based_event::get_handle() const {
     return m_event;
 }
 
-std::optional<ze_kernel_timestamp_result_t> ze_cb_event::query_timestamp() {
+std::optional<ze_kernel_timestamp_result_t> ze_counter_based_event::query_timestamp() {
     if (!m_factory.is_profiling_enabled()) {
         return std::nullopt;
     }
@@ -48,7 +48,7 @@ std::optional<ze_kernel_timestamp_result_t> ze_cb_event::query_timestamp() {
     return timestamp;
 }
 
-bool ze_cb_event::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
+bool ze_counter_based_event::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
     auto opt_timestamp = query_timestamp();
     if (!opt_timestamp.has_value()) {
         return true;
@@ -67,6 +67,6 @@ bool ze_cb_event::get_profiling_info_impl(std::list<instrumentation::profiling_i
     return true;
 }
 
-ze_cb_event::~ze_cb_event() {
+ze_counter_based_event::~ze_counter_based_event() {
     OV_ZE_WARN(zeEventDestroy(m_event));
 }
\ No newline at end of file
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.hpp
similarity index 73%
rename from src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp
rename to src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.hpp
index 08cc5ea34b8d9c..8c8131853c0b69 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.hpp
@@ -13,12 +13,12 @@ namespace ze {
 // L0 counter based event.
 // Signaled state is inferred from the number of tasks completed on device.
 // Resetting counter based event is not allowed.
-// Signaling counter based event from host is not allowed.
+// Start in signaled state and signaling from host is not allowed.
 // Can only be used with in-order command lists.
-struct ze_cb_event : public ze_base_event {
+struct ze_counter_based_event : public ze_base_event {
 public:
     // Take ownership of counter based event handle
-    ze_cb_event(uint64_t queue_stamp, const ze_base_event_factory& factory, ze_event_handle_t ev)
+    ze_counter_based_event(uint64_t queue_stamp, const ze_base_event_factory& factory, ze_event_handle_t ev)
     : ze_base_event(queue_stamp)
     , m_factory(factory)
     , m_event(ev) {
@@ -27,10 +27,9 @@ struct ze_cb_event : public ze_base_event {
             OPENVINO_THROW("[GPU] Trying to create event with null handle");
         }
     }
-    ze_cb_event(const ze_cb_event&) = delete;
-    ze_cb_event& operator=(const ze_cb_event&) = delete;
-    ~ze_cb_event();
-
+    ze_counter_based_event(const ze_counter_based_event&) = delete;
+    ze_counter_based_event& operator=(const ze_counter_based_event&) = delete;
+    ~ze_counter_based_event();
     void wait_impl() override;
     void set_impl() override;
     bool is_set_impl() override;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp
similarity index 75%
rename from src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp
rename to src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp
index 1a7eaa4445a6fe..9e50d3bcc762b9 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp
@@ -2,9 +2,9 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include "ze_cb_event_factory.hpp"
+#include "ze_counter_based_event_factory.hpp"
 #include "ze_common.hpp"
-#include "ze_cb_event.hpp"
+#include "ze_counter_based_event.hpp"
 
 #include "zex_event.h"
 
@@ -19,20 +19,20 @@ namespace {
     }
 }
 
-ze_cb_event_factory::ze_cb_event_factory(const ze_engine &engine, bool enable_profiling)
+ze_counter_based_event_factory::ze_counter_based_event_factory(const ze_engine &engine, bool enable_profiling)
     : ze_base_event_factory(engine, enable_profiling) {
     if (func_zexCounterBasedEventCreate2 == nullptr) {
         find_function_address(engine.get_driver());
     }
 }
 
-event::ptr ze_cb_event_factory::create_event(uint64_t queue_stamp) {
+event::ptr ze_counter_based_event_factory::create_event(uint64_t queue_stamp) {
     ze_event_handle_t event;
     auto desc = defaultIntelCounterBasedEventDesc;
     if (is_profiling_enabled()) {
         desc.flags |= ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP;
     }
     OV_ZE_EXPECT(func_zexCounterBasedEventCreate2(m_engine.get_context(), m_engine.get_device(), &desc, &event));
-    auto cb_event = std::make_shared<ze_cb_event>(queue_stamp, *this, event);
+    auto cb_event = std::make_shared<ze_counter_based_event>(queue_stamp, *this, event);
     return cb_event;
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp
similarity index 70%
rename from src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp
rename to src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp
index 0c6bceaacbd57b..90e7af9562628a 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp
@@ -11,9 +11,9 @@ namespace ze {
 
 // Interface for creating l0 counter based events
 // Should only be used with in-order queue
-struct ze_cb_event_factory : public ze_base_event_factory {
+struct ze_counter_based_event_factory : public ze_base_event_factory {
 public:
-    ze_cb_event_factory(const ze_engine &engine, bool enable_profiling);
+    ze_counter_based_event_factory(const ze_engine &engine, bool enable_profiling);
     event::ptr create_event(uint64_t queue_stamp) override;
 };
 }  // namespace ze
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
index ef09a71950d266..e6f965d6416929 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp
@@ -103,7 +103,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0);
     bool supports_mutable_list = supports_extension(extensions, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, ZE_MUTABLE_COMMAND_LIST_EXP_VERSION_1_0);
     bool supports_pci_properties = supports_extension(extensions, ZE_PCI_PROPERTIES_EXT_NAME, ZE_PCI_PROPERTIES_EXT_VERSION_1_0);
-    bool supports_cb_events = supports_extension(extensions, ZEX_COUNTER_BASED_EVENT_EXT_NAME, ZEX_COUNTER_BASED_EVENT_VERSION_1_0);
+    bool supports_counter_based_events = supports_extension(extensions, ZEX_COUNTER_BASED_EVENT_EXT_NAME, ZEX_COUNTER_BASED_EVENT_VERSION_1_0);
     bool supports_cp_offload =
         supports_extension(extensions, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_1_0);
     bool supports_dp_properties =
@@ -208,7 +208,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic
     info.supports_fp16_denorms = info.supports_fp16 && (device_module_properties.fp16flags & ZE_DEVICE_FP_FLAG_DENORM) != 0;
 
     info.supports_cp_offload = supports_cp_offload;
-    info.supports_cb_events = supports_cb_events;
+    info.supports_counter_based_events = supports_counter_based_events;
 
     info.supports_imad = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_DP4A) != 0;
     info.supports_immad = supports_dp_properties && (dp_properties.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS) != 0;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index 920c0ac1aa0c83..98bc5358f8a417 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -9,7 +9,7 @@
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/runtime/properties.hpp"
 
-#include "ze_cb_event_factory.hpp"
+#include "ze_counter_based_event_factory.hpp"
 #include "ze_event_factory.hpp"
 #include "ze_events.hpp"
 #include "ze_empty_event.hpp"
@@ -213,8 +213,8 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     }
 
     OV_ZE_EXPECT(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
-    if (m_queue_type == QueueTypes::in_order && info.supports_cb_events) {
-        m_ev_factory = std::make_unique<ze_cb_event_factory>(engine, config.get_enable_profiling());
+    if (m_queue_type == QueueTypes::in_order && info.supports_counter_based_events) {
+        m_ev_factory = std::make_unique<ze_counter_based_event_factory>(engine, config.get_enable_profiling());
     } else {
         m_ev_factory = std::make_unique<ze_event_factory>(engine, config.get_enable_profiling());
     }

From d377c4523c6bdbbbf867edff090d33029de8ab8c Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 7 Jan 2026 14:58:44 +0000
Subject: [PATCH 58/74] Adjust event set_profiling_duration

---
 src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp   | 6 +++---
 .../src/graph/impls/onednn/primitive_onednn_base.h          | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp
index 5451cd47c7daf2..24465c4f2d0569 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp
@@ -11,7 +11,6 @@
 #include <utility>
 #include <utility>
 #include <functional>
-#include <optional>
 
 namespace cldnn {
 struct user_event;
@@ -33,8 +32,9 @@ struct event {
         _profiling_captured = false;
         _profiling_info.clear();
     }
-    // Set event profiling data instead of retrieving it from event object
-    void set_profiling(uint64_t duration_nsec) {
+    /// @brief Set event profiling data instead of retrieving it from event object
+    /// @param duration_nsec duration in nanoseconds
+    void set_profiling_duration(uint64_t duration_nsec) {
         auto stage = instrumentation::profiling_stage::executing;
         auto duration = std::chrono::nanoseconds(duration_nsec);
         auto period = std::make_shared<instrumentation::profiling_period_basic>(duration);
diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h
index d223062f2d7f00..33d87ea1c394db 100644
--- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h
+++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h
@@ -555,7 +555,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
                 if (!duration.empty()) {
                     OPENVINO_ASSERT(duration.size() == 1, "[GPU] oneDNN profiling data is expected to have info only for single primitive ",
                                                       "actual number is ", duration.size());
-                    event->set_profiling(duration[0]);
+                    event->set_profiling_duration(duration[0]);
                 }
 
             } else {

From ca7f6cf519260f25bdeafb7a9985e592de39962d Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 7 Jan 2026 16:03:23 +0000
Subject: [PATCH 59/74] Remove unnecessary engine parameter

---
 .../intel_gpu/src/graph/impls/ocl/custom_primitive.cpp        | 4 ++--
 src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp   | 2 +-
 src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp   | 2 +-
 .../intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp   | 4 ++--
 src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp  | 4 ++--
 .../intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp   | 4 ++--
 src/plugins/intel_gpu/src/graph/include/primitive_inst.h      | 2 +-
 src/plugins/intel_gpu/src/graph/program.cpp                   | 2 +-
 8 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp
index b83b594949a882..0f583a849fb754 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp
@@ -62,8 +62,8 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
 
-    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const engine& e) override {
-        _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0], e));
+    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
+        _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0]));
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
 
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
index 27aa2ce27446bd..a1f5c0f1ac1b63 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
@@ -381,7 +381,7 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co
     }
 }
 
-kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id, const engine& e) const {
+kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id) const {
     auto res = _cached_kernels.find(id);
     OPENVINO_ASSERT(_cached_kernels.end() != res, "[GPU] Kernel " + id + " not found in the cached kernel cache!");
 
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp
index ee65b68be0138c..3b6a5cf78032dd 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp
@@ -123,7 +123,7 @@ class kernels_cache {
                            uint32_t prog_id,
                            std::shared_ptr<ov::threading::ITaskExecutor> task_executor = nullptr,
                            const std::map<std::string, std::string>& batch_headers = {});
-    kernel::ptr get_kernel_from_cached_kernels(std::string id, const engine& e) const;
+    kernel::ptr get_kernel_from_cached_kernels(std::string id) const;
     std::vector<kernel::ptr> get_kernels(const kernel_impl_params& params) const;
 
     void set_kernels_reuse(bool reuse_kernels) { _reuse_kernels = reuse_kernels; }
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
index 3da69b2aa48bfc..71e8571467d322 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp
@@ -126,12 +126,12 @@ struct multi_stage_primitive : public typed_primitive_impl<PType> {
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
 
-    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const engine& e) override {
+    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
         _kernels.clear();
 
         _kernels.reserve(cached_kernel_ids.size());
         for (size_t k = 0; k < cached_kernel_ids.size(); ++k) {
-            _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k], e));
+            _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k]));
         }
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
index 9d93413d8016e8..d604d5a8a3467a 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -149,12 +149,12 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
 
-    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const engine& e) override {
+    void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
         _kernels.clear();
 
         _kernels.reserve(cached_kernel_ids.size());
         for (size_t k = 0; k < cached_kernel_ids.size(); ++k) {
-            _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k], e));
+            _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k]));
         }
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp
index 1f616ab80d9b08..f9756be883dc3a 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp
@@ -145,10 +145,10 @@ struct PrimitiveImplOCL : public cldnn::primitive_impl {
         }
     }
 
-    void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids, const cldnn::engine& e) override {
+    void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
         OPENVINO_ASSERT(cached_kernel_ids.size() == _order.size());
         for (size_t i = 0; i < cached_kernel_ids.size(); ++i) {
-            _stages[_order[i]]->kernel = kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[i], e);
+            _stages[_order[i]]->kernel = kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[i]);
         }
         this->can_share_kernels = kernels_cache.get_kernels_reuse();
     }
diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
index 249a78ba10d6ef..aeba603982476d 100644
--- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -79,7 +79,7 @@ struct primitive_impl {
     virtual bool is_cpu() const { return true; }
     virtual bool is_onednn() const { return false; }
     virtual void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) = 0;
-    virtual void init_by_cached_kernels(const kernels_cache&, std::vector<std::string>& cached_kernel_ids, const engine& e) {}
+    virtual void init_by_cached_kernels(const kernels_cache&, std::vector<std::string>& cached_kernel_ids) {}
     virtual std::vector<std::string> get_cached_kernel_ids(const kernels_cache&) { return {}; }
     virtual std::unique_ptr<primitive_impl> clone() const = 0;
     virtual std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() { return {}; }
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index a9699ac1ad9fd9..f9bf639a645cbc 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -2043,7 +2043,7 @@ void program::load(cldnn::BinaryInputBuffer& ib,
 
             std::vector<std::string> cached_kernel_ids;
             ib >> cached_kernel_ids;
-            p_node.selected_impl->init_by_cached_kernels(get_kernels_cache(), cached_kernel_ids, _engine);
+            p_node.selected_impl->init_by_cached_kernels(get_kernels_cache(), cached_kernel_ids);
         }
     }
 

From 674556cf2f28be9655d419a84426749a3dfd1eb4 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 8 Jan 2026 10:15:00 +0000
Subject: [PATCH 60/74] Update L0 OneDNN submodule

---
 src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
index 5d3bb07d57b485..867c4222ffd8f7 160000
--- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
@@ -1 +1 @@
-Subproject commit 5d3bb07d57b4853d3d9643a11117a4d7c5a380f2
+Subproject commit 867c4222ffd8f7fde75f2e3c9340fbcdca951c54

From 863ea10e74be425aaaa7ff35d5b7e210964ba651 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 8 Jan 2026 12:18:55 +0000
Subject: [PATCH 61/74] Update onednn L0 submodule and fix profiling

---
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 2 +-
 src/plugins/intel_gpu/thirdparty/l0_onednn_gpu     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index 98bc5358f8a417..76b3268ef995bb 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -397,7 +397,7 @@ dnnl::stream& ze_stream::get_onednn_stream() {
     OPENVINO_ASSERT(m_queue_type == QueueTypes::in_order, "[GPU] Can't create onednn stream handle as onednn doesn't support out-of-order queue");
     OPENVINO_ASSERT(_engine.get_device_info().vendor_id == INTEL_VENDOR_ID, "[GPU] Can't create onednn stream handle as for non-Intel devices");
     if (!_onednn_stream) {
-        _onednn_stream = std::make_shared<dnnl::stream>(dnnl::l0_interop::make_stream(_engine.get_onednn_engine(), m_command_list));
+        _onednn_stream = std::make_shared<dnnl::stream>(dnnl::l0_interop::make_stream(_engine.get_onednn_engine(), m_command_list, m_ev_factory->is_profiling_enabled()));
     }
 
     return *_onednn_stream;
diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
index 867c4222ffd8f7..e7663948687f11 160000
--- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
@@ -1 +1 @@
-Subproject commit 867c4222ffd8f7fde75f2e3c9340fbcdca951c54
+Subproject commit e7663948687f1192ad87444fb20e1f5210ccc1a8

From 0cdb6f5415b7ca324a25a2455bca2b9ef7eb8e2d Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 8 Jan 2026 15:07:18 +0000
Subject: [PATCH 62/74] Restore cpu checks in primitive_base

---
 .../intel_gpu/src/graph/impls/ocl/primitive_base.hpp   | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
index d604d5a8a3467a..8ff102848b0dd4 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -136,6 +136,9 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     }
 
     void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
+        if (is_cpu()) {
+            return;
+        }
         _kernels.clear();
         if (!_kernel_data.kernels.empty()) {
             auto compiled_kernels = kernels_cache.get_kernels(params);
@@ -150,6 +153,9 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     }
 
     void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector<std::string>& cached_kernel_ids) override {
+        if (is_cpu()) {
+            return;
+        }
         _kernels.clear();
 
         _kernels.reserve(cached_kernel_ids.size());
@@ -191,7 +197,7 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     }
 
     void set_arguments_impl(typed_primitive_inst<PType>& instance) override {
-        if (instance.can_be_optimized()) {
+        if (instance.can_be_optimized() || is_cpu()) {
             return;
         }
 
@@ -295,6 +301,8 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     }
 
     void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
+        if (is_cpu())
+            return;
         OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed.");
         auto& kernel_vec = kernels.begin()->second;
         _kernels.clear();

From 86c81b442bc1a7d9b2b0e4675d84d1ca29a22b87 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 8 Jan 2026 16:03:43 +0000
Subject: [PATCH 63/74] Reuse check_allocatable for ocl and l0

---
 .../include/intel_gpu/runtime/engine.hpp      |  2 +-
 src/plugins/intel_gpu/src/runtime/engine.cpp  | 45 +++++++++++++++++++
 .../intel_gpu/src/runtime/ocl/ocl_engine.cpp  | 45 -------------------
 .../intel_gpu/src/runtime/ocl/ocl_engine.hpp  |  1 -
 .../intel_gpu/src/runtime/ze/ze_engine.cpp    | 44 ------------------
 .../intel_gpu/src/runtime/ze/ze_engine.hpp    |  1 -
 6 files changed, 46 insertions(+), 92 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
index 3e61a6e428b4cd..a307eedff4a35b 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -84,7 +84,7 @@ class engine {
     /// Checks whether two memory objects represents the same physical memory
     virtual bool is_the_same_buffer(const memory& mem1, const memory& mem2) = 0;
 
-    virtual bool check_allocatable(const layout& layout, allocation_type type) = 0;
+    virtual bool check_allocatable(const layout& layout, allocation_type type);
 
     /// Returns basic allocation type which will be used as a fallback when allocation type is not specified or device doesn't support some features.
     virtual allocation_type get_default_allocation_type() const = 0;
diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp
index f35d4d577230f4..a4165e1f50ef45 100644
--- a/src/plugins/intel_gpu/src/runtime/engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/engine.cpp
@@ -294,4 +294,49 @@ std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type, runtime_
     return engine::create(engine_type, runtime_type, device);
 }
 
+bool engine::check_allocatable(const layout& layout, allocation_type type) {
+    OPENVINO_ASSERT(supports_allocation(type), "[GPU] Unsupported allocation type: ", type);
+
+    if (!get_enable_large_allocations()) {
+        bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size);
+
+        // When dynamic shape upper bound makes bigger buffer, then return false.
+        if (exceed_allocatable_mem_size && layout.is_dynamic()) {
+            OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
+            return false;
+        }
+
+        OPENVINO_ASSERT(!exceed_allocatable_mem_size,
+                        "[GPU] Exceeded max size of memory object allocation: ",
+                        "requested ", layout.bytes_count(), " bytes, "
+                        "but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.",
+                        "Please try to reduce batch size or use lower precision.");
+    }
+
+    auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host);
+    auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size());
+
+    // When dynamic shape upper bound makes bigger buffer, then return false.
+    if (exceed_available_mem_size && layout.is_dynamic()) {
+        OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
+        return false;
+    }
+
+#ifdef __unix__
+    // Prevent from being killed by Ooo Killer of Linux
+    OPENVINO_ASSERT(!exceed_available_mem_size,
+                    "[GPU] Exceeded max size of memory allocation: ",
+                    "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ",
+                    "but available memory size is ", get_max_memory_size(), " bytes");
+#else
+    if (exceed_available_mem_size) {
+        GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : "
+                       << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl;
+        GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl;
+    }
+#endif
+
+    return true;
+}
+
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
index b3a5c2266f165e..6ea05d592a5d7c 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -101,51 +101,6 @@ allocation_type ocl_engine::detect_usm_allocation_type(const void* memory) const
                                        : allocation_type::unknown;
 }
 
-bool ocl_engine::check_allocatable(const layout& layout, allocation_type type) {
-    OPENVINO_ASSERT(supports_allocation(type) || type == allocation_type::cl_mem, "[GPU] Unsupported allocation type: ", type);
-
-    if (!get_enable_large_allocations()) {
-        bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size);
-
-        // When dynamic shape upper bound makes bigger buffer, then return false.
-        if (exceed_allocatable_mem_size && layout.is_dynamic()) {
-            OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
-            return false;
-        }
-
-        OPENVINO_ASSERT(!exceed_allocatable_mem_size,
-                        "[GPU] Exceeded max size of memory object allocation: ",
-                        "requested ", layout.bytes_count(), " bytes, "
-                        "but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.",
-                        "Please try to reduce batch size or use lower precision.");
-    }
-
-    auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host);
-    auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size());
-
-    // When dynamic shape upper bound makes bigger buffer, then return false.
-    if (exceed_available_mem_size && layout.is_dynamic()) {
-        OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
-        return false;
-    }
-
-#ifdef __unix__
-    // Prevent from being killed by Ooo Killer of Linux
-    OPENVINO_ASSERT(!exceed_available_mem_size,
-                    "[GPU] Exceeded max size of memory allocation: ",
-                    "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ",
-                    "but available memory size is ", get_max_memory_size(), " bytes");
-#else
-    if (exceed_available_mem_size) {
-        GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : "
-                       << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl;
-        GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl;
-    }
-#endif
-
-    return true;
-}
-
 memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) {
     OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout");
 
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
index 84908c7addc969..e6f1b9010e2d51 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
@@ -29,7 +29,6 @@ class ocl_engine : public engine {
     memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) override;
     memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override;
     bool is_the_same_buffer(const memory& mem1, const memory& mem2) override;
-    bool check_allocatable(const layout& layout, allocation_type type) override;
 
     void* get_user_context() const override;
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
index 6b64c617560ce1..6e3b879091fad6 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -74,50 +74,6 @@ allocation_type ze_engine::detect_usm_allocation_type(const void* memory) const
     return ze::gpu_usm::detect_allocation_type(this, memory);
 }
 
-bool ze_engine::check_allocatable(const layout& layout, allocation_type type) {
-    OPENVINO_ASSERT(supports_allocation(type), "[GPU] Unsupported allocation type: ", type);
-
-    bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size);
-
-    // When dynamic shape upper bound makes bigger buffer, then return false.
-    if (exceed_allocatable_mem_size && layout.is_dynamic()) {
-        OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
-        return false;
-    }
-
-    OPENVINO_ASSERT(!exceed_allocatable_mem_size,
-                    "[GPU] Exceeded max size of memory object allocation: ",
-                    "requested ", layout.bytes_count(), " bytes, "
-                    "but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.",
-                    "Please try to reduce batch size or use lower precision.");
-
-    auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host);
-    auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size());
-
-    // When dynamic shape upper bound makes bigger buffer, then return false.
-    if (exceed_available_mem_size && layout.is_dynamic()) {
-        OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate");
-        return false;
-    }
-
-#ifdef __unix__
-    // Prevent from being killed by Ooo Killer of Linux
-    OPENVINO_ASSERT(!exceed_available_mem_size,
-                    "[GPU] Exceeded max size of memory allocation: ",
-                    "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ",
-                    "but available memory size is ", get_max_memory_size(), " bytes");
-#else
-    if (exceed_available_mem_size) {
-        GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : "
-                       << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl;
-        GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl;
-        return false;
-    }
-#endif
-
-    return true;
-}
-
 memory::ptr ze_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) {
     OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout");
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
index 586b830a2de93e..3c2cd4ca041f6c 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
@@ -26,7 +26,6 @@ class ze_engine : public engine {
     memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) override;
     memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override;
     bool is_the_same_buffer(const memory& mem1, const memory& mem2) override;
-    bool check_allocatable(const layout& layout, allocation_type type) override;
 
     void* get_user_context() const override;
 

From 8621d68a57dba5490b1ab2498f4c7b1e858677b7 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Thu, 8 Jan 2026 16:34:48 +0000
Subject: [PATCH 64/74] Use asserts instead of throw

---
 .../intel_gpu/src/runtime/ze/ze_engine.cpp    | 21 ++++++++-----------
 .../intel_gpu/src/runtime/ze/ze_event.hpp     |  4 +---
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
index 6e3b879091fad6..25ff4b9972719e 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -132,18 +132,15 @@ memory_ptr ze_engine::create_subbuffer(const memory& memory, const layout& new_l
     if (new_layout.format.is_image_2d()) {
         OPENVINO_NOT_IMPLEMENTED;
     }
-    if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
-        auto& new_buf = reinterpret_cast<const ze::gpu_usm&>(memory);
-        auto ptr = new_buf.get_buffer().get();
-        auto sub_buffer = ze::UsmMemory(get_context(), get_device(), ptr, byte_offset);
-        return std::make_shared<ze::gpu_usm>(this,
-                                 new_layout,
-                                 sub_buffer,
-                                 memory.get_allocation_type(),
-                                 memory.get_mem_tracker());
-    } else {
-        OPENVINO_THROW("[GPU] Trying to create subbuffer for non usm memory");
-    }
+    OPENVINO_ASSERT(!memory_capabilities::is_usm_type(memory.get_allocation_type()), "[GPU] Trying to create subbuffer for non usm memory");
+    auto& new_buf = reinterpret_cast<const ze::gpu_usm&>(memory);
+    auto ptr = new_buf.get_buffer().get();
+    auto sub_buffer = ze::UsmMemory(get_context(), get_device(), ptr, byte_offset);
+    return std::make_shared<ze::gpu_usm>(this,
+                             new_layout,
+                             sub_buffer,
+                             memory.get_allocation_type(),
+                             memory.get_mem_tracker());
 }
 
 bool ze_engine::is_the_same_buffer(const memory& mem1, const memory& mem2) {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
index 4c7e02ed32ee08..676208490f92d1 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp
@@ -20,9 +20,7 @@ struct ze_event : public ze_base_event {
         , m_factory(factory)
         , m_event(ev) {
             // Ensure event handle is not null
-            if (ev == nullptr) {
-                OPENVINO_THROW("[GPU] Trying to create event with null handle");
-            }
+            OPENVINO_ASSERT(ev != nullptr, "[GPU] Trying to create event with null handle");
         }
     ze_event(const ze_event &) = delete;
     ze_event& operator=(const ze_event &) = delete;

From 8d0b5bf2464b023c2636e298c623e9df01d66d56 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 9 Jan 2026 13:47:09 +0000
Subject: [PATCH 65/74] Fix get_user_context

---
 src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
index 25ff4b9972719e..fdeaef71f87fb1 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -162,7 +162,7 @@ std::shared_ptr<kernel_builder> ze_engine::create_kernel_builder() const {
 
 void* ze_engine::get_user_context() const {
     auto& casted = downcast<ze_device>(*_device);
-    return static_cast<void*>(casted.get_driver());
+    return static_cast<void*>(casted.get_context());
 }
 
 stream::ptr ze_engine::create_stream(const ExecutionConfig& config) const {

From d64e6b67008e9678ed2ed835f03ddf9aee07cd3a Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 9 Jan 2026 14:39:57 +0000
Subject: [PATCH 66/74] Move common code to engine class

---
 .../intel_gpu/include/intel_gpu/runtime/engine.hpp    | 10 ++++++++--
 src/plugins/intel_gpu/src/runtime/engine.cpp          | 11 +++++++++++
 src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp  |  9 ---------
 src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp  |  9 ---------
 src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp    |  9 ---------
 src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp    | 11 -----------
 6 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
index a307eedff4a35b..fa57dd19af6f59 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -142,7 +142,7 @@ class engine {
     virtual stream_ptr create_stream(const ExecutionConfig& config, void *handle) const = 0;
 
     /// Returns service stream which can be used during program build and optimizations
-    virtual stream& get_service_stream() const = 0;
+    virtual stream& get_service_stream() const;
 
     virtual std::shared_ptr<kernel_builder> create_kernel_builder() const = 0;
 
@@ -157,7 +157,7 @@ class engine {
     virtual void create_onednn_engine(const ExecutionConfig& config) = 0;
 
     /// Returns onednn engine object which shares device and context with current engine
-    virtual dnnl::engine& get_onednn_engine() const = 0;
+    virtual dnnl::engine& get_onednn_engine() const;
 #endif
 
     /// Factory method which creates engine object with impl configured by @p engine_type
@@ -177,6 +177,12 @@ class engine {
     engine(const device::ptr device);
     const device::ptr _device;
     bool enable_large_allocations = false;
+    std::unique_ptr<stream> _service_stream;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    std::mutex onednn_mutex;
+    std::shared_ptr<dnnl::engine> _onednn_engine;
+#endif
 
     std::array<std::atomic<uint64_t>, static_cast<size_t>(allocation_type::max_value)> _memory_usage_data{};
     std::array<std::atomic<uint64_t>, static_cast<size_t>(allocation_type::max_value)> _peak_memory_usage_data{};
diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp
index a4165e1f50ef45..959fb414a51c68 100644
--- a/src/plugins/intel_gpu/src/runtime/engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/engine.cpp
@@ -339,4 +339,15 @@ bool engine::check_allocatable(const layout& layout, allocation_type type) {
     return true;
 }
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
+dnnl::engine& engine::get_onednn_engine() const {
+    OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. Please check that create_onednn_engine() was called");
+    return *_onednn_engine;
+}
+#endif
+
+stream& engine::get_service_stream() const {
+    return *_service_stream;
+}
+
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
index 6ea05d592a5d7c..6e45fdd788a109 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -71,11 +71,6 @@ void ocl_engine::create_onednn_engine(const ExecutionConfig& config) {
 #endif
     }
 }
-
-dnnl::engine& ocl_engine::get_onednn_engine() const {
-    OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. Please check that create_onednn_engine() was called");
-    return *_onednn_engine;
-}
 #endif
 
 const cl::Context& ocl_engine::get_cl_context() const {
@@ -281,10 +276,6 @@ stream::ptr ocl_engine::create_stream(const ExecutionConfig& config, void* handl
     return std::make_shared<ocl_stream>(*this, config, handle);
 }
 
-stream& ocl_engine::get_service_stream() const {
-    return *_service_stream;
-}
-
 std::shared_ptr<cldnn::engine> ocl_engine::create(const device::ptr device, runtime_types runtime_type) {
     return std::make_shared<ocl::ocl_engine>(device, runtime_type);
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
index e6f1b9010e2d51..e9d152e20fd88c 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
@@ -43,26 +43,17 @@ class ocl_engine : public engine {
 
     stream_ptr create_stream(const ExecutionConfig& config) const override;
     stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override;
-    stream& get_service_stream() const override;
 
     std::shared_ptr<kernel_builder> create_kernel_builder() const override;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     void create_onednn_engine(const ExecutionConfig& config) override;
-    // Returns onednn engine object which shares device and context with current engine
-    dnnl::engine& get_onednn_engine() const override;
 #endif
 
     static std::shared_ptr<cldnn::engine> create(const device::ptr device, runtime_types runtime_type);
 
 private:
     std::string _extensions;
-    std::unique_ptr<stream> _service_stream;
-
-#ifdef ENABLE_ONEDNN_FOR_GPU
-    std::mutex onednn_mutex;
-    std::shared_ptr<dnnl::engine> _onednn_engine;
-#endif
 };
 
 }  // namespace ocl
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
index fdeaef71f87fb1..d00750696e2c06 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -45,11 +45,6 @@ void ze_engine::create_onednn_engine(const ExecutionConfig& config) {
         _onednn_engine = std::make_shared<dnnl::engine>(dnnl::l0_interop::make_engine(casted->get_driver(), casted->get_device(), casted->get_context()));
     }
 }
-
-dnnl::engine& ze_engine::get_onednn_engine() const {
-    OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. Please check that create_onednn_engine() was called");
-    return *_onednn_engine;
-}
 #endif
 
 const ze_driver_handle_t ze_engine::get_driver() const {
@@ -173,10 +168,6 @@ stream::ptr ze_engine::create_stream(const ExecutionConfig& config, void* handle
     OPENVINO_NOT_IMPLEMENTED;
 }
 
-stream& ze_engine::get_service_stream() const {
-    return *_service_stream;
-}
-
 std::shared_ptr<cldnn::engine> ze_engine::create(const device::ptr device, runtime_types runtime_type) {
     return std::make_shared<ze::ze_engine>(device, runtime_type);
 }
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
index 3c2cd4ca041f6c..24d0736b3fbbfc 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp
@@ -38,25 +38,14 @@ class ze_engine : public engine {
 
     stream_ptr create_stream(const ExecutionConfig& config) const override;
     stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override;
-    stream& get_service_stream() const override;
 
     std::shared_ptr<kernel_builder> create_kernel_builder() const override;
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
     void create_onednn_engine(const ExecutionConfig& config) override;
-    // Returns onednn engine object which shares device and context with current engine
-    dnnl::engine& get_onednn_engine() const override;
 #endif
 
     static std::shared_ptr<cldnn::engine> create(const device::ptr device, runtime_types runtime_type);
-
-private:
-    std::unique_ptr<stream> _service_stream;
-
-#ifdef ENABLE_ONEDNN_FOR_GPU
-    std::mutex onednn_mutex;
-    std::shared_ptr<dnnl::engine> _onednn_engine;
-#endif
 };
 
 }  // namespace ze

From 5ccd52c87df872021c14638b74f42576e92b6764 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 9 Jan 2026 15:16:35 +0000
Subject: [PATCH 67/74] Rename L0 timeout

---
 src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp              | 2 +-
 src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp | 2 +-
 src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp               | 2 +-
 src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp              | 2 +-
 src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp              | 2 +-
 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp              | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
index 4ff1100a7dd9e9..d7456d74c7459f 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp
@@ -32,7 +32,7 @@
 namespace cldnn {
 namespace ze {
 
-static constexpr uint64_t default_timeout = std::numeric_limits<uint64_t>::max();
+static constexpr uint64_t endless_wait = std::numeric_limits<uint64_t>::max();
 static constexpr ze_module_format_t ze_module_format_oclc = (ze_module_format_t) 3U;
 
 }  // namespace ze
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp
index 0ca4b0b243d2e0..03e342d0f571ad 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp
@@ -13,7 +13,7 @@ using namespace cldnn;
 using namespace ze;
 
 void ze_counter_based_event::wait_impl() {
-    OV_ZE_EXPECT(zeEventHostSynchronize(m_event, default_timeout));
+    OV_ZE_EXPECT(zeEventHostSynchronize(m_event, endless_wait));
 }
 
 void ze_counter_based_event::set_impl() {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
index 7d99b06aa619b6..6fb52b9cac2d30 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp
@@ -18,7 +18,7 @@ void ze_event::reset() {
 }
 
 void ze_event::wait_impl() {
-    OV_ZE_EXPECT(zeEventHostSynchronize(m_event, default_timeout));
+    OV_ZE_EXPECT(zeEventHostSynchronize(m_event, endless_wait));
 }
 
 void ze_event::set_impl() {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
index e54348e7b12c25..852fecf893efa8 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
@@ -14,7 +14,7 @@ using namespace ze;
 
 void ze_events::wait_impl() {
     if (m_last_event) {
-        OV_ZE_EXPECT(zeEventHostSynchronize(m_last_event, default_timeout));
+        OV_ZE_EXPECT(zeEventHostSynchronize(m_last_event, endless_wait));
     }
 }
 
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
index d86d1fe3db8fc2..4a7bdcface6c4b 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
@@ -118,7 +118,7 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::re
                                     nullptr,
                                     0,
                                     nullptr));
-            OV_ZE_EXPECT(zeCommandListHostSynchronize(_ze_stream.get_queue(), default_timeout));
+            OV_ZE_EXPECT(zeCommandListHostSynchronize(_ze_stream.get_queue(), endless_wait));
             _mapped_ptr = _host_buffer.get();
         } else {
             _mapped_ptr = _buffer.get();
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index 76b3268ef995bb..ab0e9991406ce1 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -344,7 +344,7 @@ void ze_stream::flush() const {
 }
 
 void ze_stream::finish() const {
-    OV_ZE_EXPECT(zeCommandListHostSynchronize(m_command_list, default_timeout));
+    OV_ZE_EXPECT(zeCommandListHostSynchronize(m_command_list, endless_wait));
 }
 
 void ze_stream::wait_for_events(const std::vector<event::ptr>& events) {

From 786fbb6e237e9f6f15bf1ddeca56fdd8accdadab Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Mon, 12 Jan 2026 18:18:44 +0000
Subject: [PATCH 68/74] Fix test build errors

---
 .../include/intel_gpu/runtime/kernel.hpp      |  4 ++++
 .../intel_gpu/src/runtime/ocl/ocl_kernel.hpp  |  7 +++++++
 .../intel_gpu/src/runtime/ze/ze_kernel.hpp    |  8 ++++++++
 .../intel_gpu/tests/functional/CMakeLists.txt |  2 --
 .../ze_remote_tensor_tests.cpp                |  2 --
 .../functional/subgraph_tests/serialize.cpp   |  2 +-
 .../functional/subgraph_tests/vlsdpa.cpp      |  1 -
 .../intel_gpu/tests/unit/CMakeLists.txt       | 19 ++++++++++++++++++-
 .../tests/unit/passes/kernels_cache_test.cpp  | 15 ++-------------
 9 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
index 54bece374b99f2..be273cd1d50aa7 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp
@@ -16,6 +16,10 @@ class kernel {
 public:
     using ptr = std::shared_ptr<kernel>;
     virtual std::shared_ptr<kernel> clone(bool reuse_kernel_handle = false) const = 0;
+    /// @brief Check if objects share the same handle to the kernel instance
+    /// @param other kernel object for comparison
+    /// @return true if underlying kernel handles are the same, false otherwise
+    virtual bool is_same(const kernel &other) const = 0;
     virtual ~kernel() = default;
 
     virtual std::string get_id() const = 0;
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp
index d46fda1ee02f15..446fe51a7193c9 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp
@@ -33,6 +33,13 @@ class ocl_kernel : public kernel {
 
         return std::make_shared<ocl_kernel>(get_handle().clone(), _kernel_id);
     }
+    virtual bool is_same(const kernel &other) const {
+        auto other_ptr = dynamic_cast<const ocl_kernel*>(&other);
+        if (other_ptr == nullptr) {
+            return false;
+        }
+        return get_handle().get() == other_ptr->get_handle().get();
+    }
 
     std::vector<uint8_t> get_binary() const override;
     std::string get_build_log() const override;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
index 3a726e24930d24..9d57a571907a89 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp
@@ -67,6 +67,14 @@ class ze_kernel : public kernel {
         }
     }
 
+    virtual bool is_same(const kernel &other) const override {
+        auto other_ptr = dynamic_cast<const ze_kernel*>(&other);
+        if (other_ptr == nullptr) {
+            return false;
+        }
+        return get_kernel_handle() == other_ptr->get_kernel_handle();
+    }
+
     std::vector<uint8_t> get_binary() const override {
         size_t binary_size = 0;
         ze_module_handle_t module_handle = get_module_handle();
diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt
index 9376bce54ce98e..3dd2da44dc6d41 100644
--- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt
+++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt
@@ -32,8 +32,6 @@ ov_add_test_target(
         LINK_LIBRARIES
             openvino::reference
             funcSharedTests
-            OpenCL::NewHeaders # should come before OpenCL::OpenCL
-            OpenCL::OpenCL
         LABELS
             OV GPU
 )
diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp
index f1a0b17b85e565..b593552272d691 100644
--- a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp
+++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp
@@ -7,13 +7,11 @@
 #include "openvino/runtime/intel_gpu/remote_properties.hpp"
 #include "openvino/runtime/remote_tensor.hpp"
 
-#include "remote_tensor_tests/helpers.hpp"
 #include "shared_test_classes/base/ov_behavior_test_utils.hpp"
 
 TEST(ZeRemoteContext, smoke_CorrectContextType) {
     auto core = ov::Core();
     auto remote_context = core.get_default_context(ov::test::utils::DEVICE_GPU);
-    ASSERT_FALSE(remote_context.is<ov::intel_gpu::ocl::ClContext>());
     ASSERT_EQ(remote_context.get_params().at(ov::intel_gpu::context_type.name()), ov::intel_gpu::ContextType::ZE);
 }
 
diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp
index 09b5052df9dc07..3ca4052b22d01d 100644
--- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp
+++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp
@@ -46,7 +46,7 @@ class LSTMSequenceTest : virtual public SerializeBaseTest {
 
 class GRUSequenceTest : virtual public SerializeBaseTest {
 public:
-    void SetUp() {
+    void SetUp() override {
         std::string cacheDirName = "cache_gru";
         auto init_shape = ov::PartialShape({1, 30, 512});
         auto batch_size = static_cast<size_t>(init_shape[0].get_length());
diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp
index 38be79bcb5391e..f2fae883a0f718 100644
--- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp
+++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp
@@ -11,7 +11,6 @@
 #include "shared_test_classes/base/ov_subgraph.hpp"
 #include "intel_gpu/runtime/engine.hpp"
 #include "intel_gpu/runtime/engine_configuration.hpp"
-#include "openvino/runtime/intel_gpu/ocl/ocl.hpp"
 
 #include "openvino/opsets/opset13.hpp"
 #include "ov_ops/vl_sdpa.hpp"
diff --git a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt
index fd04fa1d989d0d..8b56fedf4d4a06 100644
--- a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt
+++ b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt
@@ -34,6 +34,24 @@ file(GLOB_RECURSE SOURCES_MAIN
     "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/simple_math.cpp"
   )
 
+# Those tests have dependency on OpenCL runtime
+# Need to be excluded from the build with a different runtime
+file(GLOB_RECURSE SOURCES_WITH_OCL_RT
+    "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/device_test.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/engine_test.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/events_test.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/network_test.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/usm_memory_test.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/test_cases/convert_color_gpu_test.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/test_cases/cl_mem_input_test.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/test_cases/mem_perf_test.cpp"
+  )
+if (NOT GPU_RT_TYPE STREQUAL "OCL")
+    foreach (SOURCE_FILE IN LISTS SOURCES_WITH_OCL_RT)
+        list (REMOVE_ITEM SOURCES_MAIN ${SOURCE_FILE})
+    endforeach()
+endif()
+
 if (NOT ENABLE_ONEDNN_FOR_GPU)
     set(EXCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/onednn/")
     foreach (SOURCE_FILE IN LISTS SOURCES_MAIN)
@@ -70,7 +88,6 @@ endif()
 set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})
 
 target_link_libraries(${TARGET_NAME} PRIVATE openvino_intel_gpu_graph
-                                             OpenCL::OpenCL
                                              gtest
                                              gtest_main
                                              gflags
diff --git a/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp
index 30225132c35488..871038a6c17f31 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp
@@ -4,7 +4,6 @@
 
 #include "test_utils.h"
 
-#include "runtime/ocl/ocl_kernel.hpp"
 #include "intel_gpu/runtime/engine.hpp"
 #include "intel_gpu/graph/program.hpp"
 #include "intel_gpu/graph/network.hpp"
@@ -181,12 +180,7 @@ TEST(kernels_cache, reuse_kernels_property) {
         auto conv1_kern = cache.get_cached_kernel_id(conv1_kernels[idx]);
         auto conv2_kern = cache.get_cached_kernel_id(conv2_kernels[idx]);
         ASSERT_EQ(conv1_kern, conv2_kern);
-
-        auto conv1_ocl_kernel = std::dynamic_pointer_cast<ocl::ocl_kernel>(conv1_kernels[idx]);
-        auto conv2_ocl_kernel = std::dynamic_pointer_cast<ocl::ocl_kernel>(conv2_kernels[idx]);
-        if (conv1_ocl_kernel && conv2_ocl_kernel) {
-            ASSERT_EQ(conv1_ocl_kernel->get_handle().get(), conv2_ocl_kernel->get_handle().get());
-        }
+        ASSERT_TRUE(conv1_kernels[idx]->is_same(*conv2_kernels[idx].get()));
     }
 
     auto& concat1_node = prog->get_node("concat1");
@@ -200,11 +194,6 @@ TEST(kernels_cache, reuse_kernels_property) {
         auto concat1_kern = cache.get_cached_kernel_id(concat1_kernels[idx]);
         auto concat2_kern = cache.get_cached_kernel_id(concat2_kernels[idx]);
         ASSERT_EQ(concat1_kern, concat2_kern);
-
-        auto concat1_ocl_kernel = std::dynamic_pointer_cast<ocl::ocl_kernel>(concat1_kernels[idx]);
-        auto concat2_ocl_kernel = std::dynamic_pointer_cast<ocl::ocl_kernel>(concat2_kernels[idx]);
-        if (concat1_ocl_kernel && concat2_ocl_kernel) {
-            ASSERT_EQ(concat1_ocl_kernel->get_handle().get(), concat2_ocl_kernel->get_handle().get());
-        }
+        ASSERT_TRUE(concat1_kernels[idx]->is_same(*concat2_kernels[idx].get()));
     }
 }

From 46905789b039e9d64a53f2960fa29591d589dffa Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Tue, 13 Jan 2026 11:06:33 +0000
Subject: [PATCH 69/74] Update L0 OneDNN submodule

---
 src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
index e7663948687f11..d28c5b4d0dd906 160000
--- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
+++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu
@@ -1 +1 @@
-Subproject commit e7663948687f1192ad87444fb20e1f5210ccc1a8
+Subproject commit d28c5b4d0dd90669cd63ff325871d7c87e4a5cfb

From 206a580cc908a8f99013558bdc31c5da9a93f011 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Tue, 13 Jan 2026 16:00:19 +0000
Subject: [PATCH 70/74] Add mutex to l0 event factory

---
 .../src/runtime/ze/ze_counter_based_event_factory.cpp         | 2 ++
 .../src/runtime/ze/ze_counter_based_event_factory.hpp         | 4 ++++
 src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp     | 2 ++
 src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp     | 3 +++
 4 files changed, 11 insertions(+)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp
index 9e50d3bcc762b9..09ad6800869af5 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp
@@ -27,6 +27,8 @@ ze_counter_based_event_factory::ze_counter_based_event_factory(const ze_engine &
 }
 
 event::ptr ze_counter_based_event_factory::create_event(uint64_t queue_stamp) {
+    std::lock_guard<std::mutex> lock(_mutex);
+
     ze_event_handle_t event;
     auto desc = defaultIntelCounterBasedEventDesc;
     if (is_profiling_enabled()) {
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp
index 90e7af9562628a..6a1c93728d93dc 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp
@@ -6,6 +6,8 @@
 
 #include "ze_base_event_factory.hpp"
 
+#include <mutex>
+
 namespace cldnn {
 namespace ze {
 
@@ -15,6 +17,8 @@ struct ze_counter_based_event_factory : public ze_base_event_factory {
 public:
     ze_counter_based_event_factory(const ze_engine &engine, bool enable_profiling);
     event::ptr create_event(uint64_t queue_stamp) override;
+protected:
+    std::mutex _mutex;
 };
 }  // namespace ze
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp
index 797b8ef9679651..28bfd4a5b980d6 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp
@@ -18,6 +18,8 @@ ze_event_factory::ze_event_factory(const ze_engine &engine, bool enable_profilin
 , m_num_used(0) { }
 
 event::ptr ze_event_factory::create_event(uint64_t queue_stamp) {
+    std::lock_guard<std::mutex> lock(_mutex);
+
     if (m_num_used >= m_capacity || !m_current_pool) {
         m_num_used = 0;
         ze_event_pool_flags_t flags = is_profiling_enabled() ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp
index 9ed0244446a6ee..d54aff8ecce344 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp
@@ -7,6 +7,8 @@
 #include "ze_base_event_factory.hpp"
 #include "ze_event_pool.hpp"
 
+#include "mutex"
+
 namespace cldnn {
 namespace ze {
 
@@ -16,6 +18,7 @@ struct ze_event_factory : public ze_base_event_factory {
     ze_event_factory(const ze_engine &engine, bool enable_profiling, uint32_t capacity = 255);
     event::ptr create_event(uint64_t queue_stamp) override;
 protected:
+    std::mutex _mutex;
     std::shared_ptr<ze_event_pool> m_current_pool;
     const uint32_t m_capacity;
     uint32_t m_num_used;

From d71ae1251b54efa752234024b7431168d0da4c25 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Tue, 13 Jan 2026 16:09:58 +0000
Subject: [PATCH 71/74] Adjust ze_events naming

---
 .../intel_gpu/src/runtime/ze/ze_events.cpp    | 16 ++++++-------
 .../intel_gpu/src/runtime/ze/ze_events.hpp    | 24 +++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
index 852fecf893efa8..f71741c19f6c6c 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp
@@ -13,8 +13,8 @@ using namespace cldnn;
 using namespace ze;
 
 void ze_events::wait_impl() {
-    if (m_last_event) {
-        OV_ZE_EXPECT(zeEventHostSynchronize(m_last_event, endless_wait));
+    if (_last_ze_event) {
+        OV_ZE_EXPECT(zeEventHostSynchronize(_last_ze_event, endless_wait));
     }
 }
 
@@ -24,11 +24,11 @@ void ze_events::set_impl() {
 }
 
 bool ze_events::is_set_impl() {
-    if (!m_last_event) {
+    if (!_last_ze_event) {
         return true;
     }
 
-    auto ret = zeEventQueryStatus(m_last_event);
+    auto ret = zeEventQueryStatus(_last_ze_event);
     switch (ret) {
     case ZE_RESULT_SUCCESS:
         return true;
@@ -89,10 +89,10 @@ bool ze_events::get_profiling_info_impl(std::list<instrumentation::profiling_int
         }
     };
 
-    if (m_events.empty())
+    if (_events.empty())
         return false;
 
-    auto device_info = m_engine.get_device_info();
+    auto device_info = _engine.get_device_info();
 
     auto get_total_exec_time = [&device_info](std::vector<ze_kernel_timestamp_data_t>& all_timestamps) {
         std::chrono::nanoseconds total_time{0};
@@ -130,8 +130,8 @@ bool ze_events::get_profiling_info_impl(std::list<instrumentation::profiling_int
         return wallclock_time - exec_time;
     };
 
-    for (size_t i = 0; i < m_events.size(); i++) {
-        auto be = downcast<ze_base_event>(m_events[i].get());
+    for (size_t i = 0; i < _events.size(); i++) {
+        auto be = downcast<ze_base_event>(_events[i].get());
         auto opt_timestamp = be->query_timestamp();
         if (!opt_timestamp.has_value()) {
             continue;
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp
index 9fadd2c6f1a06b..b38a38303e8b59 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp
@@ -16,17 +16,17 @@ struct ze_events : public ze_base_event {
 public:
     ze_events(std::vector<event::ptr> const& ev, const ze_engine &engine)
         : ze_base_event(0)
-        , m_engine(engine) {
+        , _engine(engine) {
         process_events(ev);
     }
 
     void reset() override {
         event::reset();
-        m_events.clear();
+        _events.clear();
     }
 
     std::optional<ze_kernel_timestamp_result_t> query_timestamp() override { return std::nullopt; }
-    ze_event_handle_t get_handle() const override { return m_last_event; }
+    ze_event_handle_t get_handle() const override { return _last_ze_event; }
     bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
 
 protected:
@@ -38,32 +38,32 @@ struct ze_events : public ze_base_event {
         for (size_t i = 0; i < ev.size(); i++) {
             auto multiple_events = dynamic_cast<ze_events*>(ev[i].get());
             if (multiple_events) {
-                for (size_t j = 0; j < multiple_events->m_events.size(); j++) {
-                    if (auto base_ev = dynamic_cast<ze_base_event*>(multiple_events->m_events[j].get())) {
+                for (size_t j = 0; j < multiple_events->_events.size(); j++) {
+                    if (auto base_ev = dynamic_cast<ze_base_event*>(multiple_events->_events[j].get())) {
                         auto current_ev_queue_stamp = base_ev->get_queue_stamp();
                         if ((m_queue_stamp == 0) || (current_ev_queue_stamp > m_queue_stamp)) {
                             m_queue_stamp = current_ev_queue_stamp;
-                            m_last_event = base_ev->get_handle();
+                            _last_ze_event = base_ev->get_handle();
                         }
                     }
-                    m_events.push_back(multiple_events->m_events[j]);
+                    _events.push_back(multiple_events->_events[j]);
                 }
             } else {
                 if (auto base_ev = dynamic_cast<ze_base_event*>(ev[i].get())) {
                     auto current_ev_queue_stamp = base_ev->get_queue_stamp();
                     if ((m_queue_stamp == 0) || (current_ev_queue_stamp > m_queue_stamp)) {
                         m_queue_stamp = current_ev_queue_stamp;
-                        m_last_event = base_ev->get_handle();
+                        _last_ze_event = base_ev->get_handle();
                     }
                 }
-                m_events.push_back(ev[i]);
+                _events.push_back(ev[i]);
             }
         }
     }
 
-    ze_event_handle_t m_last_event = nullptr;
-    std::vector<event::ptr> m_events;
-    const ze_engine &m_engine;
+    ze_event_handle_t _last_ze_event = nullptr;
+    std::vector<event::ptr> _events;
+    const ze_engine &_engine;
 };
 
 }  // namespace ze

From 0a4884601a8e9a7730e1de6cb01bf7b14f427547 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 14 Jan 2026 12:45:43 +0000
Subject: [PATCH 72/74] Remove unnecessary comment

---
 src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
index 4a7bdcface6c4b..e3d0e924fa2589 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp
@@ -152,7 +152,6 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vecto
         ev_ze,
         ze_dep_events.size(),
         ze_dep_events.data()));
-    // FIXME: when not blocking pattern goes out of scope
     if (blocking) {
         ev->wait();
     }

From ae5afd8612d9bee7bfbcfe17c2ecfe112355d934 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Wed, 14 Jan 2026 15:17:17 +0000
Subject: [PATCH 73/74] Improve engine and runtime type selection

---
 .../intel_gpu/plugin/remote_context.hpp       | 20 ----------
 .../intel_gpu/runtime/device_query.hpp        | 13 +++++++
 src/plugins/intel_gpu/src/plugin/plugin.cpp   |  3 +-
 .../intel_gpu/src/plugin/remote_context.cpp   |  9 ++---
 .../intel_gpu/src/runtime/device_query.cpp    | 38 +++++++++++++++++++
 .../intel_gpu/src/runtime/ze/ze_stream.cpp    | 12 ++++--
 .../tests/unit/test_utils/test_utils.cpp      |  7 +---
 7 files changed, 65 insertions(+), 37 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
index 3606e95e5d9521..e210d332cc6296 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
@@ -23,26 +23,6 @@
 
 namespace ov::intel_gpu {
 
-inline std::pair<cldnn::engine_types, cldnn::runtime_types> get_device_query_params() {
-    #ifdef OV_GPU_WITH_ZE_RT
-        auto runtime_type = cldnn::runtime_types::ze;
-        #ifdef OV_GPU_WITH_SYCL
-            auto engine_type = cldnn::engine_types::sycl;
-        #else
-            auto engine_type = cldnn::engine_types::ze;
-        #endif
-    #else
-        auto runtime_type = cldnn::runtime_types::ocl;
-        #ifdef OV_GPU_WITH_SYCL
-            auto engine_type = cldnn::engine_types::sycl;
-        #else
-            auto engine_type = cldnn::engine_types::ocl;
-        #endif
-    #endif
-
-    return {engine_type, runtime_type};
-}
-
 class RemoteContextImpl : public ov::IRemoteContext {
 public:
     using Ptr = std::shared_ptr<RemoteContextImpl>;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp
index ecb82795c0d920..d5291f50486152 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp
@@ -17,6 +17,12 @@ namespace cldnn {
 struct device_query {
 public:
     static int device_id;
+    /// @brief Get default engine type
+    static engine_types get_default_engine_type();
+
+    /// @brief Get default runtime type
+    static runtime_types get_default_runtime_type();
+
     explicit device_query(engine_types engine_type,
                           runtime_types runtime_type,
                           void* user_context = nullptr,
@@ -25,6 +31,13 @@ struct device_query {
                           int target_tile_id = -1,
                           bool initialize_devices = false);
 
+    /// @brief Create device query with default values for engine type and runtime type
+    explicit device_query(void* user_context = nullptr,
+                          void* user_device = nullptr,
+                          int ctx_device_id = 0,
+                          int target_tile_id = -1,
+                          bool initialize_devices = false);
+
     std::map<std::string, device::ptr> get_available_devices() const {
         return _available_devices;
     }
diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp
index 34ac490f3b2a09..e8c18242695624 100644
--- a/src/plugins/intel_gpu/src/plugin/plugin.cpp
+++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp
@@ -217,8 +217,7 @@ Plugin::Plugin() {
     set_device_name("GPU");
     register_primitives();
 
-    auto rt_params = get_device_query_params();
-    cldnn::device_query device_query(rt_params.first, rt_params.second);
+    cldnn::device_query device_query;
     m_device_map = device_query.get_available_devices();
 
     // Set default configs for each device
diff --git a/src/plugins/intel_gpu/src/plugin/remote_context.cpp b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
index ac68496feb2d97..63c8bd043be974 100644
--- a/src/plugins/intel_gpu/src/plugin/remote_context.cpp
+++ b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
@@ -65,11 +65,9 @@ RemoteContextImpl::RemoteContextImpl(const std::map<std::string, RemoteContextIm
         }
     }
 
-    auto rt_params = get_device_query_params();
     const auto initialize_devices = true;
 
-    // Use actual runtime and engine types
-    cldnn::device_query device_query(rt_params.first, rt_params.second, context_id, m_va_display, ctx_device_id, target_tile_id, initialize_devices);
+    cldnn::device_query device_query(context_id, m_va_display, ctx_device_id, target_tile_id, initialize_devices);
     auto device_map = device_query.get_available_devices();
 
     OPENVINO_ASSERT(device_map.size() == 1, "[GPU] Exactly one device expected in case of context sharing, but ", device_map.size(), " found");
@@ -245,10 +243,9 @@ void RemoteContextImpl::initialize() {
     std::call_once(m_initialize_flag, [this]() {
         GPU_DEBUG_INFO << "Initialize RemoteContext for " << m_device_name << " (" << m_device->get_info().dev_name << ")" << std::endl;
 
-        auto rt_params = get_device_query_params();
-
         m_device->initialize();  // Initialize associated device before use
-        m_engine = cldnn::engine::create(rt_params.first, rt_params.second, m_device);
+        m_engine = cldnn::engine::create(
+            cldnn::device_query::get_default_engine_type(), cldnn::device_query::get_default_runtime_type(), m_device);
 
         init_properties();
 
diff --git a/src/plugins/intel_gpu/src/runtime/device_query.cpp b/src/plugins/intel_gpu/src/runtime/device_query.cpp
index 7ec5d6d90edabe..5e6ba1576a17c3 100644
--- a/src/plugins/intel_gpu/src/runtime/device_query.cpp
+++ b/src/plugins/intel_gpu/src/runtime/device_query.cpp
@@ -10,6 +10,44 @@
 
 namespace cldnn {
 int device_query::device_id = -1;
+
+engine_types device_query::get_default_engine_type() {
+    auto engine_type = engine_types::ocl;
+#ifdef OV_GPU_WITH_ZE_RT
+    engine_type = engine_types::ze;
+#endif
+#ifdef OV_GPU_WITH_OCL_RT
+    engine_type = engine_types::ocl;
+#endif
+#ifdef OV_GPU_WITH_SYCL
+    engine_type = engine_types::sycl;
+#endif
+    return engine_type;
+}
+runtime_types device_query::get_default_runtime_type() {
+    auto rt_type = runtime_types::ocl;
+#ifdef OV_GPU_WITH_ZE_RT
+    rt_type = runtime_types::ze;
+#endif
+#ifdef OV_GPU_WITH_OCL_RT
+    rt_type = runtime_types::ocl;
+#endif
+    return rt_type;
+}
+
+device_query::device_query(void* user_context,
+                           void* user_device,
+                           int ctx_device_id,
+                           int target_tile_id,
+                           bool initialize_devices)
+    : device_query(get_default_engine_type(),
+        get_default_runtime_type(),
+        user_context,
+        user_device,
+        ctx_device_id,
+        target_tile_id,
+        initialize_devices) {}
+
 device_query::device_query(engine_types engine_type,
                            runtime_types runtime_type,
                            void* user_context,
diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
index ab0e9991406ce1..1050c9b753fa76 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp
@@ -206,18 +206,22 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config)
     cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES;
     cp_offload_desc.copyOffloadEnabled = true;
     cp_offload_desc.pNext = nullptr;
-    if (info.supports_cp_offload) {
+    bool use_cp_offload = info.supports_cp_offload;
+    if (use_cp_offload) {
         command_queue_desc.pNext = &cp_offload_desc;
-    } else {
-        GPU_DEBUG_INFO << "Copy offload hint is not supported" << std::endl;
     }
 
     OV_ZE_EXPECT(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list));
-    if (m_queue_type == QueueTypes::in_order && info.supports_counter_based_events) {
+    bool use_counter_based_events = m_queue_type == QueueTypes::in_order && info.supports_counter_based_events;
+    if (use_counter_based_events) {
         m_ev_factory = std::make_unique<ze_counter_based_event_factory>(engine, config.get_enable_profiling());
     } else {
         m_ev_factory = std::make_unique<ze_event_factory>(engine, config.get_enable_profiling());
     }
+    GPU_DEBUG_INFO << "[GPU] Created L0 stream ("
+        << "use_cp_offload=" << use_cp_offload
+        << ", use_counter_based_events=" << use_counter_based_events
+        << ")" << std::endl;
 }
 
 ze_stream::~ze_stream() {
diff --git a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
index fe012bf8da99f9..be8e390749b801 100644
--- a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp
@@ -306,11 +306,8 @@ cldnn::ExecutionConfig get_test_default_config(const cldnn::engine& engine,
 }
 
 std::shared_ptr<cldnn::engine> create_test_engine() {
-#ifdef OV_GPU_WITH_ZE_RT
-    auto ret = cldnn::engine::create(engine_types::ze, runtime_types::ze);
-#elif OV_GPU_WITH_OCL_RT
-    auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl);
-#endif
+    auto ret = cldnn::engine::create(
+            cldnn::device_query::get_default_engine_type(), cldnn::device_query::get_default_runtime_type());
 #ifdef ENABLE_ONEDNN_FOR_GPU
     if (ret->get_device_info().supports_immad)
         ret->create_onednn_engine({});

From d964b511aa09324e85ac35a223df297975355c63 Mon Sep 17 00:00:00 2001
From: "Kasprzak, Jakub" <jakub.kasprzak@intel.com>
Date: Fri, 16 Jan 2026 12:42:03 +0000
Subject: [PATCH 74/74] Fix typo in assert

---
 src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
index d00750696e2c06..5168e0364c4bd8 100644
--- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp
@@ -127,7 +127,7 @@ memory_ptr ze_engine::create_subbuffer(const memory& memory, const layout& new_l
     if (new_layout.format.is_image_2d()) {
         OPENVINO_NOT_IMPLEMENTED;
     }
-    OPENVINO_ASSERT(!memory_capabilities::is_usm_type(memory.get_allocation_type()), "[GPU] Trying to create subbuffer for non usm memory");
+    OPENVINO_ASSERT(memory_capabilities::is_usm_type(memory.get_allocation_type()), "[GPU] Trying to create subbuffer for non usm memory");
     auto& new_buf = reinterpret_cast<const ze::gpu_usm&>(memory);
     auto ptr = new_buf.get_buffer().get();
     auto sub_buffer = ze::UsmMemory(get_context(), get_device(), ptr, byte_offset);