From ba12ecb18c53baa55bfeaa2b59cb77f49a169e83 Mon Sep 17 00:00:00 2001 From: "Palicki, Stefan" Date: Tue, 22 Jul 2025 08:34:47 -0700 Subject: [PATCH] gpu: intel: add Level Zero backend --- README.md | 5 +- cmake/OpenCL.cmake | 2 +- cmake/options.cmake | 2 +- doc/build/build_options.md | 2 +- examples/CMakeLists.txt | 5 +- examples/CMakeLists.txt.in | 2 +- examples/example_utils.h | 5 +- examples/example_utils.hpp | 10 +- include/dnnl_l0.h | 22 + include/dnnl_l0.hpp | 22 + include/oneapi/dnnl/dnnl_config.h.in | 9 +- include/oneapi/dnnl/dnnl_l0.h | 203 +++++++++ include/oneapi/dnnl/dnnl_l0.hpp | 260 +++++++++++ src/CMakeLists.txt | 2 +- src/common/c_types_map.hpp | 2 + src/common/engine.cpp | 22 +- src/common/engine.hpp | 2 + src/common/utils.cpp | 3 +- src/gpu/intel/CMakeLists.txt | 3 + src/gpu/intel/compute/ukernels.cpp | 26 +- src/gpu/intel/compute/utils.cpp | 119 +++++ src/gpu/intel/compute/utils.hpp | 14 +- .../intel/gemm/jit/dsl/ir/codegen/codegen.cpp | 51 +++ src/gpu/intel/gemm/jit/dsl/runtime.cpp | 14 + .../jit/include/gemmstone/dsl/runtime.hpp | 5 + .../gemm/jit/include/gemmstone/generator.hpp | 8 + .../gemm/jit/include/gemmstone/runtime.hpp | 5 + src/gpu/intel/jit/binary_format.cpp | 12 +- src/gpu/intel/jit/config/gemmstone_config.hpp | 3 + src/gpu/intel/jit/generator.hpp | 20 + src/gpu/intel/l0/CMakeLists.txt | 24 ++ src/gpu/intel/l0/capi/engine.cpp | 73 ++++ src/gpu/intel/l0/capi/memory.cpp | 128 ++++++ src/gpu/intel/l0/capi/primitive.cpp | 67 +++ src/gpu/intel/l0/capi/stream.cpp | 57 +++ src/gpu/intel/l0/compiler.hpp | 80 ++++ src/gpu/intel/l0/context.hpp | 103 +++++ src/gpu/intel/l0/device_info.cpp | 148 +++++++ src/gpu/intel/l0/device_info.hpp | 44 ++ src/gpu/intel/l0/engine.cpp | 325 ++++++++++++++ src/gpu/intel/l0/engine.hpp | 119 +++++ src/gpu/intel/l0/engine_factory.cpp | 88 ++++ src/gpu/intel/l0/engine_factory.hpp | 59 +++ src/gpu/intel/l0/kernel.cpp | 215 +++++++++ src/gpu/intel/l0/kernel.hpp | 76 ++++ src/gpu/intel/l0/memory_storage.cpp | 215 +++++++++ src/gpu/intel/l0/memory_storage.hpp | 91 ++++ src/gpu/intel/l0/stream.cpp | 233 ++++++++++ src/gpu/intel/l0/stream.hpp | 136 ++++++ src/gpu/intel/l0/stream_profiler.hpp | 140 ++++++ src/gpu/intel/l0/utils/CMakeLists.txt | 24 ++ src/gpu/intel/l0/utils/utils.cpp | 408 ++++++++++++++++++ src/gpu/intel/l0/utils/utils.hpp | 234 ++++++++++ src/gpu/intel/ocl/CMakeLists.txt | 2 - src/gpu/intel/ocl/device_info.cpp | 12 +- src/gpu/intel/ocl/device_info.hpp | 2 +- src/gpu/intel/ocl/engine.cpp | 33 +- src/gpu/intel/ocl/engine.hpp | 4 +- src/gpu/intel/ocl/hw_info.cpp | 2 +- src/gpu/intel/ocl/kernel.cpp | 2 +- src/gpu/intel/ocl/mdapi_utils.cpp | 2 +- src/gpu/intel/ocl/stream.cpp | 2 +- src/gpu/intel/ocl/stream.hpp | 4 +- src/gpu/intel/ocl/utils/CMakeLists.txt | 24 ++ src/gpu/intel/ocl/{ => utils}/utils.cpp | 66 +-- src/gpu/intel/ocl/{ => utils}/utils.hpp | 9 +- src/gpu/intel/sycl/compat.cpp | 2 +- src/gpu/intel/sycl/device_info.cpp | 8 +- src/gpu/intel/sycl/engine.cpp | 14 +- src/gpu/intel/sycl/engine.hpp | 2 +- src/gpu/intel/sycl/interop_kernel.cpp | 3 +- src/gpu/intel/sycl/l0/utils.cpp | 370 ---------------- src/gpu/intel/sycl/l0/utils.hpp | 60 --- src/gpu/intel/sycl/stream.cpp | 2 +- src/gpu/intel/sycl/stream.hpp | 2 + src/gpu/intel/sycl/utils.cpp | 109 +++-- src/gpu/intel/sycl/utils.hpp | 10 + src/graph/backend/dnnl/scratchpad.hpp | 2 - src/graph/utils/ocl_check.hpp | 2 +- src/xpu/ocl/buffer_memory_storage.hpp | 2 +- src/xpu/ocl/utils.cpp | 229 +--------- src/xpu/ocl/utils.hpp | 251 ++++++++++- src/xpu/sycl/utils.cpp | 2 +- 
third_party/ngen/ngen_level_zero.hpp | 16 +- 84 files changed, 4318 insertions(+), 879 deletions(-) create mode 100644 include/dnnl_l0.h create mode 100644 include/dnnl_l0.hpp create mode 100644 include/oneapi/dnnl/dnnl_l0.h create mode 100644 include/oneapi/dnnl/dnnl_l0.hpp create mode 100644 src/gpu/intel/compute/utils.cpp create mode 100644 src/gpu/intel/l0/CMakeLists.txt create mode 100644 src/gpu/intel/l0/capi/engine.cpp create mode 100644 src/gpu/intel/l0/capi/memory.cpp create mode 100644 src/gpu/intel/l0/capi/primitive.cpp create mode 100644 src/gpu/intel/l0/capi/stream.cpp create mode 100644 src/gpu/intel/l0/compiler.hpp create mode 100644 src/gpu/intel/l0/context.hpp create mode 100644 src/gpu/intel/l0/device_info.cpp create mode 100644 src/gpu/intel/l0/device_info.hpp create mode 100644 src/gpu/intel/l0/engine.cpp create mode 100644 src/gpu/intel/l0/engine.hpp create mode 100644 src/gpu/intel/l0/engine_factory.cpp create mode 100644 src/gpu/intel/l0/engine_factory.hpp create mode 100644 src/gpu/intel/l0/kernel.cpp create mode 100644 src/gpu/intel/l0/kernel.hpp create mode 100644 src/gpu/intel/l0/memory_storage.cpp create mode 100644 src/gpu/intel/l0/memory_storage.hpp create mode 100644 src/gpu/intel/l0/stream.cpp create mode 100644 src/gpu/intel/l0/stream.hpp create mode 100644 src/gpu/intel/l0/stream_profiler.hpp create mode 100644 src/gpu/intel/l0/utils/CMakeLists.txt create mode 100644 src/gpu/intel/l0/utils/utils.cpp create mode 100644 src/gpu/intel/l0/utils/utils.hpp create mode 100644 src/gpu/intel/ocl/utils/CMakeLists.txt rename src/gpu/intel/ocl/{ => utils}/utils.cpp (82%) rename src/gpu/intel/ocl/{ => utils}/utils.hpp (92%) delete mode 100644 src/gpu/intel/sycl/l0/utils.cpp delete mode 100644 src/gpu/intel/sycl/l0/utils.hpp diff --git a/README.md b/README.md index a00612e7097..a2cce3fe2c2 100644 --- a/README.md +++ b/README.md @@ -279,9 +279,9 @@ Runtime-specific dependencies: | `ONEDNN_CPU_RUNTIME=OMP` | Intel C/C++ Compiler | Intel OpenMP runtime (`libiomp5.so`) | `ONEDNN_CPU_RUNTIME=OMP` | Clang | Intel OpenMP runtime (`libiomp5.so`) | `ONEDNN_CPU_RUNTIME=TBB` | any | TBB (`libtbb.so`) -| `ONEDNN_CPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`libsycl.so`), TBB (`libtbb.so`), OpenCL loader (`libOpenCL.so`) +| `ONEDNN_CPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`libsycl.so`), TBB (`libtbb.so`), OpenCL loader (`libOpenCL.so`), oneAPI Level Zero loader (`libze_loader.so`) | `ONEDNN_GPU_RUNTIME=OCL` | any | OpenCL loader (`libOpenCL.so`) -| `ONEDNN_GPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`libsycl.so`), OpenCL loader (`libOpenCL.so`), oneAPI Level Zero loader (`libze_loader.so`) +| `ONEDNN_GPU_RUNTIME=L0` | any | oneAPI Level Zero loader (`libze_loader.so`) #### Windows @@ -298,6 +298,7 @@ Runtime-specific dependencies: | `ONEDNN_CPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), TBB (`tbb.dll`), OpenCL loader (`OpenCL.dll`) | `ONEDNN_GPU_RUNTIME=OCL` | any | OpenCL loader (`OpenCL.dll`) | `ONEDNN_GPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), OpenCL loader (`OpenCL.dll`), oneAPI Level Zero loader (`ze_loader.dll`) +| `ONEDNN_GPU_RUNTIME=L0` | any | oneAPI Level Zero loader (`ze_loader.dll`) #### macOS diff --git a/cmake/OpenCL.cmake b/cmake/OpenCL.cmake index 3ab84a1bf03..23bcec0254d 100644 --- a/cmake/OpenCL.cmake +++ b/cmake/OpenCL.cmake @@ 
-25,7 +25,7 @@ set(OpenCL_cmake_included true) if(DNNL_GPU_SYCL AND DNNL_GPU_VENDOR STREQUAL "INTEL") add_definitions_with_host_compiler(-DCL_TARGET_OPENCL_VERSION=300) else() - add_definitions(-DCL_TARGET_OPENCL_VERSION=120) + add_definitions(-DCL_TARGET_OPENCL_VERSION=300) endif() if(OpenCL_INCLUDE_DIR) diff --git a/cmake/options.cmake b/cmake/options.cmake index a707c670e2a..bc5c36ca450 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -287,7 +287,7 @@ set(DNNL_GPU_RUNTIME "NONE" CACHE STRING Using OpenCL for GPU requires setting OPENCLROOT if the libraries are installed in a non-standard location.") -if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL)$") +if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL|L0)$") message(FATAL_ERROR "Unsupported GPU runtime: ${DNNL_GPU_RUNTIME}") endif() diff --git a/doc/build/build_options.md b/doc/build/build_options.md index fabe37c55e6..a29adbe5c4f 100644 --- a/doc/build/build_options.md +++ b/doc/build/build_options.md @@ -7,7 +7,7 @@ oneDNN supports the following build-time options. |:--------------------------------|:----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------| | ONEDNN_LIBRARY_TYPE | **SHARED**, STATIC | Defines the resulting library type | | ONEDNN_CPU_RUNTIME | NONE, **OMP**, TBB, SEQ, THREADPOOL, SYCL | Defines the threading runtime for CPU engines | -| ONEDNN_GPU_RUNTIME | **NONE**, OCL, SYCL | Defines the offload runtime for GPU engines | +| ONEDNN_GPU_RUNTIME | **NONE**, OCL, SYCL, L0 | Defines the offload runtime for GPU engines | | ONEDNN_BUILD_DOC | **ON**, OFF | Controls building the documentation | | ONEDNN_DOC_VERSIONS_JSON | **""**, *string* | Location of JSON file for [PyData Sphinx Theme version switcher]. Enables documentation version switcher when set. 
| | ONEDNN_BUILD_EXAMPLES | **ON**, OFF | Controls building the examples | diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ddb7f745177..b0b6728ef73 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -147,6 +147,9 @@ foreach(f ${sources}) if(NOT DNNL_WITH_SYCL AND ${f_name} MATCHES "^sycl") list(REMOVE_ITEM sources "${f}") endif() + if(NOT DNNL_GPU_RUNTIME STREQUAL "L0" AND ${f_name} MATCHES ".*level_zero") + list(REMOVE_ITEM sources "${f}") + endif() endforeach() # In case of SYCL, skip CPU examples that directly work with raw pointers @@ -202,7 +205,7 @@ foreach(src ${sources}) endif() else() set(cpu_rt_pattern "(SEQ|OMP|TBB|SYCL|DPCPP)") - set(gpu_rt_pattern "(OCL|SYCL|DPCPP)") + set(gpu_rt_pattern "(OCL|L0|SYCL|DPCPP)") if(${example_name} MATCHES "sycl.*") set(cpu_rt_pattern "(SYCL|DPCPP)") set(gpu_rt_pattern "(SYCL|DPCPP)") diff --git a/examples/CMakeLists.txt.in b/examples/CMakeLists.txt.in index 0eec7cdaf41..a5af8163729 100644 --- a/examples/CMakeLists.txt.in +++ b/examples/CMakeLists.txt.in @@ -214,7 +214,7 @@ foreach(src ${sources}) endif() else() set(cpu_rt_pattern "(SEQ|OMP|TBB|SYCL|DPCPP)") - set(gpu_rt_pattern "(OCL|SYCL|DPCPP)") + set(gpu_rt_pattern "(OCL|L0|SYCL|DPCPP)") if(${example_name} MATCHES "sycl.*") set(cpu_rt_pattern "(SYCL|DPCPP)") set(gpu_rt_pattern "(SYCL|DPCPP)") diff --git a/examples/example_utils.h b/examples/example_utils.h index eefd64025bb..c445cdeff68 100644 --- a/examples/example_utils.h +++ b/examples/example_utils.h @@ -29,6 +29,9 @@ #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #include "dnnl_ocl.h" #endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#include "dnnl_l0.h" +#endif #define COMPLAIN_DNNL_ERROR_AND_EXIT(what, status) \ do { \ @@ -160,7 +163,7 @@ static inline void write_to_dnnl_memory(void *handle, dnnl_memory_t mem) { } #endif -#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 if (eng_kind == dnnl_gpu) { void *mapped_ptr = NULL; CHECK(dnnl_memory_map_data(mem, &mapped_ptr)); diff --git a/examples/example_utils.hpp b/examples/example_utils.hpp index 07a7951f912..8791fd17926 100644 --- a/examples/example_utils.hpp +++ b/examples/example_utils.hpp @@ -35,7 +35,11 @@ #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #include "dnnl_ocl.hpp" -#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#include "dnnl_l0.hpp" +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL #include "dnnl_sycl.hpp" #endif @@ -228,7 +232,7 @@ inline void read_from_dnnl_memory(void *handle, dnnl::memory &mem) { return; } #endif -#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 if (eng.get_kind() == dnnl::engine::kind::gpu) { void *mapped_ptr = mem.map_data(); if (mapped_ptr) std::memcpy(handle, mapped_ptr, size); @@ -287,7 +291,7 @@ inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) { return; } #endif -#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 if (eng.get_kind() == dnnl::engine::kind::gpu) { void *mapped_ptr = mem.map_data(); if (mapped_ptr) std::memcpy(mapped_ptr, handle, size); diff --git a/include/dnnl_l0.h b/include/dnnl_l0.h new file mode 100644 index 00000000000..21ac50517c0 --- /dev/null +++ b/include/dnnl_l0.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under 
the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_L0_H +#define DNNL_L0_H + +#include "oneapi/dnnl/dnnl_l0.h" + +#endif /* DNNL_L0_H */ diff --git a/include/dnnl_l0.hpp b/include/dnnl_l0.hpp new file mode 100644 index 00000000000..db2d3137b03 --- /dev/null +++ b/include/dnnl_l0.hpp @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_L0_HPP +#define DNNL_L0_HPP + +#include "oneapi/dnnl/dnnl_l0.hpp" + +#endif /* DNNL_L0_HPP */ diff --git a/include/oneapi/dnnl/dnnl_config.h.in b/include/oneapi/dnnl/dnnl_config.h.in index 11e1b6d98ad..29e3246b52d 100644 --- a/include/oneapi/dnnl/dnnl_config.h.in +++ b/include/oneapi/dnnl/dnnl_config.h.in @@ -82,6 +82,9 @@ /// DPC++ runtime #define DNNL_RUNTIME_DPCPP DNNL_RUNTIME_SYCL +/// L0 runtime +#define DNNL_RUNTIME_L0 1024u + /// No vendor (corresponding runtime is disabled) #define DNNL_VENDOR_NONE 0u @@ -119,7 +122,8 @@ #endif #if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \ && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL) \ - && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL) + && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL) \ + && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_L0) #error "Unexpected DNNL_GPU_RUNTIME" #endif #if (DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \ @@ -145,9 +149,6 @@ // When defined, DPCPP is supported. #cmakedefine DNNL_WITH_SYCL -// When defined, Level Zero is supported. -#cmakedefine DNNL_WITH_LEVEL_ZERO - // When defined, SYCL CUDA backend is used. #cmakedefine DNNL_SYCL_CUDA diff --git a/include/oneapi/dnnl/dnnl_l0.h b/include/oneapi/dnnl/dnnl_l0.h new file mode 100644 index 00000000000..543453a87e6 --- /dev/null +++ b/include/oneapi/dnnl/dnnl_l0.h @@ -0,0 +1,203 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_L0_H +#define ONEAPI_DNNL_DNNL_L0_H + +#include "oneapi/dnnl/dnnl.h" + +/// @cond DO_NOT_DOCUMENT_THIS +#include "level_zero/ze_api.h" +/// @endcond + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_l0_interop +/// @{ + +/// Creates an engine associated with a Level Zero device and a Level Zero context. +/// +/// @param engine Output engine. +/// @param driver Pointer to the Level Zero driver to use for the engine. +/// @param device Pointer to the Level Zero device to use for the engine. +/// @param context Pointer to the Level Zero context to use for the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_engine_create(dnnl_engine_t *engine, + const ze_driver_handle_t adriver, const ze_device_handle_t adevice, + const ze_context_handle_t acontext); + +/// Returns the Level Zero context associated with an engine. +/// +/// @param engine Engine to query. +/// @param context Pointer to the underlying Level Zero context of the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_context( + dnnl_engine_t engine, ze_context_handle_t context); + +/// Returns the Level Zero device associated with an engine. +/// +/// @param engine Engine to query. +/// @param device Pointer to the underlying Level Zero device of the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_device( + dnnl_engine_t engine, ze_device_handle_t device); + +/// Returns the Level Zero driver associated with an engine. +/// +/// @param engine Engine to query. +/// @param device Pointer to the underlying Level Zero driver of the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_driver( + dnnl_engine_t engine, ze_driver_handle_t driver); + +/// Creates an execution stream for a given engine associated with a Level Zero +/// command list. +/// +/// @param stream Output execution stream. +/// @param engine Engine to create the execution stream on. +/// @param list Level Zero command list to use. +/// @param profiling Flag enabling GPU kernel profiling. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_stream_create(dnnl_stream_t *stream, + dnnl_engine_t engine, ze_command_list_handle_t list, bool profiling); + +/// Returns the Level Zero command list associated with an execution stream. +/// +/// @param stream Execution stream to query. +/// @param list Output Level Zero command list. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_stream_get_list( + dnnl_stream_t stream, ze_command_list_handle_t list); + +/// Creates a memory object. +/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the +/// constructed memory object will have the underlying buffer set. 
In this +/// case, the buffer will be initialized as if: +/// - dnnl_memory_set_data_handle() had been called, if @p memory_kind is equal +/// to dnnl_l0_interop_usm, or +/// - dnnl_l0_interop_memory_set_buffer() has been called, if @p memory_kind +/// is equal to dnnl_l0_interop_buffer. +/// +/// @param memory Output memory object. +/// @param memory_desc Memory descriptor. +/// @param engine Engine to use. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::l0_interop::memory_kind::usm. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_memory_create(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + void *handle); + +/// Creates a memory object with multiple handles. +/// +/// @param memory Output memory object. +/// @param memory_desc Memory descriptor. +/// @param engine Engine to use. +/// @param memory_kind Memory allocation kind to specify the type of handles. +/// @param nhandles Number of handles. +/// @param handles Handles of the memory buffers to use as underlying storages. +/// For each element of the @p handles array the following applies: +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::l0_interop::memory_kind::usm. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_memory_create_v2(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + int nhandles, void **handles); + +/// Returns an Level Zero memory object associated with a memory object. +/// +/// @param memory Memory object. +/// @param mem_object Output Level Zero memory object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_memory_get_mem_object( + const_dnnl_memory_t memory, void **mem_object); + +/// Sets Level Zero memory object associated with a memory object. +/// +/// For behavioral details, see dnnl_memory_set_data_handle(). +/// +/// @param memory Memory object. +/// @param mem_object Level Zero memory object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_memory_set_mem_object( + dnnl_memory_t memory, void *mem_object); + +/// Executes computations specified by the primitive in a specified stream and +/// returns a Level Zero event. +/// +/// @param primitive Primitive to execute. +/// @param stream Stream to use. +/// @param nargs Number of arguments. +/// @param args Array of arguments. Each argument is an +/// pair. 
The index is one of the `DNNL_ARG_*` +/// values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see +/// #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory +/// descriptor as that returned by +/// #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index). +/// @param ndeps Number of dependencies. +/// @param deps A pointer to a vector of size @p ndeps that contains +/// dependencies. +/// @param return_event Output event. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_primitive_execute( + const_dnnl_primitive_t primitive, dnnl_stream_t stream, int nargs, + const dnnl_exec_arg_t *args, int ndeps, const ze_event_handle_t *deps, + ze_event_handle_t *return_event); + +/// @} dnnl_api_l0_interop + +/// @} dnnl_api_interop + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // ONEAPI_DNNL_DNNL_L0_H diff --git a/include/oneapi/dnnl/dnnl_l0.hpp b/include/oneapi/dnnl/dnnl_l0.hpp new file mode 100644 index 00000000000..00762b4af03 --- /dev/null +++ b/include/oneapi/dnnl/dnnl_l0.hpp @@ -0,0 +1,260 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_L0_HPP +#define ONEAPI_DNNL_DNNL_L0_HPP + +#include "oneapi/dnnl/dnnl.hpp" + +/// @cond DO_NOT_DOCUMENT_THIS +#include +#include + +#include "oneapi/dnnl/dnnl_l0.h" +/// @endcond + +/// @addtogroup dnnl_api +/// @{ + +namespace dnnl { + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_l0_interop Level Zero interoperability API +/// API extensions to interact with the underlying Level Zero run-time. +/// +/// @sa @ref dev_guide_dpcpp_interoperability in developer guide +/// @{ + +/// Level Zero interoperability namespace +namespace l0_interop { + +/// Constructs an engine from Level Zero device and context objects. +/// +/// @param adriver Level Zero driver. +/// @param adevice Level Zero device. +/// @param acontext Level Zero context. +/// +/// @returns Created engine. +inline engine make_engine(const ze_driver_handle_t adriver, + const ze_device_handle_t adevice, const ze_context_handle_t acontext) { + dnnl_engine_t aengine; + error::wrap_c_api( + dnnl_l0_interop_engine_create(&aengine, adriver, adevice, acontext), + "could not create an engine"); + return engine(aengine); +} + +/// Returns the Level Zero context associated with an engine. +/// +/// @param aengine Engine to query. +/// +/// @returns The underlying Level Zero device of the engine. +inline ze_context_handle_t get_context(const engine &aengine) { + ze_context_handle_t ctx = nullptr; + error::wrap_c_api(dnnl_l0_interop_engine_get_context(aengine.get(), ctx), + "could not get a context handle"); + return ctx; +} + +/// Returns the Level Zero device associated with an engine. +/// +/// @param aengine Engine to query. 
+/// +/// @returns The underlying Level Zero device of the engine. +inline ze_device_handle_t get_device(const engine &aengine) { + ze_device_handle_t dev = nullptr; + error::wrap_c_api(dnnl_l0_interop_engine_get_device(aengine.get(), dev), + "could not get a device handle"); + return dev; +} + +/// Returns the Level Zero driver associated with an engine. +/// +/// @param aengine Engine to query. +/// +/// @returns The underlying Level Zero driver of the engine. +inline ze_driver_handle_t get_driver(const engine &aengine) { + ze_driver_handle_t dri = nullptr; + error::wrap_c_api(dnnl_l0_interop_engine_get_driver(aengine.get(), dri), + "could not get a driver handle"); + return dri; +} + +/// Creates an execution stream for a given engine associated with a Level Zero +/// command list. +/// +/// @param aengine Engine object to use for the stream. +/// @param alist Level Zero immediate command list to use for the stream. +/// @param aprofiling Flag enabling GPU kernel profiling. +/// +/// @returns An execution stream. +inline stream make_stream(const engine &aengine, ze_command_list_handle_t alist, + bool aprofiling = false) { + dnnl_stream_t astream; + error::wrap_c_api(dnnl_l0_interop_stream_create( + &astream, aengine.get(), alist, aprofiling), + "could not create a stream"); + return stream(astream); +} + +/// Returns the Level Zero immediate command list associated with an execution stream. +/// +/// @param astream Execution stream to query. +/// +/// @returns Level Zero immediate command list object. +inline ze_command_list_handle_t get_list(const stream &astream) { + ze_command_list_handle_t list = nullptr; + error::wrap_c_api(dnnl_l0_interop_stream_get_list(astream.get(), list), + "could not get a stream handle"); + return list; +} + +/// Creates a memory object with multiple handles. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param handles Handles of the memory buffers to use as underlying storages. +/// For each element of the @p handles array the following applies: +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::l0_interop::memory_kind::usm. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// +/// If the @p handles vector is not provided the library will allocate all +/// buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE. +/// +/// @returns Created memory object. +inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, std::vector<void *> handles = {}) { + if (handles.empty()) { + const int nhandles = memory_desc.get_num_handles(); + handles.resize(nhandles, DNNL_MEMORY_ALLOCATE); + } + + dnnl_memory_t c_memory; + error::wrap_c_api( + dnnl_l0_interop_memory_create_v2(&c_memory, memory_desc.get(), + aengine.get(), handles.size(), handles.data()), + "could not create a memory"); + return memory(c_memory); +} + +/// Creates a memory object. +/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the +/// constructed memory object will have the underlying buffer set.
In this +/// case, the buffer will be initialized as if: +/// - dnnl::memory::set_data_handle() had been called, if @p memory_kind is +/// equal to dnnl::l0_interop::memory_kind::usm, or +/// - dnnl::l0_interop::set_buffer() has been called, if @p memory_kind is +/// equal to dnnl::l0_interop::memory_kind::buffer. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::l0_interop::memory_kind::usm. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// +/// @returns Created memory object. +inline memory make_memory( + const memory::desc &memory_desc, const engine &aengine, void *handle) { + return make_memory(memory_desc, aengine, std::vector<void *> {handle}); +} + +/// Returns the Level Zero memory object associated with the memory object. +/// +/// @param amemory A memory object. +/// @returns Underlying Level Zero memory object. +inline void *get_mem_object(const memory &amemory) { + void *mem_object; + error::wrap_c_api( + dnnl_l0_interop_memory_get_mem_object(amemory.get(), &mem_object), + "could not get Level Zero buffer object from a memory object"); + return mem_object; +} + +/// Sets the Level Zero memory object associated with the memory object. +/// +/// For behavioral details see memory::set_data_handle(). +/// +/// @param amemory A memory object. +/// @param mem_object Level Zero memory object to use as the underlying +/// storage. It must have at least get_desc().get_size() bytes +/// allocated. +inline void set_mem_object(memory &amemory, void *mem_object) { + error::wrap_c_api( + dnnl_l0_interop_memory_set_mem_object(amemory.get(), mem_object), + "could not set Level Zero buffer object from a memory object"); +} + +/// Executes computations specified by the primitive in a specified stream and +/// returns a Level Zero event. +/// +/// Arguments are passed via an arguments map containing +/// <index, memory object> pairs. The index must be one of the `DNNL_ARG_*` +/// values such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor +/// matching the one returned by +/// #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless using +/// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL). +/// +/// @param aprimitive Primitive to execute. +/// @param astream Stream object. The stream must belong to the same engine +/// as the primitive. +/// @param args Arguments map. +/// @param deps Optional vector with `ze_event_handle_t` dependencies. +/// +/// @returns Output event. +inline ze_event_handle_t execute(const dnnl::primitive &aprimitive, + const stream &astream, const std::unordered_map<int, memory> &args, + const std::vector<ze_event_handle_t> &deps = {}) { + std::vector<dnnl_exec_arg_t> c_args; + c_args.reserve(args.size()); + for (const auto &a : args) + c_args.push_back({a.first, a.second.get()}); + + const ze_event_handle_t *c_deps = deps.empty() ?
nullptr : deps.data(); + + ze_event_handle_t return_event; + error::wrap_c_api(dnnl_l0_interop_primitive_execute(aprimitive.get(), + astream.get(), c_args.size(), c_args.data(), + deps.size(), c_deps, &return_event), + "could not execute a primitive"); + return return_event; +} + +} // namespace l0_interop + +/// @} dnnl_api_l0_interop + +/// @} dnnl_api_interop + +} // namespace dnnl + +/// @} dnnl_api + +#endif // ONEAPI_DNNL_DNNL_L0_HPP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 54b3627675c..e392c435c05 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -135,7 +135,7 @@ if(NOT DNNL_GPU_RUNTIME STREQUAL "NONE") add_subdirectory(gpu) endif() -if(DNNL_WITH_SYCL OR DNNL_GPU_RUNTIME STREQUAL "OCL") +if(DNNL_WITH_SYCL OR DNNL_GPU_RUNTIME MATCHES "OCL") add_subdirectory(xpu) endif() diff --git a/src/common/c_types_map.hpp b/src/common/c_types_map.hpp index 0d48d18d946..f29fe78d7e2 100644 --- a/src/common/c_types_map.hpp +++ b/src/common/c_types_map.hpp @@ -1969,6 +1969,7 @@ enum runtime_kind_t { dnnl_runtime_threadpool, dnnl_runtime_ocl, dnnl_runtime_sycl, + dnnl_runtime_l0, }; namespace runtime_kind { @@ -1979,6 +1980,7 @@ const runtime_kind_t tbb = dnnl_runtime_tbb; const runtime_kind_t threadpool = dnnl_runtime_threadpool; const runtime_kind_t ocl = dnnl_runtime_ocl; const runtime_kind_t sycl = dnnl_runtime_sycl; +const runtime_kind_t l0 = dnnl_runtime_l0; } // namespace runtime_kind using primitive_kind_t = dnnl_primitive_kind_t; diff --git a/src/common/engine.cpp b/src/common/engine.cpp index a2f5edae841..647a2b34f74 100644 --- a/src/common/engine.cpp +++ b/src/common/engine.cpp @@ -29,12 +29,16 @@ #include "cpu/cpu_engine.hpp" #endif +#ifdef DNNL_WITH_SYCL +#include "xpu/sycl/engine_factory.hpp" +#endif + #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #include "xpu/ocl/engine_factory.hpp" #endif -#ifdef DNNL_WITH_SYCL -#include "xpu/sycl/engine_factory.hpp" +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#include "gpu/intel/l0/engine_factory.hpp" #endif namespace dnnl { @@ -42,23 +46,27 @@ namespace impl { static inline std::unique_ptr get_engine_factory( engine_kind_t kind, runtime_kind_t runtime_kind) { - #if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE if (kind == engine_kind::cpu && is_native_runtime(runtime_kind)) { return std::unique_ptr( new cpu::cpu_engine_factory_t()); } #endif - +#ifdef DNNL_WITH_SYCL + if (runtime_kind == runtime_kind::sycl) { + return xpu::sycl::get_engine_factory(kind); + } +#endif #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL if (kind == engine_kind::gpu && runtime_kind == runtime_kind::ocl) { return std::unique_ptr( new xpu::ocl::engine_factory_t(kind)); } #endif -#ifdef DNNL_WITH_SYCL - if (runtime_kind == runtime_kind::sycl) - return xpu::sycl::get_engine_factory(kind); +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 + if (kind == engine_kind::gpu && runtime_kind == runtime_kind::l0) { + return gpu::intel::l0::get_engine_factory(kind); + } #endif return nullptr; } diff --git a/src/common/engine.hpp b/src/common/engine.hpp index 3fa59ce22b2..330df8d5370 100644 --- a/src/common/engine.hpp +++ b/src/common/engine.hpp @@ -189,6 +189,8 @@ inline runtime_kind_t get_default_runtime(engine_kind_t kind) { if (kind == engine_kind::gpu) return runtime_kind::ocl; #elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL if (kind == engine_kind::gpu) return runtime_kind::sycl; +#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 + if (kind == engine_kind::gpu) return runtime_kind::l0; #endif #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SEQ return runtime_kind::seq; diff --git a/src/common/utils.cpp 
b/src/common/utils.cpp index 3d5a3d2e25f..aaab751b31d 100644 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -291,7 +291,8 @@ std::string get_jit_profiling_jitdumpdir() { bool is_destroying_cache_safe() { #if defined(_WIN32) \ - && (defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL) + && (defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 \ + || DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL) // The ntdll.dll library is located in system32, therefore setting // additional environment is not required. HMODULE handle = LoadLibraryExA( diff --git a/src/gpu/intel/CMakeLists.txt b/src/gpu/intel/CMakeLists.txt index 61ae11dfcf2..52feedc6b67 100644 --- a/src/gpu/intel/CMakeLists.txt +++ b/src/gpu/intel/CMakeLists.txt @@ -58,9 +58,12 @@ add_subdirectory(jit) if(DNNL_GPU_RUNTIME STREQUAL "OCL") add_subdirectory(ocl) +elseif(DNNL_GPU_RUNTIME STREQUAL "L0") + add_subdirectory(l0) elseif(DNNL_WITH_SYCL) add_subdirectory(sycl) add_subdirectory(ocl) + add_subdirectory(l0/utils) endif() set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel) diff --git a/src/gpu/intel/compute/ukernels.cpp b/src/gpu/intel/compute/ukernels.cpp index ae0947c5752..98f03b3d6d1 100644 --- a/src/gpu/intel/compute/ukernels.cpp +++ b/src/gpu/intel/compute/ukernels.cpp @@ -16,14 +16,19 @@ #include "gpu/intel/compute/ukernels.hpp" +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL +#include "gpu/intel/sycl/engine.hpp" +#include "gpu/intel/sycl/utils.hpp" +#endif + #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #include "gpu/intel/ocl/engine.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #endif -#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL -#include "gpu/intel/sycl/engine.hpp" -#include "gpu/intel/sycl/utils.hpp" +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/utils/utils.hpp" #endif namespace dnnl { @@ -51,6 +56,11 @@ bool mayiuse_microkernels(const engine_t *engine) { auto mayiuse_mk = [](const engine_t *engine) { switch (engine->runtime_kind()) { +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL + case runtime_kind::sycl: + return sycl::mayiuse_microkernels( + utils::downcast(engine)); +#endif #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL case runtime_kind::ocl: { auto *ocl_engine @@ -60,10 +70,10 @@ bool mayiuse_microkernels(const engine_t *engine) { cl_microkernels_check_kernel_code); } #endif -#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL - case runtime_kind::sycl: - return sycl::mayiuse_microkernels( - utils::downcast(engine)); +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 + case runtime_kind::l0: + return utils::downcast(engine) + ->mayiuse_microkernels(); #endif default: return false; } diff --git a/src/gpu/intel/compute/utils.cpp b/src/gpu/intel/compute/utils.cpp new file mode 100644 index 00000000000..27fddb7154e --- /dev/null +++ b/src/gpu/intel/compute/utils.cpp @@ -0,0 +1,119 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "gpu/intel/compute/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace compute { + +status_t preprocess_headers(stringstream_t &pp_code, const char *code, + const compute::kernel_ctx_t &kernel_ctx) { + stringstream_t code_stream(code); + + for (std::string line; std::getline(code_stream, line);) { + const size_t include_pos = line.find("#include"); + if (include_pos != std::string::npos) { + static constexpr size_t include_len = 8; + const size_t first_quote_pos + = line.find("\"", include_pos + include_len); + const size_t second_quote_pos + = line.find("\"", first_quote_pos + 1); + const size_t kernel_name_len + = second_quote_pos - first_quote_pos - 1; + const auto header_name + = line.substr(first_quote_pos + 1, kernel_name_len); + const char *header_source + = kernel_ctx.get_custom_header(header_name); + if (!header_source) header_source = get_kernel_header(header_name); + CHECK(preprocess_headers(pp_code, header_source, kernel_ctx)); + } else { + pp_code << line << std::endl; + } + } + return status::success; +} + +void debugdump_processed_source(const std::string &source, + const std::string &options, const std::string &cl_options) { +#if defined(__linux__) && defined(DNNL_DEV_MODE) + if (get_verbose(verbose_t::debuginfo) >= 10) { + auto get_defines = [](const std::string &from) { + std::string ret; + size_t pos = 0; + while (pos < from.length()) { + // Find next define argument + pos = from.find("-D", pos); + + // Generate argument, quotes are interpreted literally, but + // other special shell characters need escaped. Does not + // currently handle quotes with the ' character or nested quotes + char quote_parity = true; + while (pos < from.length()) { + if (quote_parity + && utils::one_of(from[pos], '~', '#', '$', '&', '*', + '(', ')', '\\', '|', '[', ']', '{', '}', + ';', '\'', '<', '>', '/', '?', '!')) { + ret += '\\'; + } + ret += from[pos]; + if (from[pos] == '"') quote_parity ^= true; + if (from[pos] == ' ' && quote_parity) break; + + pos++; + } + } + return ret; + }; + auto execute_command + = [](const std::string &cmd, const std::string &stdin) { + std::string result; + std::array buffer; + FILE *pipe = popen(cmd.c_str(), "w"); + fputs(stdin.c_str(), pipe); + if (pipe) { + while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { + result += buffer.data(); + } + } + pclose(pipe); + return result; + }; + + // Run utilities to evaluate preprocessor defines and format the file + // Theoretically, we can accomplish this task with libclang, but it + // seems more work than it is worth. Instead, wrapping this in OCL_DEBUG + // so that calls to the system are not included in the default build. 
+ + // Due to the use of a different C preprocessor, warnings should not be + // ignored, as they may correspond to a different behavior in the OpenCL + // C preprocessor + auto o = get_defines(options) + get_defines(cl_options); + std::string preprocess_cmd + = std::string() + "cpp -P " + o + " | clang-format"; + execute_command(preprocess_cmd, source); + std::cout << "OCL_ARCH_OPTIONS: " << cl_options << std::endl; + } +#endif +} + +} // namespace compute +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/compute/utils.hpp b/src/gpu/intel/compute/utils.hpp index 564308ad0b3..dc29eb20985 100644 --- a/src/gpu/intel/compute/utils.hpp +++ b/src/gpu/intel/compute/utils.hpp @@ -17,14 +17,8 @@ #ifndef GPU_INTEL_COMPUTE_UTILS_HPP #define GPU_INTEL_COMPUTE_UTILS_HPP -#include -#include -#include -#include -#include - #include "common/utils.hpp" -#include "gpu/intel/compute/device_info.hpp" +#include "gpu/intel/compute/kernel_ctx.hpp" #include "gpu/intel/utils.hpp" namespace dnnl { @@ -155,6 +149,12 @@ class nd_range_t { range_t local_range_; }; +status_t preprocess_headers(stringstream_t &pp_code, const char *code, + const compute::kernel_ctx_t &kernel_ctx); + +void debugdump_processed_source(const std::string &source, + const std::string &options, const std::string &ocl_options); + } // namespace compute } // namespace intel } // namespace gpu diff --git a/src/gpu/intel/gemm/jit/dsl/ir/codegen/codegen.cpp b/src/gpu/intel/gemm/jit/dsl/ir/codegen/codegen.cpp index f1e90255b48..bfca1e9e4fa 100644 --- a/src/gpu/intel/gemm/jit/dsl/ir/codegen/codegen.cpp +++ b/src/gpu/intel/gemm/jit/dsl/ir/codegen/codegen.cpp @@ -40,6 +40,9 @@ #ifdef GEMMSTONE_WITH_OPENCL_RUNTIME #include "ngen_opencl.hpp" #endif +#ifdef GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME +#include "ngen_level_zero.hpp" +#endif GEMMSTONE_NAMESPACE_START namespace dsl { @@ -1862,6 +1865,54 @@ cl_kernel make_kernel( return {}; } #endif +#ifdef GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME +template +using l0_gen_t = ir::ir_to_ngen_generator_t>; +GEMMSTONE_XELP_ISA( + template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); +GEMMSTONE_XEHP_ISA( + template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); +GEMMSTONE_XEHPG_ISA( + template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); +GEMMSTONE_XEHPC_ISA( + template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); +GEMMSTONE_XE2_ISA(template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); +GEMMSTONE_XE3_ISA(template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); + +std::pair make_kernel( + const kernel_t &ir_kernel, ze_context_handle_t ctx, + ze_device_handle_t dev) { + auto &iface = ir_kernel.iface; + auto &options = ir_kernel.options; + auto &body = ir_kernel.body; + auto &debug_cfg = ir_kernel.debug_cfg; + + ngen::NEOInterfaceHandler interface = generate_ngen_interface( + iface, options, body); + +#define GPU_HW_CASE(hw) \ + l0_gen_t<(hw)> g(iface, options, debug_cfg); \ + g.setInterface(std::move(interface)); \ + convert_ir_to_ngen(body, g); \ + return g.getKernel(ctx, dev); + + GPU_HW_SWITCH(options.hw().ngen_hw()); +#undef GPU_HW_CASE + return {}; +} 
+#endif } // namespace dsl GEMMSTONE_NAMESPACE_END diff --git a/src/gpu/intel/gemm/jit/dsl/runtime.cpp b/src/gpu/intel/gemm/jit/dsl/runtime.cpp index cc1f5c35587..3fdc564b285 100644 --- a/src/gpu/intel/gemm/jit/dsl/runtime.cpp +++ b/src/gpu/intel/gemm/jit/dsl/runtime.cpp @@ -154,4 +154,18 @@ cl_kernel make_kernel( } #endif +#ifdef GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME +std::pair make_kernel( + const GEMMKernelDesc &desc, ze_context_handle_t context, + ze_device_handle_t device) { + if (desc.strategy.isDSLGenerator) { + generator_dsl_desc_t dsl_desc( + desc.problem, desc.strategy, desc.iface, desc.options); + auto dsl_kernel = make_kernel(dsl_desc); + return dsl::make_kernel(dsl_kernel, context, device); + } + stub(); +} +#endif + GEMMSTONE_NAMESPACE_END diff --git a/src/gpu/intel/gemm/jit/include/gemmstone/dsl/runtime.hpp b/src/gpu/intel/gemm/jit/include/gemmstone/dsl/runtime.hpp index ff70527a6d8..7ba9db8e24a 100644 --- a/src/gpu/intel/gemm/jit/include/gemmstone/dsl/runtime.hpp +++ b/src/gpu/intel/gemm/jit/include/gemmstone/dsl/runtime.hpp @@ -48,6 +48,11 @@ ::sycl::kernel make_kernel( #ifdef GEMMSTONE_WITH_OPENCL_RUNTIME cl_kernel make_kernel(const kernel_t &kernel, cl_context ctx, cl_device_id dev); #endif +#ifdef GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME +std::pair make_kernel( + const kernel_t &kernel, ze_context_handle_t ctx, + ze_device_handle_t dev); +#endif } // namespace dsl GEMMSTONE_NAMESPACE_END diff --git a/src/gpu/intel/gemm/jit/include/gemmstone/generator.hpp b/src/gpu/intel/gemm/jit/include/gemmstone/generator.hpp index 4803dcec586..4576dccac05 100644 --- a/src/gpu/intel/gemm/jit/include/gemmstone/generator.hpp +++ b/src/gpu/intel/gemm/jit/include/gemmstone/generator.hpp @@ -46,7 +46,15 @@ GEMMSTONE_NAMESPACE_START #ifndef GENERATOR_BASE #define GENERATOR_SUPER(hw) ngen::OpenCLCodeGenerator +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL +#define FORWARD(hw) NGEN_FORWARD_SYCL(hw); +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #define FORWARD(hw) NGEN_FORWARD_OPENCL(hw) +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#define FORWARD(hw) NGEN_FORWARD_LEVEL_ZERO(hw); +#endif #define GENERATOR_DEBUGINFO {__FILE__, __LINE__} #define GENERATOR_BASE(hw) GENERATOR_SUPER(hw) diff --git a/src/gpu/intel/gemm/jit/include/gemmstone/runtime.hpp b/src/gpu/intel/gemm/jit/include/gemmstone/runtime.hpp index 4d30d8b12d3..760a7eb3127 100644 --- a/src/gpu/intel/gemm/jit/include/gemmstone/runtime.hpp +++ b/src/gpu/intel/gemm/jit/include/gemmstone/runtime.hpp @@ -70,5 +70,10 @@ std::vector make_binary(const GEMMKernelDesc &desc, cl_device_id device cl_kernel make_kernel(const GEMMKernelDesc &desc, cl_device_id device, cl_context context); #endif +#ifdef GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME +std::vector make_binary(const GEMMKernelDesc &desc, ze_device_handle_t device, ze_context_handle_t context); +std::pair make_kernel(const GEMMKernelDesc &desc, ze_device_handle_t device, ze_context_handle_t context); +#endif + GEMMSTONE_NAMESPACE_END #endif diff --git a/src/gpu/intel/jit/binary_format.cpp b/src/gpu/intel/jit/binary_format.cpp index 6460b2e8261..62b03b78afe 100644 --- a/src/gpu/intel/jit/binary_format.cpp +++ b/src/gpu/intel/jit/binary_format.cpp @@ -39,6 +39,16 @@ #define MAGICSIZEY 2 #define MAGICSIZEZ 1 +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL +#define FORWARD(hw) NGEN_FORWARD_SYCL(hw); +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL +#define FORWARD(hw) NGEN_FORWARD_OPENCL(hw) +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#define FORWARD(hw) NGEN_FORWARD_LEVEL_ZERO(hw); +#endif + namespace dnnl 
{ namespace impl { namespace gpu { @@ -49,7 +59,7 @@ using namespace ngen; template class binary_format_kernel_t : public generator_t { - NGEN_FORWARD_OPENCL(hw); + FORWARD(hw); public: binary_format_kernel_t() diff --git a/src/gpu/intel/jit/config/gemmstone_config.hpp b/src/gpu/intel/jit/config/gemmstone_config.hpp index 30b2373eaef..6c96640f44f 100644 --- a/src/gpu/intel/jit/config/gemmstone_config.hpp +++ b/src/gpu/intel/jit/config/gemmstone_config.hpp @@ -40,6 +40,9 @@ #elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #define OPENCL_OUTPUT #define GEMMSTONE_WITH_OPENCL_RUNTIME +#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#define ZEBIN_OUTPUT +#define GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME #endif #if !defined(NDEBUG) || defined(DNNL_DEV_MODE) diff --git a/src/gpu/intel/jit/generator.hpp b/src/gpu/intel/jit/generator.hpp index 778db6d2515..58cab363db6 100644 --- a/src/gpu/intel/jit/generator.hpp +++ b/src/gpu/intel/jit/generator.hpp @@ -43,6 +43,12 @@ #include "ngen_opencl.hpp" #endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/kernel.hpp" +#include "ngen_level_zero.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -90,6 +96,11 @@ template using ngen_code_generator_t = ngen::OpenCLCodeGenerator; #endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +template +using ngen_code_generator_t = ngen::LevelZeroCodeGenerator; +#endif + void check_kernel_size(const std::string &kernel_name, size_t kernel_size, const intel::engine_t *engine); @@ -125,6 +136,15 @@ class generator_t : public ngen_code_generator_t, public generator_base_t { auto ocl_kernel = ngen_code_generator_t::getKernel( ocl_engine->context(), ocl_engine->device()); return ocl::kernel_t::make(kernel, ocl_kernel, {}); +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 + auto *l0_engine = utils::downcast(engine); + auto l0_module_kernel = ngen_code_generator_t::getKernel( + l0_engine->context(), l0_engine->device()); + auto l0_module = std::make_shared( + l0_module_kernel.first); + return l0::kernel_t::make( + kernel, l0_module, l0_module_kernel.second, kernel_name()); #endif } }; diff --git a/src/gpu/intel/l0/CMakeLists.txt b/src/gpu/intel/l0/CMakeLists.txt new file mode 100644 index 00000000000..76df8e68982 --- /dev/null +++ b/src/gpu/intel/l0/CMakeLists.txt @@ -0,0 +1,24 @@ +#=============================================================================== +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +file(GLOB_RECURSE SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp + ) + +set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_l0) +add_library(${OBJ_LIB} OBJECT ${SOURCES}) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/gpu/intel/l0/capi/engine.cpp b/src/gpu/intel/l0/capi/engine.cpp new file mode 100644 index 00000000000..907c3dbbb51 --- /dev/null +++ b/src/gpu/intel/l0/capi/engine.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dnnl/dnnl_l0.h" + +#include "common/utils.hpp" +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/engine_factory.hpp" + +using namespace dnnl::impl; + +dnnl_status_t dnnl_l0_interop_engine_create(dnnl_engine_t *engine, + const ze_driver_handle_t adriver, const ze_device_handle_t adevice, + const ze_context_handle_t acontext) { + bool args_ok = !utils::any_null(engine, adriver, adevice, acontext); + if (!args_ok) return status::invalid_arguments; + + gpu::intel::l0::engine_factory_t f(engine_kind::gpu); + + size_t index; + CHECK(gpu::intel::l0::get_device_index(adevice, &index)); + + return f.engine_create(engine, adriver, adevice, acontext, index); +} + +dnnl_status_t dnnl_l0_interop_engine_get_context( + dnnl_engine_t engine, ze_context_handle_t context) { + bool args_ok = !utils::any_null(engine, context) + && (engine->runtime_kind() == runtime_kind::l0); + if (!args_ok) return status::invalid_arguments; + + auto *l0_engine = utils::downcast(engine); + context = l0_engine->context(); + + return status::success; +} + +dnnl_status_t dnnl_l0_interop_engine_get_device( + dnnl_engine_t engine, ze_device_handle_t device) { + bool args_ok = !utils::any_null(engine, device) + && (engine->runtime_kind() == runtime_kind::l0); + if (!args_ok) return status::invalid_arguments; + + auto *l0_engine = utils::downcast(engine); + device = l0_engine->device(); + + return status::success; +} + +dnnl_status_t dnnl_l0_interop_engine_get_driver( + dnnl_engine_t engine, ze_driver_handle_t driver) { + bool args_ok = !utils::any_null(engine, driver) + && (engine->runtime_kind() == runtime_kind::l0); + if (!args_ok) return status::invalid_arguments; + + auto *l0_engine = utils::downcast(engine); + driver = l0_engine->driver(); + + return status::success; +} diff --git a/src/gpu/intel/l0/capi/memory.cpp b/src/gpu/intel/l0/capi/memory.cpp new file mode 100644 index 00000000000..5eacebb6a3d --- /dev/null +++ b/src/gpu/intel/l0/capi/memory.cpp @@ -0,0 +1,128 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dnnl/dnnl_l0.h" + +#include "common/utils.hpp" +#include "gpu/intel/l0/memory_storage.hpp" + +using namespace dnnl::impl; + +dnnl_status_t DNNL_API dnnl_l0_interop_memory_create(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + void *handle) { + bool ok = !utils::any_null(memory, memory_desc, engine) + && engine->runtime_kind() == runtime_kind::l0; + if (!ok) return status::invalid_arguments; + + auto *l0_engine = utils::downcast(engine); + auto kind = gpu::intel::l0::get_memory_storage_kind( + gpu::intel::l0::get_pointer_type(l0_engine->context(), handle)); + if (handle != DNNL_MEMORY_NONE && handle != DNNL_MEMORY_ALLOCATE + && kind == gpu::intel::l0::memory_storage_kind_t::unknown + && !engine->mayiuse_system_memory_allocators()) + return status::invalid_arguments; + + const auto mdw = memory_desc_wrapper(memory_desc); + if (mdw.format_any() || mdw.has_runtime_dims_or_strides()) + return status::invalid_arguments; + + unsigned flags = (handle == DNNL_MEMORY_ALLOCATE) + ? memory_flags_t::alloc + : memory_flags_t::use_runtime_ptr; + handle = (handle == DNNL_MEMORY_ALLOCATE) ? nullptr : handle; + + std::unique_ptr mem_storage; + mem_storage.reset(new gpu::intel::l0::memory_storage_t( + engine, gpu::intel::l0::memory_storage_kind_t::device)); + if (!mem_storage) return status::out_of_memory; + + CHECK(mem_storage->init( + flags, dnnl_memory_desc_get_size(memory_desc), handle)); + + return safe_ptr_assign( + *memory, new memory_t(engine, memory_desc, std::move(mem_storage))); +} + +dnnl_status_t DNNL_API dnnl_l0_interop_memory_create_v2(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + int nhandles, void **handles) { + bool ok = !utils::any_null(memory, memory_desc, engine, handles) + && nhandles > 0 && engine->runtime_kind() == runtime_kind::l0; + if (!ok) return status::invalid_arguments; + + const auto mdw = memory_desc_wrapper(memory_desc); + if (mdw.format_any() || mdw.has_runtime_dims_or_strides()) + return status::invalid_arguments; + + std::vector flags_vec(nhandles); + std::vector handles_vec(nhandles); + for (int i = 0; i < nhandles; i++) { + unsigned f = (handles[i] == DNNL_MEMORY_ALLOCATE) + ? memory_flags_t::alloc + : memory_flags_t::use_runtime_ptr; + void *h = (handles[i] == DNNL_MEMORY_ALLOCATE) ? 
nullptr : handles[i];
+        flags_vec[i] = f;
+        handles_vec[i] = h;
+    }
+
+    auto *l0_engine = utils::downcast<gpu::intel::l0::engine_t *>(engine);
+    std::vector<std::unique_ptr<memory_storage_t>> mem_storages(nhandles);
+    for (int i = 0; i < nhandles; i++) {
+        auto kind = gpu::intel::l0::get_memory_storage_kind(
+                gpu::intel::l0::get_pointer_type(
+                        l0_engine->context(), handles[i]));
+        if (handles[i] != DNNL_MEMORY_NONE && handles[i] != DNNL_MEMORY_ALLOCATE
+                && kind == gpu::intel::l0::memory_storage_kind_t::unknown
+                && !engine->mayiuse_system_memory_allocators()) {
+            return status::invalid_arguments;
+        }
+        size_t sz = dnnl_memory_desc_get_size_v2(memory_desc, i);
+        mem_storages[i].reset(new gpu::intel::l0::memory_storage_t(
+                engine, gpu::intel::l0::memory_storage_kind_t::device));
+        if (!mem_storages[i]) return status::out_of_memory;
+        CHECK(mem_storages[i]->init(flags_vec[i], sz, handles_vec[i]));
+    }
+
+    return safe_ptr_assign(*memory,
+            new memory_t(engine, memory_desc, std::move(mem_storages)));
+}
+
+dnnl_status_t DNNL_API dnnl_l0_interop_memory_get_mem_object(
+        const memory_t *memory, void **mem_object) {
+    if (utils::any_null(mem_object)) return status::invalid_arguments;
+
+    if (!memory) {
+        *mem_object = nullptr;
+        return status::success;
+    }
+    bool args_ok = (memory->engine()->runtime_kind() == runtime_kind::l0);
+    if (!args_ok) return status::invalid_arguments;
+
+    void *handle;
+    status_t status = memory->get_data_handle(&handle);
+    if (status == status::success) *mem_object = handle;
+
+    return status;
+}
+
+dnnl_status_t DNNL_API dnnl_l0_interop_memory_set_mem_object(
+        memory_t *memory, void *mem_object) {
+    bool args_ok = (memory->engine()->runtime_kind() == runtime_kind::l0);
+    if (!args_ok) return status::invalid_arguments;
+
+    return memory->set_data_handle(mem_object);
+}
diff --git a/src/gpu/intel/l0/capi/primitive.cpp b/src/gpu/intel/l0/capi/primitive.cpp
new file mode 100644
index 00000000000..e48ab655d1d
--- /dev/null
+++ b/src/gpu/intel/l0/capi/primitive.cpp
@@ -0,0 +1,67 @@
+/*******************************************************************************
+* Copyright 2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/ + +#include "oneapi/dnnl/dnnl_l0.h" + +#include "common/primitive_desc_iface.hpp" +#include "common/primitive_iface.hpp" +#include "common/utils.hpp" +#include "gpu/intel/l0/stream.hpp" + +using namespace dnnl::impl; + +dnnl_status_t dnnl_l0_interop_primitive_execute( + const primitive_iface_t *primitive_iface, dnnl_stream_t stream, + int nargs, const dnnl_exec_arg_t *args, int ndeps, + const ze_event_handle_t *deps, ze_event_handle_t *return_event) { + const bool ok = !utils::any_null(primitive_iface, stream) + && primitive_iface->engine() == stream->engine() + && primitive_iface->engine()->runtime_kind() == runtime_kind::l0 + && IMPLICATION(nargs > 0, args != nullptr) + && IMPLICATION(ndeps > 0, deps != nullptr); + if (!ok) return status::invalid_arguments; + + auto *l0_stream = utils::downcast(stream); + stream->before_exec_hook(); + + if (deps != nullptr) { + std::vector events(ndeps); + for (int i = 0; i < ndeps; i++) + events[i] = deps[i]; + l0_stream->l0_ctx().set_deps(events); + } + + // run primitive + exec_args_t exec_args; + CHECK(cvt_primitive_args(primitive_iface->pd()->impl().get(), + static_cast(nargs), args, exec_args)); + + exec_ctx_t ctx(stream, std::move(exec_args)); + CHECK(primitive_execute(primitive_iface, ctx)); + + // return output event + if (return_event != nullptr) { + if (l0_stream->impl()->flags() & stream_flags::in_order) { + *return_event = nullptr; + } else { + *return_event = l0_stream->get_output_event(); + } + } + + stream->after_exec_hook(); + + return status::success; +} diff --git a/src/gpu/intel/l0/capi/stream.cpp b/src/gpu/intel/l0/capi/stream.cpp new file mode 100644 index 00000000000..44a62a29d87 --- /dev/null +++ b/src/gpu/intel/l0/capi/stream.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#include "oneapi/dnnl/dnnl_l0.h"
+
+#include "common/utils.hpp"
+#include "gpu/intel/l0/stream.hpp"
+
+using namespace dnnl::impl;
+
+dnnl_status_t dnnl_l0_interop_stream_create(dnnl_stream_t *stream,
+        dnnl_engine_t engine, ze_command_list_handle_t list, bool profiling) {
+    bool args_ok = !utils::any_null(stream, engine, list)
+            && engine->runtime_kind() == runtime_kind::l0;
+    if (!args_ok) return status::invalid_arguments;
+
+    unsigned flags = stream_flags::default_flags;
+    if (profiling) {
+#ifdef DNNL_EXPERIMENTAL_PROFILING
+        flags |= stream_flags::profiling;
+#endif
+    }
+
+    std::unique_ptr<gpu::intel::l0::stream_impl_t> stream_impl(
+            new gpu::intel::l0::stream_impl_t(flags, list));
+    if (!stream_impl) return status::out_of_memory;
+
+    CHECK(engine->create_stream(stream, stream_impl.get()));
+    stream_impl.release();
+
+    return status::success;
+}
+
+dnnl_status_t dnnl_l0_interop_stream_get_list(
+        dnnl_stream_t stream, ze_command_list_handle_t *list) {
+    bool args_ok = !utils::any_null(list, stream)
+            && stream->engine()->runtime_kind() == runtime_kind::l0;
+    if (!args_ok) return status::invalid_arguments;
+
+    auto *l0_stream = utils::downcast<gpu::intel::l0::stream_t *>(stream);
+    *list = l0_stream->list();
+
+    return status::success;
+}
diff --git a/src/gpu/intel/l0/compiler.hpp b/src/gpu/intel/l0/compiler.hpp
new file mode 100644
index 00000000000..f49659cdca3
--- /dev/null
+++ b/src/gpu/intel/l0/compiler.hpp
@@ -0,0 +1,80 @@
+/*******************************************************************************
+* Copyright 2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/ + +#ifndef GPU_INTEL_L0_COMPILER_HPP +#define GPU_INTEL_L0_COMPILER_HPP + +#include "xpu/ocl/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +static inline cl_program ocl_compile(const cl_device_id device, + const cl_context context, const char *kernel_code, + const char *options) { + cl_int err; + cl_program program = xpu::ocl::clCreateProgramWithSource( + context, 1, &kernel_code, nullptr, &err); + if (err != CL_SUCCESS) return nullptr; + err = xpu::ocl::clBuildProgram( + program, 1, &device, options, nullptr, nullptr); + if (err != CL_SUCCESS) { + xpu::ocl::clReleaseProgram(program); + return nullptr; + } + return program; +} + +inline bool ocl_mayiuse_microkernels(const cl_device_id device, + const cl_context context, const char *kernel_code) { + cl_program program = ocl_compile(device, context, kernel_code, ""); + if (program) { + xpu::ocl::clReleaseProgram(program); + return true; + } + return false; +} + +inline status_t ocl_build_kernels(const cl_device_id device, + const cl_context context, const char *kernel_code, const char *options, + xpu::binary_t &binary) { + cl_program program = ocl_compile(device, context, kernel_code, options); + if (!program) return status::runtime_error; + + size_t binary_size = 0; + OCL_CHECK(xpu::ocl::clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, + sizeof(binary_size), &binary_size, nullptr)); + + binary.resize(binary_size); + auto binary_data = binary.data(); + OCL_CHECK(xpu::ocl::clGetProgramInfo(program, CL_PROGRAM_BINARIES, + sizeof(binary_data), &binary_data, nullptr)); + + OCL_CHECK(xpu::ocl::clReleaseProgram(program)); + + return status::success; +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_COMPILER_HPP diff --git a/src/gpu/intel/l0/context.hpp b/src/gpu/intel/l0/context.hpp new file mode 100644 index 00000000000..902fc4ff778 --- /dev/null +++ b/src/gpu/intel/l0/context.hpp @@ -0,0 +1,103 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_INTEL_L0_CONTEXT_HPP +#define GPU_INTEL_L0_CONTEXT_HPP + +#include "gpu/intel/l0/utils/utils.hpp" +#include "xpu/context.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +struct event_t : public xpu::event_t { + event_t() = default; + event_t(const event_t &) = default; + event_t(const std::vector &event) : events_(event) {} + event_t(std::vector &&event) + : events_(std::move(event)) {} + event_t(ze_event_handle_t &&event) { + events_.emplace_back(std::move(event)); + } + ~event_t() override = default; + + event_t &operator=(event_t &&other) { + std::swap(events_, other.events_); + return *this; + } + event_t &operator=(const event_t &other) { + events_ = other.events_; + return *this; + } + + const ze_event_handle_t &operator[](size_t i) const { return events_[i]; } + ze_event_handle_t &operator[](size_t i) { return events_[i]; } + size_t size() const { return events_.size(); } + + static event_t &from(xpu::event_t &event) { + return *utils::downcast(&event); + } + static const event_t &from(const xpu::event_t &event) { + return *utils::downcast(&event); + } + std::unique_ptr clone() const override { + return std::unique_ptr(new event_t(*this)); + } + void append(const xpu::event_t &event) { + auto &other = *utils::downcast(&event); + events_.insert( + events_.end(), other.events_.begin(), other.events_.end()); + } + + std::vector events_; +}; + +class context_t final : public xpu::context_t { +public: + context_t() = default; + ~context_t() override = default; + + context_t &operator=(const context_t &other) { + events_ = other.events_; + return *this; + } + void set_deps(std::vector &&event) { + events_ = event_t(event); + } + void set_deps(event_t &&events) { events_ = std::move(events); } + + xpu::event_t &get_deps() override { return events_; } + const xpu::event_t &get_deps() const override { return events_; } + void append_deps(const xpu::event_t &event) override { + events_.append(event); + } + + status_t get_event(ze_event_handle_t *new_event); + +private: + event_t events_; +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_CONTEXT_HPP diff --git a/src/gpu/intel/l0/device_info.cpp b/src/gpu/intel/l0/device_info.cpp new file mode 100644 index 00000000000..42b9e042363 --- /dev/null +++ b/src/gpu/intel/l0/device_info.cpp @@ -0,0 +1,148 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#include "gpu/intel/l0/device_info.hpp"
+#include "gpu/intel/l0/compiler.hpp"
+#include "gpu/intel/l0/engine.hpp"
+#include "ngen_level_zero.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace l0 {
+
+status_t device_info_t::init_arch(impl::engine_t *engine) {
+    auto *l0_engine = utils::downcast<l0::engine_t *>(engine);
+    auto context = l0_engine->context();
+    auto device = l0_engine->device();
+
+    return init_gpu_hw_info(engine, device, context, ip_version_, gpu_arch_,
+            gpu_product_, native_extensions_, mayiuse_systolic_,
+            mayiuse_ngen_kernels_);
+}
+
+status_t device_info_t::init_device_name(impl::engine_t *engine) {
+    auto *l0_engine = utils::downcast<l0::engine_t *>(engine);
+    auto device = l0_engine->device();
+
+    ze_device_properties_t device_properties = {};
+    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    device_properties.pNext = nullptr;
+
+    CHECK(l0::zeDeviceGetProperties(device, &device_properties));
+    name_ = std::string(device_properties.name);
+
+    return status::success;
+}
+
+status_t device_info_t::init_runtime_version(impl::engine_t *engine) {
+    auto *l0_engine = utils::downcast<l0::engine_t *>(engine);
+    auto driver = l0_engine->driver();
+
+    ze_driver_properties_t driver_properties = {};
+    driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES;
+    driver_properties.pNext = nullptr;
+
+    l0::zeDriverGetProperties(driver, &driver_properties);
+
+    runtime_version_.major
+            = (driver_properties.driverVersion & 0xFF000000) >> 24;
+    runtime_version_.minor
+            = (driver_properties.driverVersion & 0x00FF0000) >> 16;
+    runtime_version_.build = driver_properties.driverVersion & 0x0000FFFF;
+
+    return status::success;
+}
+
+status_t device_info_t::init_extensions(impl::engine_t *engine) {
+    std::string extension_string;
+    // TODO: using OpenCL runtime because Level Zero runtime does not provide
+    // this information.
+ auto *l0_engine = utils::downcast(engine); + CHECK(xpu::ocl::get_extensions(l0_engine->ocl_device(), extension_string)); + + for (uint64_t i_ext = 1; i_ext < (uint64_t)compute::device_ext_t::last; + i_ext <<= 1) { + const char *s_ext = ext2cl_str((compute::device_ext_t)i_ext); + + if (s_ext && extension_string.find(s_ext) != std::string::npos) { + extensions_ |= i_ext; + } + } + + extensions_ + |= (uint64_t)get_future_extensions(gpu_arch(), mayiuse_systolic()); + + return status::success; +} + +status_t device_info_t::init_attributes(impl::engine_t *engine) { + auto *l0_engine = utils::downcast(engine); + auto device = l0_engine->device(); + + ze_device_properties_t device_properties = {}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + device_properties.pNext = nullptr; + + CHECK(l0::zeDeviceGetProperties(device, &device_properties)); + + eu_count_ = device_properties.numSlices + * device_properties.numSubslicesPerSlice + * device_properties.numEUsPerSubslice; + + ze_device_compute_properties_t device_compute_properties = {}; + device_compute_properties.stype + = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES; + device_compute_properties.pNext = nullptr; + + CHECK(l0::zeDeviceGetComputeProperties(device, &device_compute_properties)); + + max_wg_size_ = device_compute_properties.maxTotalGroupSize; + + uint32_t device_cache_properties_count = 0; + CHECK(l0::zeDeviceGetCacheProperties( + device, &device_cache_properties_count, nullptr)); + + std::vector device_cache_properties( + device_cache_properties_count); + for (ze_device_cache_properties_t &p : device_cache_properties) { + p.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES; + p.pNext = nullptr; + } + + CHECK(l0::zeDeviceGetCacheProperties(device, &device_cache_properties_count, + device_cache_properties.data())); + l3_cache_size_ = device_cache_properties[0].cacheSize; + + ze_device_memory_access_properties_t device_memory_access_properties = {}; + device_memory_access_properties.stype + = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES; + device_memory_access_properties.pNext = nullptr; + + l0::zeDeviceGetMemoryAccessProperties( + device, &device_memory_access_properties); + mayiuse_system_memory_allocators_ + = device_memory_access_properties.sharedSystemAllocCapabilities; + + return status::success; +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/device_info.hpp b/src/gpu/intel/l0/device_info.hpp new file mode 100644 index 00000000000..607ab798a27 --- /dev/null +++ b/src/gpu/intel/l0/device_info.hpp @@ -0,0 +1,44 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_INTEL_L0_DEVICE_INFO_HPP +#define GPU_INTEL_L0_DEVICE_INFO_HPP + +#include "gpu/intel/compute/device_info.hpp" +#include "gpu/intel/l0/utils/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class device_info_t : public compute::device_info_t { +protected: + status_t init_device_name(impl::engine_t *engine) override; + status_t init_arch(impl::engine_t *engine) override; + status_t init_runtime_version(impl::engine_t *engine) override; + status_t init_extensions(impl::engine_t *engine) override; + status_t init_attributes(impl::engine_t *engine) override; +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_DEVICE_INFO_HPP diff --git a/src/gpu/intel/l0/engine.cpp b/src/gpu/intel/l0/engine.cpp new file mode 100644 index 00000000000..043664e132a --- /dev/null +++ b/src/gpu/intel/l0/engine.cpp @@ -0,0 +1,325 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/compiler.hpp" +#include "gpu/intel/l0/device_info.hpp" +#include "gpu/intel/l0/kernel.hpp" +#include "gpu/intel/l0/memory_storage.hpp" +#include "gpu/intel/l0/stream.hpp" + +#include "gemmstone/dsl/runtime.hpp" +#include "gpu/intel/compute/ukernels.hpp" +#include "gpu/intel/jit/generator.hpp" +#include "gpu/intel/microkernels/fuser.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class engine_impl_t : public impl::engine_impl_t { +public: + engine_impl_t(engine_kind_t kind, const ze_driver_handle_t driver, + const ze_device_handle_t device, const ze_context_handle_t context, + size_t index) + : impl::engine_impl_t(kind, runtime_kind::l0, index) + , driver_(driver) + , device_(device) + , context_(context) { + cl_int err; + std::vector ocl_devices; + xpu::ocl::get_devices(&ocl_devices, CL_DEVICE_TYPE_GPU); + + ocl_device_ = nullptr; + ocl_context_ = nullptr; + xpu::device_uuid_t l0_dev_uuid = get_device_uuid(device); + for (const cl_device_id &d : ocl_devices) { + xpu::device_uuid_t ocl_dev_uuid; + xpu::ocl::get_device_uuid(ocl_dev_uuid, d); + if (l0_dev_uuid == ocl_dev_uuid) { + ocl_device_ = xpu::ocl::make_wrapper(d); + ocl_context_ = xpu::ocl::make_wrapper( + xpu::ocl::clCreateContext(nullptr, 1, + &ocl_device_.unwrap(), nullptr, nullptr, &err)); + } + } + } + ~engine_impl_t() override { l0::zeContextDestroy(context_); } + + const ze_driver_handle_t driver() const { return driver_; } + const ze_device_handle_t device() const { return device_; } + const ze_context_handle_t context() const { return context_; } + + const xpu::ocl::wrapper_t ocl_device() const { + return ocl_device_; + } + const 
xpu::ocl::wrapper_t ocl_context() const { + return ocl_context_; + } + + status_t create_stream_impl( + impl::stream_impl_t **stream_impl, unsigned flags) const override { + auto *si = new stream_impl_t(flags, context_, device_); + if (!si) return status::out_of_memory; + + *stream_impl = si; + + return status::success; + } + + status_t create_memory_storage(impl::memory_storage_t **storage, + impl::engine_t *engine, unsigned flags, size_t size, + void *handle) const override { + std::unique_ptr _storage; + _storage.reset( + new memory_storage_t(engine, memory_storage_kind_t::device)); + if (!_storage) return status::out_of_memory; + + status_t status = _storage->init(flags, size, handle); + if (status != status::success) return status; + + *storage = _storage.release(); + + return status::success; + } + + engine_id_t engine_id() const override { + return engine_id_t(new engine_id_impl_t( + device(), context(), kind(), runtime_kind(), index())); + } + + int get_buffer_alignment() const override { return 128; } + +private: + ze_driver_handle_t driver_; + ze_device_handle_t device_; + ze_context_handle_t context_; + + xpu::ocl::wrapper_t ocl_device_; + xpu::ocl::wrapper_t ocl_context_; + + engine_impl_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(engine_impl_t); +}; + +status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind, + const ze_driver_handle_t dri, const ze_device_handle_t dev, + const ze_context_handle_t ctx, size_t index) { + std::unique_ptr e( + (new gpu::intel::l0::engine_t(dri, dev, ctx, index))); + if (!e) return status::out_of_memory; + + CHECK(e->init()); + *engine = e.release(); + + return status::success; +} + +engine_t::engine_t(ze_driver_handle_t driver, ze_device_handle_t device, + ze_context_handle_t context, size_t index) + : gpu::intel::engine_t(new engine_impl_t( + engine_kind::gpu, driver, device, context, index)) {} + +status_t engine_t::init() { + CHECK(init_impl()); + CHECK(gpu::intel::engine_t::init()); + + return status::success; +} + +status_t engine_t::create_stream( + impl::stream_t **stream, impl::stream_impl_t *stream_impl) { + return gpu::intel::l0::stream_t::create_stream(stream, this, stream_impl); +} + +status_t engine_t::create_kernel( + compute::kernel_t *kernel, jit::generator_base_t *jitter) const { + if (kind() != engine_kind::gpu) { + assert(!"not expected"); + return status::invalid_arguments; + } + return jitter->get_kernel(*kernel, this); +} + +status_t engine_t::create_kernel(compute::kernel_t &kernel, + const gemmstone::dsl::kernel_t &kernel_dsl) const { + const auto &module_kernel_pair + = gemmstone::dsl::make_kernel(kernel_dsl, context(), device()); + auto l0_module_ptr + = std::make_shared(module_kernel_pair.first); + + return kernel_t::make(kernel, l0_module_ptr, module_kernel_pair.second, {}); +} + +status_t engine_t::convert_to_l0( + std::vector &kernels, + const std::vector &kernel_names, + xpu::binary_t &binary) const { + ze_module_handle_t l0_module = nullptr; + std::vector l0_kernels; + CHECK(gpu::intel::l0::create_kernels( + device(), context(), kernel_names, binary, &l0_module, l0_kernels)); + auto l0_module_ptr = std::make_shared(l0_module); + + kernels = std::vector(kernel_names.size()); + for (size_t i = 0; i < kernel_names.size(); i++) { + if (!l0_kernels[i]) continue; + CHECK(kernel_t::make( + kernels[i], l0_module_ptr, l0_kernels[i], kernel_names[i])); + } + + return status::success; +} + +status_t engine_t::create_kernels(std::vector *kernels, + const std::vector &kernel_names, + const 
compute::kernel_ctx_t &kernel_ctx) const { + if (kind() != engine_kind::gpu) { + assert(!"not expected"); + return status::invalid_arguments; + } + + const char *source = nullptr; + for (size_t i = 0; source == nullptr && i < kernel_names.size(); i++) + source = intel::get_kernel_source(kernel_names[i]); + + std::string options = kernel_ctx.options(); + auto *dev_info = utils::downcast(device_info()); + options += " " + dev_info->get_cl_ext_options(); + + stringstream_t code_ss; + CHECK(compute::preprocess_headers(code_ss, source, kernel_ctx)); + std::string code = code_ss.str(); + + gpu::intel::compute::program_src_t src(code); + if (src) { options += " -g -s " + std::string(src.name()); } + + compute::debugdump_processed_source( + code, options, dev_info->get_cl_ext_options()); + + const char *code_c = code.c_str(); + xpu::binary_t binary; + if (l0::compile_ocl_module_to_binary( + device(), context(), code, options, binary) + != status::success) + CHECK(ocl_build_kernels( + ocl_device(), ocl_context(), code_c, options.c_str(), binary)); + + if (kernel_ctx.has_custom_headers() && micro::hasMicrokernels(code_c)) { + try { + micro::fuseMicrokernels(binary, code_c); + } catch (...) { return status::runtime_error; } + } + + CHECK(convert_to_l0(*kernels, kernel_names, binary)); + + return status::success; +} + +status_t engine_t::create_kernel_from_binary(compute::kernel_t &kernel, + const xpu::binary_t &binary, const char *kernel_name, + const compute::program_src_t &src) const { + std::vector kernel_names = {kernel_name}; + ze_module_handle_t l0_module = nullptr; + std::vector l0_kernels; + CHECK(gpu::intel::l0::create_kernels( + device(), context(), kernel_names, binary, &l0_module, l0_kernels)); + auto l0_module_ptr = std::make_shared(l0_module); + + CHECK(kernel_t::make(kernel, l0_module_ptr, l0_kernels[0], kernel_name)); + + return status::success; +} + +status_t engine_t::create_kernels_from_cache_blob( + const cache_blob_t &cache_blob, std::vector &kernels, + const std::vector &kernel_names) const { + if (kind() != engine_kind::gpu) { + assert(!"not expected"); + return status::invalid_arguments; + } + + kernels = std::vector(kernel_names.size()); + for (size_t i = 0; i < kernel_names.size(); i++) { + if (!kernel_names[i] && kernel_names.size() > 1) continue; + std::string kernel_name(kernel_names[i] ? 
kernel_names[i] : ""); + + const uint8_t *binary_data = nullptr; + size_t binary_size = 0; + CHECK(cache_blob.get_binary(&binary_data, &binary_size)); + + xpu::binary_t binary(binary_data, binary_data + binary_size); + CHECK(create_kernel_from_binary(kernels[i], binary, kernel_names[i], + gpu::intel::compute::program_src_t())); + } + + return status::success; +} + +gpu_utils::device_id_t engine_t::device_id() const { + return std::tuple_cat( + std::make_tuple(1), gpu::intel::l0::get_device_uuid(device())); +} + +const ze_driver_handle_t engine_t::driver() const { + return static_cast(impl())->driver(); +} + +const ze_device_handle_t engine_t::device() const { + return static_cast(impl())->device(); +} + +const ze_context_handle_t engine_t::context() const { + return static_cast(impl())->context(); +} + +const cl_device_id engine_t::ocl_device() const { + return static_cast(impl())->ocl_device(); +} + +const cl_context engine_t::ocl_context() const { + return static_cast(impl())->ocl_context(); +} + +bool engine_t::mayiuse_microkernels() const { + if (!l0::mayiuse_microkernels(device(), context(), + std::string(compute::cl_microkernels_check_kernel_code))) { + return ocl_mayiuse_microkernels(ocl_device(), ocl_context(), + compute::cl_microkernels_check_kernel_code); + } + return true; +} + +status_t engine_t::init_device_info() { + device_info_ = std::make_shared(); + CHECK(device_info_->init(this)); + + return status::success; +} + +status_t engine_t::init_device_info(const std::vector &cache_blob) { + gpu_assert(false) << "unimplemented function init_device_info() called"; + + return status::runtime_error; +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/engine.hpp b/src/gpu/intel/l0/engine.hpp new file mode 100644 index 00000000000..85bf4fb3538 --- /dev/null +++ b/src/gpu/intel/l0/engine.hpp @@ -0,0 +1,119 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_INTEL_L0_ENGINE_HPP +#define GPU_INTEL_L0_ENGINE_HPP + +#include "gpu/intel/engine.hpp" +#include "gpu/intel/l0/utils/utils.hpp" +#include "xpu/ocl/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +struct engine_id_impl_t : public impl::engine_id_impl_t { + engine_id_impl_t(const ze_device_handle_t device, + const ze_context_handle_t context, engine_kind_t kind, + runtime_kind_t runtime_kind, size_t index) + : impl::engine_id_impl_t(kind, runtime_kind, index) + , device_(device) + , context_(context) {} + ~engine_id_impl_t() override = default; + +private: + bool compare_resource( + const impl::engine_id_impl_t *id_impl) const override { + const auto *typed_id + = utils::downcast(id_impl); + return device_ == typed_id->device_ && context_ == typed_id->context_; + } + + size_t hash_resource() const override { + size_t seed = 0; + seed = hash_combine(seed, device_); + seed = hash_combine(seed, context_); + return seed; + } + + ze_device_handle_t device_; + ze_context_handle_t context_; + + engine_id_impl_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(engine_id_impl_t); +}; + +status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind, + const ze_driver_handle_t dri, const ze_device_handle_t dev, + const ze_context_handle_t ctx, size_t index); + +class engine_t : public intel::engine_t { +public: + engine_t(ze_driver_handle_t driver, ze_device_handle_t device, + ze_context_handle_t context, size_t index); + ~engine_t() override = default; + + status_t init() override; + + status_t create_stream( + impl::stream_t **stream, impl::stream_impl_t *stream_impl) override; + + status_t create_kernel(compute::kernel_t *kernel, + jit::generator_base_t *jitter) const override; + status_t create_kernel(compute::kernel_t &kernel, + const jit::dsl::kernel_t &kernel_ir) const override; + status_t create_kernels(std::vector *kernels, + const std::vector &kernel_names, + const compute::kernel_ctx_t &kernel_ctx) const override; + status_t create_kernel_from_binary(compute::kernel_t &kernel, + const xpu::binary_t &binary, const char *kernel_name, + const compute::program_src_t &src) const override; + status_t create_kernels_from_cache_blob(const cache_blob_t &cache_blob, + std::vector &kernels, + const std::vector &kernel_names) const override; + + gpu::intel::gpu_utils::device_id_t device_id() const override; + + const ze_driver_handle_t driver() const; + const ze_device_handle_t device() const; + const ze_context_handle_t context() const; + + const cl_device_id ocl_device() const; + const cl_context ocl_context() const; + + bool mayiuse_microkernels() const; + +private: + status_t init_device_info() override; + status_t init_device_info(const std::vector &cache_blob) override; + + status_t convert_to_l0(std::vector &kernels, + const std::vector &kernel_names, + xpu::binary_t &binary) const; + + engine_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(engine_t); +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_ENGINE_HPP diff --git a/src/gpu/intel/l0/engine_factory.cpp b/src/gpu/intel/l0/engine_factory.cpp new file mode 100644 index 00000000000..e4a3b67d659 --- /dev/null +++ b/src/gpu/intel/l0/engine_factory.cpp @@ -0,0 +1,88 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under 
the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/intel/l0/engine_factory.hpp" +#include "gpu/intel/l0/engine.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +engine_factory_t::engine_factory_t(engine_kind_t engine_kind) + : engine_kind_(engine_kind) { + assert(utils::one_of(engine_kind_, engine_kind::gpu)); +} + +size_t engine_factory_t::count() const { + uint32_t driver_count = 0; + l0::zeDriverGet(&driver_count, nullptr); + + std::vector drivers(driver_count); + l0::zeDriverGet(&driver_count, drivers.data()); + + uint32_t device_count = 0; + l0::zeDeviceGet(drivers[0], &device_count, nullptr); + + return device_count; +} + +status_t engine_factory_t::engine_create( + impl::engine_t **engine, size_t index) const { + ze_driver_handle_t driver = nullptr; + ze_device_handle_t device = nullptr; + ze_context_handle_t context = nullptr; + + uint32_t driver_count = 0; + CHECK(l0::zeDriverGet(&driver_count, nullptr)); + + std::vector drivers(driver_count); + CHECK(l0::zeDriverGet(&driver_count, drivers.data())); + driver = drivers[0]; + + uint32_t device_count = 0; + CHECK(l0::zeDeviceGet(driver, &device_count, nullptr)); + VERROR_ENGINE(index < device_count, status::invalid_arguments, + "asked for device %zu but only %u devices are found", index, + device_count); + + std::vector devices(device_count); + CHECK(l0::zeDeviceGet(driver, &device_count, devices.data())); + device = devices[index]; + + ze_context_desc_t context_desc = {}; + context_desc.stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC; + context_desc.pNext = nullptr; + context_desc.flags = 0; + + CHECK(l0::zeContextCreate(driver, &context_desc, &context)); + + return engine_create(engine, driver, device, context, index); +} + +status_t engine_factory_t::engine_create(impl::engine_t **engine, + const ze_driver_handle_t driver, const ze_device_handle_t device, + const ze_context_handle_t context, size_t index) const { + return gpu::intel::l0::engine_create( + engine, engine_kind_, driver, device, context, index); +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/engine_factory.hpp b/src/gpu/intel/l0/engine_factory.hpp new file mode 100644 index 00000000000..485eb7c8da1 --- /dev/null +++ b/src/gpu/intel/l0/engine_factory.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_L0_ENGINE_FACTORY_HPP +#define GPU_INTEL_L0_ENGINE_FACTORY_HPP + +#include "common/engine.hpp" +#include "gpu/intel/l0/utils/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class engine_factory_t : public impl::engine_factory_t { +public: + engine_factory_t(engine_kind_t engine_kind); + ~engine_factory_t() override = default; + + size_t count() const override; + status_t engine_create( + impl::engine_t **engine, size_t index) const override; + status_t engine_create(impl::engine_t **engine, + const ze_driver_handle_t adriver, const ze_device_handle_t adevice, + const ze_context_handle_t acontext, size_t index) const; + +private: + engine_kind_t engine_kind_; + + engine_factory_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(engine_factory_t); +}; + +inline std::unique_ptr get_engine_factory( + engine_kind_t engine_kind) { + return std::unique_ptr(new engine_factory_t(engine_kind)); +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_ENGINE_FACTORY_HPP diff --git a/src/gpu/intel/l0/kernel.cpp b/src/gpu/intel/l0/kernel.cpp new file mode 100644 index 00000000000..cb879a5eb13 --- /dev/null +++ b/src/gpu/intel/l0/kernel.cpp @@ -0,0 +1,215 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/intel/l0/kernel.hpp" +#include "gpu/intel/l0/context.hpp" +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/memory_storage.hpp" +#include "gpu/intel/l0/stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +// This class is to get around std::make_shared requirement to have a public +// constructor. We keep the original constructor as private but expose it here +// to use with std::make_shared. +class kernel_compat_t : public kernel_t { +public: + template + kernel_compat_t(Args &&...args) : kernel_t(std::forward(args)...) 
{} +}; + +status_t kernel_t::make(compute::kernel_t &compute_kernel, + const std::shared_ptr module_ptr, + const ze_kernel_handle_t kernel_ptr, const std::string &kernel_name) { + compute_kernel = compute::kernel_t(std::make_shared( + module_ptr, kernel_ptr, kernel_name)); + return status::success; +} + +kernel_t::kernel_t(const std::shared_ptr module_ptr, + const ze_kernel_handle_t kernel_ptr, const std::string &kernel_name) + : module_(module_ptr), kernel_(kernel_ptr), kernel_name_(kernel_name) {} + +kernel_t::~kernel_t() { + l0::zeKernelDestroy(kernel_); +} + +status_t kernel_t::check_alignment( + const compute::kernel_arg_list_t &arg_list) const { + for (int i = 0; i < arg_list.nargs(); ++i) { + auto &arg = arg_list.get(i); + if (!arg.is_global()) continue; + + auto *mem_storage = static_cast(arg.value()); + if (!*mem_storage) continue; + + CHECK(compute::kernel_impl_t::check_alignment( + mem_storage->data_handle(), i)); + } + + return status::success; +} + +status_t kernel_t::set_arg( + int arg_index, size_t arg_size, const void *arg_value) const { + return l0::zeKernelSetArgumentValue( + kernel_, arg_index, arg_size, arg_value); +} + +status_t kernel_t::parallel_for(impl::stream_t &stream, + const compute::nd_range_t &range, + const compute::kernel_arg_list_t &arg_list, const xpu::event_t &deps, + xpu::event_t &out_dep) { + CHECK(check_scalar_arguments(arg_list)); + CHECK(check_alignment(arg_list)); + + auto l0_stream = utils::downcast(&stream); + auto l0_engine = l0_stream->l0_engine(); + auto l0_device_info = l0_engine->device_info(); + + const size_t pointer_size = l0_device_info->device_address_bits() / 8; + + size_t param_bytes = 0; + for (int i = 0; i < arg_list.nargs(); ++i) { + auto &arg = arg_list.get(i); + if (arg.is_global()) { + auto *mem_storage + = static_cast(arg.value()); + if (!mem_storage->is_null()) { + auto memory_storage_ctx + = utils::downcast(mem_storage->engine()) + ->context(); + if (l0_engine->context() != memory_storage_ctx) { + VERROR(primitive, gpu, + "mismatched Level Zero context for " + "primitive/memory"); + return status::invalid_arguments; + } + + void *ptr = mem_storage->ptr(); + CHECK(set_arg(i, pointer_size, &ptr)); + param_bytes += pointer_size; + } else { + CHECK(set_arg(i, pointer_size, nullptr)); + param_bytes += pointer_size; + } + } else if (arg.is_local()) { + CHECK(set_arg(i, arg.size(), arg.value())); + param_bytes += pointer_size; + } else { + CHECK(set_arg(i, arg.size(), arg.value())); + param_bytes += arg.size(); + } + } + if (param_bytes > l0_device_info->max_kernel_param_size()) { + VERROR(primitive, gpu, + "parameter bytes requirements greater than device supports"); + return status::invalid_arguments; + } + + if (range.is_zero()) { return status::success; } + + std::vector global_size(3, 1); + switch (range.global_range().ndims()) { + case 3: global_size[2] = static_cast(range.global_range()[2]); + case 2: global_size[1] = static_cast(range.global_range()[1]); + case 1: + global_size[0] = static_cast(range.global_range()[0]); + break; + default: + VERROR(primitive, gpu, + "incorrect number of global range dimensions"); + return status::invalid_arguments; + } + + std::vector group_size(3, 1); + if (range.local_range()) { + switch (range.local_range().ndims()) { + case 3: + group_size[2] = static_cast(range.local_range()[2]); + case 2: + group_size[1] = static_cast(range.local_range()[1]); + case 1: + group_size[0] = static_cast(range.local_range()[0]); + break; + default: + VERROR(primitive, gpu, + "incorrect number of local range 
dimensions"); + return status::invalid_arguments; + } + } else { + CHECK(l0::zeKernelSuggestGroupSize(kernel_, global_size[0], + global_size[1], global_size[2], &group_size[0], &group_size[1], + &group_size[2])); + } + + for (size_t i = 0; i < global_size.size(); i++) { + if (global_size[i] % group_size[i] != 0) { + VERROR(primitive, gpu, "only uniform work-groups are supported"); + return status::invalid_arguments; + } + } + + CHECK(l0::zeKernelSetGroupSize( + kernel_, group_size[0], group_size[1], group_size[2])); + ze_group_count_t group_count = {global_size[0] / group_size[0], + global_size[1] / group_size[1], global_size[2] / group_size[2]}; + + std::vector l0_deps + = utils::downcast(&deps)->events_; + std::vector l0_out_deps + = utils::downcast(&out_dep)->events_; + + event_ = l0_stream->create_event(); + ze_event_handle_t out_event = *(event_.get()); + + CHECK(l0::zeCommandListAppendLaunchKernel(l0_stream->list(), kernel_, + &group_count, out_event, static_cast(l0_deps.size()), + l0_deps.size() ? l0_deps.data() : nullptr)); + + if (out_event) l0_out_deps.push_back(out_event); + if (stream.is_profiling_enabled()) { + l0_stream->profiler().register_event( + utils::make_unique(std::move(out_event))); + } + + return status::success; +} + +status_t kernel_t::get_kernel_binary(xpu::binary_t &binary) const { + return l0::get_kernel_binary(kernel_, binary); +} + +std::string kernel_t::name() const { + return kernel_name_; +} + +status_t kernel_t::dump() const { + xpu::binary_t binary; + CHECK(get_kernel_binary(binary)); + + return gpu_utils::dump_kernel_binary(binary, kernel_name_); +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/kernel.hpp b/src/gpu/intel/l0/kernel.hpp new file mode 100644 index 00000000000..0a6ef6631cd --- /dev/null +++ b/src/gpu/intel/l0/kernel.hpp @@ -0,0 +1,76 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_INTEL_L0_KERNEL_HPP +#define GPU_INTEL_L0_KERNEL_HPP + +#include + +#include "common/rw_mutex.hpp" +#include "gpu/intel/compute/kernel.hpp" +#include "gpu/intel/l0/utils/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class kernel_t : public compute::kernel_impl_t { +public: + static status_t make(compute::kernel_t &compute_kernel, + const std::shared_ptr module_ptr, + const ze_kernel_handle_t kernel_ptr, + const std::string &kernel_name); + ~kernel_t() override; + + status_t check_alignment( + const compute::kernel_arg_list_t &arg_list) const override; + status_t set_arg( + int arg_index, size_t arg_size, const void *arg_value) const; + status_t parallel_for(impl::stream_t &stream, + const compute::nd_range_t &range, + const compute::kernel_arg_list_t &arg_list, + const xpu::event_t &deps, xpu::event_t &out_dep) override; + + status_t get_kernel_binary(xpu::binary_t &binary) const override; + std::string name() const override; + status_t dump() const override; + +private: + friend class kernel_compat_t; + kernel_t(const std::shared_ptr module_ptr, + const ze_kernel_handle_t kernel_ptr, + const std::string &kernel_name); + + std::shared_ptr module_; + ze_kernel_handle_t kernel_; + std::string kernel_name_; + + std::shared_ptr event_pool_; + std::shared_ptr event_; + + kernel_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(kernel_t); +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_KERNEL_HPP diff --git a/src/gpu/intel/l0/memory_storage.cpp b/src/gpu/intel/l0/memory_storage.cpp new file mode 100644 index 00000000000..e2ff707dc6d --- /dev/null +++ b/src/gpu/intel/l0/memory_storage.cpp @@ -0,0 +1,215 @@ +/******************************************************************************* +* Copyright 2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "gpu/intel/l0/memory_storage.hpp" +#include "common/memory_map_manager.hpp" +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +status_t memory_storage_t::get_data_handle(void **handle) const { + *handle = ptr_.get(); + + return status::success; +} + +status_t memory_storage_t::set_data_handle(void *handle) { + ptr_ = decltype(ptr_)(handle, [](void *) {}); + kind_ = get_memory_storage_kind( + get_pointer_type(l0_engine()->context(), handle)); + + return status::success; +} + +bool memory_storage_t::is_host_accessible() const { + return utils::one_of(kind_, memory_storage_kind_t::host, + memory_storage_kind_t::shared, memory_storage_kind_t::unknown); +} + +struct map_usm_tag; + +status_t memory_storage_t::map_data( + void **mapped_ptr, impl::stream_t *stream, size_t size) const { + if (is_host_accessible()) { + *mapped_ptr = ptr(); + return status::success; + } + + if (!ptr() || size == 0) { + *mapped_ptr = nullptr; + return status::success; + } + + if (!stream) CHECK(engine()->get_service_stream(stream)); + + void *host_ptr = malloc_host(size); + if (!host_ptr) return status::out_of_memory; + + auto leak_guard = decltype(ptr_)(host_ptr, [this](void *p) { free(p); }); + CHECK(memcpy(stream, host_ptr, ptr(), size)); + CHECK(stream->wait()); + leak_guard.release(); + + auto *usm_ptr_for_unmap = ptr(); + auto unmap_callback = [size, usm_ptr_for_unmap, this]( + impl::stream_t *stream, void *mapped_ptr) { + CHECK(memcpy(stream, usm_ptr_for_unmap, mapped_ptr, size)); + CHECK(stream->wait()); + free(mapped_ptr); + + return status::success; + }; + + auto &map_manager = memory_map_manager_t::instance(); + + *mapped_ptr = host_ptr; + + return map_manager.map(this, stream, *mapped_ptr, unmap_callback); +} + +status_t memory_storage_t::unmap_data( + void *mapped_ptr, impl::stream_t *stream) const { + if (!mapped_ptr || is_host_accessible()) return status::success; + + if (!stream) CHECK(engine()->get_service_stream(stream)); + + auto &map_manager = memory_map_manager_t::instance(); + + return map_manager.unmap(this, stream, mapped_ptr); +} + +std::unique_ptr memory_storage_t::get_sub_storage( + size_t offset, size_t size) const { + void *sub_ptr + = ptr_ ? reinterpret_cast(ptr_.get()) + offset : nullptr; + + auto storage = utils::make_unique(engine(), kind_); + if (!storage) return nullptr; + + auto status = storage->init(memory_flags_t::use_runtime_ptr, size, sub_ptr); + if (status != status::success) return nullptr; + + // XXX: Clang has a bug that prevents implicit conversion. + return std::unique_ptr(storage.release()); +} + +std::unique_ptr memory_storage_t::clone() const { + auto storage = utils::make_unique(engine(), kind_); + if (!storage) return nullptr; + + auto status = storage->init(memory_flags_t::use_runtime_ptr, 0, nullptr); + if (status != status::success) return nullptr; + + storage->ptr_ = decltype(ptr_)(ptr_.get(), [](void *) {}); + storage->kind_ = kind_; + + // XXX: Clang has a bug that prevents implicit conversion. 
+ return std::unique_ptr(storage.release()); +} + +status_t memory_storage_t::init_allocate(size_t size) { + if (kind_ == memory_storage_kind_t::unknown) + kind_ = memory_storage_kind_t::device; + + void *ptr_alloc = nullptr; + + switch (kind_) { + case memory_storage_kind_t::host: ptr_alloc = malloc_host(size); break; + case memory_storage_kind_t::device: + ptr_alloc = malloc_device(size); + break; + case memory_storage_kind_t::shared: + ptr_alloc = malloc_shared(size); + break; + default: break; + } + if (!ptr_alloc) return status::out_of_memory; + + ptr_ = decltype(ptr_)(ptr_alloc, [&](void *ptr) { free(ptr); }); + + return status::success; +} + +void *memory_storage_t::malloc_host(size_t size) const { + void *pptr = nullptr; + + ze_host_mem_alloc_desc_t host_mem_alloc_desc = {}; + host_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; + host_mem_alloc_desc.pNext = nullptr; + host_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW; + + l0::zeMemAllocHost( + l0_engine()->context(), &host_mem_alloc_desc, size, 0, &pptr); + + return pptr; +} + +void *memory_storage_t::malloc_device(size_t size) const { + void *pptr = nullptr; + + ze_device_mem_alloc_desc_t device_mem_alloc_desc = {}; + device_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + device_mem_alloc_desc.pNext = nullptr; + device_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW; + device_mem_alloc_desc.ordinal = 0; + + l0::zeMemAllocDevice(l0_engine()->context(), &device_mem_alloc_desc, size, + 0, l0_engine()->device(), &pptr); + + return pptr; +} + +void *memory_storage_t::malloc_shared(size_t size) const { + void *pptr = nullptr; + + ze_device_mem_alloc_desc_t device_mem_alloc_desc = {}; + device_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + device_mem_alloc_desc.pNext = nullptr; + device_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW; + device_mem_alloc_desc.ordinal = 0; + + ze_host_mem_alloc_desc_t host_mem_alloc_desc = {}; + host_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; + host_mem_alloc_desc.pNext = nullptr; + host_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW; + + l0::zeMemAllocShared(l0_engine()->context(), &device_mem_alloc_desc, + &host_mem_alloc_desc, size, 0, l0_engine()->device(), &pptr); + + return pptr; +} + +void memory_storage_t::free(void *ptr) const { + l0::zeMemFree(l0_engine()->context(), ptr); +} + +status_t memory_storage_t::memcpy( + impl::stream_t *stream, void *dst, const void *src, size_t size) const { + auto *l0_stream = utils::downcast(stream); + return l0::zeCommandListAppendMemoryCopy( + l0_stream->list(), dst, src, size, nullptr, 0, nullptr); +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/memory_storage.hpp b/src/gpu/intel/l0/memory_storage.hpp new file mode 100644 index 00000000000..ea994e9ce4f --- /dev/null +++ b/src/gpu/intel/l0/memory_storage.hpp @@ -0,0 +1,91 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_L0_MEMORY_STORAGE_HPP +#define GPU_INTEL_L0_MEMORY_STORAGE_HPP + +#include + +#include "common/c_types_map.hpp" +#include "common/memory_storage.hpp" +#include "common/utils.hpp" + +#include "gpu/intel/l0/engine.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +enum class memory_storage_kind_t { unknown, host, device, shared }; +inline memory_storage_kind_t get_memory_storage_kind(ze_memory_type_t type) { + switch (type) { + case ZE_MEMORY_TYPE_HOST: return memory_storage_kind_t::host; + case ZE_MEMORY_TYPE_DEVICE: return memory_storage_kind_t::device; + case ZE_MEMORY_TYPE_SHARED: return memory_storage_kind_t::shared; + default: return memory_storage_kind_t::unknown; + } +} + +class memory_storage_t : public impl::memory_storage_t { +public: + memory_storage_t(impl::engine_t *engine, memory_storage_kind_t kind) + : impl::memory_storage_t(engine), kind_(kind) {} + + void *ptr() const { return ptr_.get(); } + + status_t get_data_handle(void **handle) const override; + status_t set_data_handle(void *handle) override; + + bool is_host_accessible() const override; + + status_t map_data(void **mapped_ptr, impl::stream_t *stream, + size_t size) const override; + status_t unmap_data( + void *mapped_ptr, impl::stream_t *stream) const override; + + std::unique_ptr get_sub_storage( + size_t offset, size_t size) const override; + std::unique_ptr clone() const override; + +private: + status_t init_allocate(size_t size) override; + + gpu::intel::l0::engine_t *l0_engine() const { + return utils::downcast(engine()); + } + + void *malloc_host(size_t size) const; + void *malloc_device(size_t size) const; + void *malloc_shared(size_t size) const; + void free(void *ptr) const; + status_t memcpy(impl::stream_t *stream, void *dst, const void *src, + size_t size) const; + + std::unique_ptr> ptr_; + memory_storage_kind_t kind_ = memory_storage_kind_t::unknown; + + DNNL_DISALLOW_COPY_AND_ASSIGN(memory_storage_t); +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_MEMORY_STORAGE_HPP diff --git a/src/gpu/intel/l0/stream.cpp b/src/gpu/intel/l0/stream.cpp new file mode 100644 index 00000000000..72392751570 --- /dev/null +++ b/src/gpu/intel/l0/stream.cpp @@ -0,0 +1,233 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#include "gpu/intel/l0/stream.hpp"
+#include "gpu/intel/l0/engine.hpp"
+#include "gpu/intel/l0/stream_profiler.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace l0 {
+
+status_t stream_t::create_stream(impl::stream_t **stream,
+        impl::engine_t *engine, impl::stream_impl_t *stream_impl) {
+    std::unique_ptr<intel::l0::stream_t> s(
+            new intel::l0::stream_t(engine, stream_impl));
+    if (!s) return status::out_of_memory;
+
+    *stream = s.release();
+
+    return status::success;
+}
+
+stream_t::stream_t(impl::engine_t *engine, impl::stream_impl_t *stream_impl)
+    : gpu::intel::stream_t(engine, stream_impl) {
+    if (is_profiling_enabled()) {
+        ze_device_properties_t device_properties = {};
+        device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2;
+        device_properties.pNext = nullptr;
+
+        l0::zeDeviceGetProperties(
+                utils::downcast<engine_t *>(engine)->device(),
+                &device_properties);
+        profiler_ = utils::make_unique<stream_profiler_t>(this,
+                1e9 / device_properties.timerResolution,
+                ~(-1L << device_properties.kernelTimestampValidBits));
+    }
+}
+
+void stream_t::before_exec_hook() {
+    if (is_profiling_enabled()) profiler_->start_profiling();
+}
+
+void stream_t::after_exec_hook() {
+    l0_ctx().set_deps(event_t());
+
+    if (is_profiling_enabled()) profiler_->stop_profiling();
+}
+
+status_t stream_t::reset_profiling() {
+    if (!is_profiling_enabled()) return status::invalid_arguments;
+
+    profiler_->reset();
+
+    return status::success;
+}
+
+status_t stream_t::get_profiling_data(profiling_data_kind_t data_kind,
+        int *num_entries, uint64_t *data) const {
+    if (!is_profiling_enabled()) return status::invalid_arguments;
+
+    return profiler_->get_info(data_kind, num_entries, data);
+}
+
+stream_impl_t::stream_impl_t(unsigned flags, ze_command_list_handle_t list)
+    : impl::stream_impl_t(flags)
+    , allocated_(false)
+    , list_(list)
+    , event_pool_(nullptr) {
+    l0::zeCommandListGetContextHandle(list_, &context_);
+    if (flags & stream_flags::out_of_order || is_profiling_enabled())
+        create_event_pool();
+}
+
+stream_impl_t::stream_impl_t(
+        unsigned flags, ze_context_handle_t context, ze_device_handle_t device)
+    : impl::stream_impl_t(flags)
+    , context_(context)
+    , allocated_(true)
+    , event_pool_(nullptr) {
+    ze_command_queue_desc_t command_queue_desc = {};
+    command_queue_desc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
+    command_queue_desc.pNext = nullptr;
+    command_queue_desc.ordinal = 0;
+    command_queue_desc.index = 0;
+    command_queue_desc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
+    command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT;
+    command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
+
+    l0::zeCommandListCreateImmediate(
+            context_, device, &command_queue_desc, &list_);
+
+    if (flags & stream_flags::out_of_order || is_profiling_enabled())
+        create_event_pool();
+}
+
+void stream_impl_t::create_event_pool() {
+    ze_event_pool_desc_t event_pool_desc = {};
+    event_pool_desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
+    event_pool_desc.pNext = nullptr;
+    event_pool_desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+    if (is_profiling_enabled())
+        event_pool_desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
+    event_pool_desc.count = 16384;
+
+    ze_event_pool_handle_t event_pool;
+    l0::zeEventPoolCreate(context_, &event_pool_desc, 0, nullptr, &event_pool);
+    event_pool_ = std::make_shared<event_pool_wrapper_t>(event_pool);
+}
+
+stream_impl_t::~stream_impl_t() {
+    wait();
+    if (allocated_) l0::zeCommandListDestroy(list_);
+}
+
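+// Context and event handling:
+// - ctx_ is kept in thread-local storage, so each application thread that
+//   submits work through this stream tracks its own dependency events.
+// - Events are only required for out-of-order execution or when profiling is
+//   enabled; they come from the single host-visible pool created in
+//   create_event_pool() and are retained in events_ for the lifetime of the
+//   stream. For in-order streams without profiling, create_event() returns an
+//   empty wrapper and synchronization relies on the in-order immediate
+//   command list alone.
+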
+xpu::context_t &stream_impl_t::ctx() {
+    return l0_ctx();
+}
+
+const xpu::context_t &stream_impl_t::ctx() const {
+    return l0_ctx();
+}
+
+context_t &stream_impl_t::l0_ctx() {
+    const context_t &ctx
+            = const_cast<const stream_impl_t *>(this)->l0_ctx();
+    return *const_cast<context_t *>(&ctx);
+}
+
+const context_t &stream_impl_t::l0_ctx() const {
+    static context_t empty_ctx;
+    return ctx_.get(empty_ctx);
+}
+
+ze_event_handle_t stream_impl_t::get_output_event() const {
+    auto &deps = event_t::from(ctx().get_deps()).events_;
+    if (deps.size()) return deps[0];
+
+    return nullptr;
+}
+
+std::shared_ptr<event_wrapper_t> stream_impl_t::create_event() {
+    if (!event_pool_.get()) return std::make_shared<event_wrapper_t>(nullptr);
+
+    ze_event_desc_t event_desc = {};
+    event_desc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
+    event_desc.pNext = nullptr;
+    event_desc.index = static_cast<uint32_t>(events_.size());
+    event_desc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+    event_desc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
+
+    ze_event_handle_t event;
+    l0::zeEventCreate(*(event_pool_.get()), &event_desc, &event);
+
+    std::shared_ptr<event_wrapper_t> event_ptr
+            = std::make_shared<event_wrapper_t>(event);
+    events_.push_back(event_ptr);
+
+    return event_ptr;
+}
+
+std::shared_ptr<event_pool_wrapper_t> stream_impl_t::get_event_pool() {
+    return event_pool_;
+}
+
+ze_command_list_handle_t stream_impl_t::list() {
+    return list_;
+}
+
+status_t stream_impl_t::wait() {
+    CHECK(l0::zeCommandListHostSynchronize(list_, UINT64_MAX));
+
+    return status::success;
+}
+
+status_t stream_impl_t::barrier() {
+    CHECK(l0::zeCommandListAppendBarrier(list_, nullptr, 0, nullptr));
+
+    return status::success;
+}
+
+status_t stream_impl_t::copy(const impl::memory_storage_t &src,
+        const impl::memory_storage_t &dst, size_t size,
+        const xpu::event_t &deps, xpu::event_t &out_dep) {
+    if (size == 0) return status::success;
+    std::vector<ze_event_handle_t> l0_deps
+            = utils::downcast<const event_t *>(&deps)->events_;
+
+    ze_event_handle_t out_event = *(create_event().get());
+    CHECK(l0::zeCommandListAppendMemoryCopy(list_, dst.data_handle(),
+            src.data_handle(), size, out_event,
+            static_cast<uint32_t>(l0_deps.size()),
+            l0_deps.size() ? l0_deps.data() : nullptr));
+    if (out_event)
+        utils::downcast<event_t *>(&out_dep)->events_.push_back(out_event);
+
+    return status::success;
+}
+
+status_t stream_impl_t::fill(const impl::memory_storage_t &dst, uint8_t pattern,
+        size_t size, const xpu::event_t &deps, xpu::event_t &out_dep) {
+    if (size == 0) return status::success;
+    std::vector<ze_event_handle_t> l0_deps
+            = utils::downcast<const event_t *>(&deps)->events_;
+
+    ze_event_handle_t out_event = *(create_event().get());
+    CHECK(l0::zeCommandListAppendMemoryFill(list_, dst.data_handle(), &pattern,
+            sizeof(pattern), size, out_event,
+            static_cast<uint32_t>(l0_deps.size()),
+            l0_deps.size() ? l0_deps.data() : nullptr));
+    if (out_event)
+        utils::downcast<event_t *>(&out_dep)->events_.push_back(out_event);
+
+    return status::success;
+}
+
+} // namespace l0
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/gpu/intel/l0/stream.hpp b/src/gpu/intel/l0/stream.hpp
new file mode 100644
index 00000000000..f3e70457520
--- /dev/null
+++ b/src/gpu/intel/l0/stream.hpp
@@ -0,0 +1,136 @@
+/*******************************************************************************
+* Copyright 2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_L0_STREAM_HPP +#define GPU_INTEL_L0_STREAM_HPP + +#include + +#include "common/thread_local_storage.hpp" +#include "gpu/intel/l0/context.hpp" +#include "gpu/intel/l0/utils/utils.hpp" +#include "gpu/intel/stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class stream_impl_t : public impl::stream_impl_t { +public: + stream_impl_t(unsigned flags, ze_command_list_handle_t list); + stream_impl_t(unsigned flags, ze_context_handle_t context, + ze_device_handle_t device); + ~stream_impl_t(); + + context_t &l0_ctx(); + const context_t &l0_ctx() const; + xpu::context_t &ctx(); + const xpu::context_t &ctx() const; + ze_event_handle_t get_output_event() const; + std::shared_ptr create_event(); + std::shared_ptr get_event_pool(); + + ze_command_list_handle_t list(); + + status_t wait(); + status_t barrier(); + + status_t copy(const impl::memory_storage_t &src, + const impl::memory_storage_t &dst, size_t size, + const xpu::event_t &deps, xpu::event_t &out_dep); + status_t fill(const impl::memory_storage_t &dst, uint8_t pattern, + size_t size, const xpu::event_t &deps, xpu::event_t &out_dep); + +private: + void create_event_pool(); + + ze_context_handle_t context_; + bool allocated_; + ze_command_list_handle_t list_; + + std::shared_ptr event_pool_; + std::list> events_; + + mutable utils::thread_local_storage_t ctx_; + + stream_impl_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(stream_impl_t); +}; + +class stream_t : public intel::stream_t { +public: + static status_t create_stream(impl::stream_t **stream, + impl::engine_t *engine, impl::stream_impl_t *stream_impl); + + stream_impl_t *impl() const { + return static_cast(impl::stream_t::impl_.get()); + } + + engine_t *l0_engine() const { + return utils::downcast(engine()); + } + + context_t &l0_ctx() { return impl()->l0_ctx(); } + const context_t &l0_ctx() const { return impl()->l0_ctx(); } + xpu::context_t &ctx() override { return impl()->ctx(); } + const xpu::context_t &ctx() const override { return impl()->ctx(); } + ze_event_handle_t get_output_event() const { + return impl()->get_output_event(); + } + std::shared_ptr create_event() { + return impl()->create_event(); + } + std::shared_ptr get_event_pool() { + return impl()->get_event_pool(); + } + + const ze_command_list_handle_t list() const { return impl()->list(); } + + status_t wait() override { return impl()->wait(); } + status_t barrier() override { return impl()->barrier(); } + + void before_exec_hook() override; + void after_exec_hook() override; + status_t reset_profiling() override; + status_t get_profiling_data(profiling_data_kind_t data_kind, + int *num_entries, uint64_t *data) const override; + + status_t copy(const impl::memory_storage_t &src, + const impl::memory_storage_t &dst, size_t size, + const xpu::event_t &deps, xpu::event_t &out_dep) override { + return impl()->copy(src, dst, size, deps, out_dep); + } + status_t fill(const impl::memory_storage_t &dst, uint8_t pattern, + size_t size, const xpu::event_t &deps, + 
xpu::event_t &out_dep) override { + return impl()->fill(dst, pattern, size, deps, out_dep); + } + +private: + stream_t(impl::engine_t *engine, impl::stream_impl_t *stream_impl); + stream_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(stream_t); +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_STREAM_HPP diff --git a/src/gpu/intel/l0/stream_profiler.hpp b/src/gpu/intel/l0/stream_profiler.hpp new file mode 100644 index 00000000000..f872732b41c --- /dev/null +++ b/src/gpu/intel/l0/stream_profiler.hpp @@ -0,0 +1,140 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_L0_STREAM_PROFILER_HPP +#define GPU_INTEL_L0_STREAM_PROFILER_HPP + +#include + +#include "gpu/intel/l0/context.hpp" +#include "xpu/stream_profiler.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class stream_profiler_t : public xpu::stream_profiler_t { +public: + class entry_t { + public: + entry_t() = delete; + + entry_t(ze_kernel_timestamp_result_t &kernel_timestamp_result, + uint64_t max_timestamp_value, double timestamp_freq) + : context_(get_timestamp( + kernel_timestamp_result.context, max_timestamp_value)) + , freq_(timestamp_freq) {} + + uint64_t get_cycles() const { return context_; } + + uint64_t get_nsec() const { return get_cycles() * freq_; } + + private: + uint64_t get_timestamp( + ze_kernel_timestamp_data_t &ts, uint64_t max_timestamp_value) { + return (ts.kernelEnd >= ts.kernelStart) + ? 
(ts.kernelEnd - ts.kernelStart) + : ((max_timestamp_value - ts.kernelStart) + ts.kernelEnd + + 1); + } + + uint64_t context_; + double freq_; + }; + + stream_profiler_t(const impl::stream_t *stream, double timestamp_freq, + uint64_t max_timestamp_value) + : xpu::stream_profiler_t(stream) + , timestamp_freq_(timestamp_freq) + , max_timestamp_value_(max_timestamp_value) {} + + status_t get_info(profiling_data_kind_t data_kind, int *num_entries, + uint64_t *data) const override { + if (!num_entries) return status::invalid_arguments; + + bool is_per_kernel + = (data_kind == profiling_data_kind::time_per_kernel); + if (!data) { + if (is_per_kernel) { + *num_entries = (int)events_.size(); + return status::success; + } + std::unordered_set seen; + for (auto &ev : events_) + seen.insert(ev.stamp); + *num_entries = (int)seen.size(); + return status::success; + } + + std::map stamp2entry; + int idx = 0; + for (auto &ev : events_) { + const l0::event_t &l0_event + = *utils::downcast(ev.event.get()); + + ze_kernel_timestamp_result_t kernel_timestamp_result; + CHECK(l0::zeEventQueryKernelTimestamp( + l0_event[0], &kernel_timestamp_result)); + + entry_t entry(kernel_timestamp_result, max_timestamp_value_, + timestamp_freq_); + if (is_per_kernel) { + data[idx++] = entry.get_nsec(); + continue; + } + stamp2entry.emplace(ev.stamp, entry); + } + if (is_per_kernel) return status::success; + + return get_info_impl(stamp2entry, data_kind, data); + } + +private: + stream_profiler_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(stream_profiler_t); + + status_t get_info_impl(const std::map &stamp2entry, + profiling_data_kind_t data_kind, uint64_t *data) const { + int idx = 0; + for (auto &kv : stamp2entry) { + auto &e = kv.second; + switch ((int)data_kind) { + case profiling_data_kind::time: data[idx] = e.get_nsec(); break; + case profiling_data_kind::cycles: { + data[idx] = e.get_cycles(); + if (callback_) callback_(kv.first, e.get_nsec()); + break; + } + default: assert(!"unexpected data kind"); + } + idx++; + } + return status::success; + } + + double timestamp_freq_; + uint64_t max_timestamp_value_; +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_STREAM_PROFILER_HPP diff --git a/src/gpu/intel/l0/utils/CMakeLists.txt b/src/gpu/intel/l0/utils/CMakeLists.txt new file mode 100644 index 00000000000..a4bd6d8c3a5 --- /dev/null +++ b/src/gpu/intel/l0/utils/CMakeLists.txt @@ -0,0 +1,24 @@ +#=============================================================================== +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +file(GLOB_RECURSE SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp + ) + +set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_l0_utils) +add_library(${OBJ_LIB} OBJECT ${SOURCES}) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/gpu/intel/l0/utils/utils.cpp b/src/gpu/intel/l0/utils/utils.cpp new file mode 100644 index 00000000000..2c2af1ccdd4 --- /dev/null +++ b/src/gpu/intel/l0/utils/utils.cpp @@ -0,0 +1,408 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/intel/l0/utils/utils.hpp" +#include "gpu/intel/jit/binary_format.hpp" +#include "gpu/intel/jit/utils/type_bridge.hpp" +#include "ngen_level_zero.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +event_wrapper_t::event_wrapper_t(ze_event_handle_t event) : event_(event) {} + +event_wrapper_t::~event_wrapper_t() { + if (event_) { + l0::zeEventHostSynchronize(event_, UINT64_MAX); + l0::zeEventDestroy(event_); + } +} + +event_wrapper_t::operator ze_event_handle_t() const { + return event_; +} + +event_pool_wrapper_t::event_pool_wrapper_t(ze_event_pool_handle_t event_pool) + : event_pool_(event_pool) {} + +event_pool_wrapper_t::~event_pool_wrapper_t() { + if (event_pool_) l0::zeEventPoolDestroy(event_pool_); +} + +event_pool_wrapper_t::operator ze_event_pool_handle_t() const { + return event_pool_; +} + +module_wrapper_t::module_wrapper_t(ze_module_handle_t module) + : module_(module) {} + +module_wrapper_t::~module_wrapper_t() { + if (module_) l0::zeModuleDestroy(module_); +} + +module_wrapper_t::operator ze_module_handle_t() const { + return module_; +} + +status_t get_device_ip(ze_device_handle_t device, uint32_t &ip_version) { + ze_device_ip_version_ext_t device_ip_version_ext = {}; + device_ip_version_ext.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT; + device_ip_version_ext.pNext = nullptr; + + ze_device_properties_t device_properties = {}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + device_properties.pNext = &device_ip_version_ext; + + CHECK(l0::zeDeviceGetProperties(device, &device_properties)); + + ip_version = device_ip_version_ext.ipVersion; + + return status::success; +} + +status_t get_l0_device_enabled_systolic_intel( + ze_device_handle_t device, bool &mayiuse_systolic) { + ze_intel_device_module_dp_exp_properties_t + intel_device_module_dp_exp_properties + = {}; + intel_device_module_dp_exp_properties.stype + = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES; + intel_device_module_dp_exp_properties.pNext = nullptr; + + ze_device_module_properties_t device_module_properties = {}; + device_module_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES; + device_module_properties.pNext = &intel_device_module_dp_exp_properties; + + 
CHECK(l0::zeDeviceGetModuleProperties(device, &device_module_properties)); + + mayiuse_systolic = intel_device_module_dp_exp_properties.flags + & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS; + + return status::success; +} + +status_t get_l0_device_enabled_native_float_atomics( + ze_device_handle_t device, uint64_t &native_extensions) { + using namespace gpu::intel::compute; + + ze_float_atomic_ext_properties_t float_atomic_ext_properties = {}; + float_atomic_ext_properties.stype + = ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES; + float_atomic_ext_properties.pNext = nullptr; + + ze_device_module_properties_t device_module_properties = {}; + device_module_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES; + device_module_properties.pNext = &float_atomic_ext_properties; + + CHECK(l0::zeDeviceGetModuleProperties(device, &device_module_properties)); + + ze_device_fp_atomic_ext_flags_t atomic_load_store + = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE + | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE; + ze_device_fp_atomic_ext_flags_t atomic_add + = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD + | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD; + ze_device_fp_atomic_ext_flags_t atomic_min_max + = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX + | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX; + + if ((float_atomic_ext_properties.fp16Flags & atomic_load_store) + == atomic_load_store) + native_extensions |= (uint64_t)native_ext_t::fp16_atomic_load_store; + if ((float_atomic_ext_properties.fp16Flags & atomic_add) == atomic_add) + native_extensions |= (uint64_t)native_ext_t::fp16_atomic_add; + if ((float_atomic_ext_properties.fp16Flags & atomic_min_max) + == atomic_min_max) + native_extensions |= (uint64_t)native_ext_t::fp16_atomic_min_max; + + if ((float_atomic_ext_properties.fp32Flags & atomic_load_store) + == atomic_load_store) + native_extensions |= (uint64_t)native_ext_t::fp32_atomic_load_store; + if ((float_atomic_ext_properties.fp32Flags & atomic_add) == atomic_add) + native_extensions |= (uint64_t)native_ext_t::fp32_atomic_add; + if ((float_atomic_ext_properties.fp32Flags & atomic_min_max) + == atomic_min_max) + native_extensions |= (uint64_t)native_ext_t::fp32_atomic_min_max; + + if ((float_atomic_ext_properties.fp64Flags & atomic_load_store) + == atomic_load_store) + native_extensions |= (uint64_t)native_ext_t::fp64_atomic_load_store; + if ((float_atomic_ext_properties.fp64Flags & atomic_add) == atomic_add) + native_extensions |= (uint64_t)native_ext_t::fp64_atomic_add; + if ((float_atomic_ext_properties.fp64Flags & atomic_min_max) + == atomic_min_max) + native_extensions |= (uint64_t)native_ext_t::fp64_atomic_min_max; + + return status::success; +} + +status_t get_l0_device_eu_count(ze_device_handle_t device, int &eu_count) { + ze_eu_count_ext_t eu_count_ext = {}; + eu_count_ext.stype = ZE_STRUCTURE_TYPE_EU_COUNT_EXT; + eu_count_ext.pNext = nullptr; + + ze_device_properties_t device_properties = {}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + device_properties.pNext = &eu_count_ext; + + CHECK(l0::zeDeviceGetProperties(device, &device_properties)); + + eu_count = eu_count_ext.numTotalEUs; + + return status::success; +} + +status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device, + ze_context_handle_t context, uint32_t &ip_version, + compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product_, + uint64_t &native_extensions, bool &mayiuse_systolic, + bool &mayiuse_ngen_kernels) { + using namespace ngen; + ngen::Product product = 
LevelZeroCodeGenerator::detectHWInfo( + context, device); + + gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family)); + std::memcpy(&product_, &product, sizeof(ngen::Product)); + + mayiuse_systolic = false; + if (get_l0_device_enabled_systolic_intel(device, mayiuse_systolic) + != status::success) + mayiuse_systolic = false; + + /* Some old drivers do not report systolic availability. Manually override + systolic availability based on product family. */ + switch (product.family) { + case ProductFamily::DG2: + case ProductFamily::ARL: + case ProductFamily::PVC: mayiuse_systolic = true; + default: break; + } + + CHECK(get_l0_device_enabled_native_float_atomics( + device, native_extensions)); + + auto status + = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine); + if (status != status::success) mayiuse_ngen_kernels = false; + + ip_version = 0; + + return get_device_ip(device, ip_version); +} + +xpu::device_uuid_t get_device_uuid(const ze_device_handle_t device) { + static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16, + "ZE_MAX_DEVICE_UUID_SIZE is expected to be 16"); + + ze_device_properties_t device_properties = {}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + device_properties.pNext = nullptr; + + auto status = l0::zeDeviceGetProperties(device, &device_properties); + MAYBE_UNUSED(status); + assert(status == status::success); + + const auto &device_id = device_properties.uuid.id; + + uint64_t uuid[ZE_MAX_DEVICE_UUID_SIZE / sizeof(uint64_t)] = {}; + for (size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) { + size_t shift = i % sizeof(uint64_t) * CHAR_BIT; + uuid[i / sizeof(uint64_t)] |= (((uint64_t)device_id[i]) << shift); + } + + return xpu::device_uuid_t(uuid[0], uuid[1]); +} + +status_t get_device_index(const ze_device_handle_t device, size_t *index) { + uint32_t driver_count = 0; + CHECK(l0::zeDriverGet(&driver_count, nullptr)); + + std::vector drivers(driver_count); + CHECK(l0::zeDriverGet(&driver_count, drivers.data())); + + uint32_t device_count = 0; + CHECK(l0::zeDeviceGet(drivers[0], &device_count, nullptr)); + + std::vector devices(device_count); + CHECK(l0::zeDeviceGet(drivers[0], &device_count, devices.data())); + + for (size_t i = 0; i < device_count; i++) { + if (device == devices[i]) { + *index = i; + + return status::success; + } + } + + return status::invalid_arguments; +} + +std::string get_kernel_name(const ze_kernel_handle_t kernel) { + std::string kernel_name; + + size_t kernel_name_size = 0; + l0::zeKernelGetName(kernel, &kernel_name_size, nullptr); + + kernel_name.resize(kernel_name_size, 0); + l0::zeKernelGetName(kernel, &kernel_name_size, &kernel_name[0]); + + // Remove the null terminator as std::string already includes it + kernel_name.resize(kernel_name_size - 1); + + return kernel_name; +} + +status_t get_module_binary( + const ze_module_handle_t module, xpu::binary_t &binary) { + size_t module_binary_size; + CHECK(l0::zeModuleGetNativeBinary(module, &module_binary_size, nullptr)); + + binary.resize(module_binary_size); + CHECK(l0::zeModuleGetNativeBinary( + module, &module_binary_size, binary.data())); + + return status::success; +} + +status_t get_kernel_binary( + const ze_kernel_handle_t kernel, xpu::binary_t &binary) { + size_t binary_size = 0; + CHECK(l0::zeKernelGetBinaryExp(kernel, &binary_size, nullptr)); + + binary.resize(binary_size); + CHECK(l0::zeKernelGetBinaryExp(kernel, &binary_size, binary.data())); + + return status::success; +} + +static inline ze_result_t func_zeModuleCreate(ze_context_handle_t hContext, 
+ ze_device_handle_t hDevice, const ze_module_desc_t *desc, + ze_module_handle_t *phModule, + ze_module_build_log_handle_t *phBuildLog) { + const ze_init_flags_t default_ze_flags = 0; + static auto init_ = find_ze_symbol("zeInit"); + if (!init_) return ZE_RESULT_ERROR_NOT_AVAILABLE; + init_(default_ze_flags); + + static auto f_ + = find_ze_symbol("zeModuleCreate"); + if (!f_) return ZE_RESULT_ERROR_NOT_AVAILABLE; + return f_(hContext, hDevice, desc, phModule, phBuildLog); +} + +#define ZE_MODULE_FORMAT_OCLC (ze_module_format_t)3U +static inline ze_module_handle_t compile_ocl_module( + const ze_device_handle_t device, const ze_context_handle_t context, + const std::string &code, const std::string &options) { + ze_module_desc_t module_desc; + module_desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; + module_desc.pNext = nullptr; + module_desc.format = ZE_MODULE_FORMAT_OCLC; + module_desc.inputSize = code.size(); + module_desc.pInputModule = reinterpret_cast(code.c_str()); + module_desc.pBuildFlags = options.c_str(); + module_desc.pConstants = nullptr; + + ze_module_handle_t module_handle; + ze_module_build_log_handle_t module_build_log_handle; + ze_result_t ret = func_zeModuleCreate(context, device, &module_desc, + &module_handle, &module_build_log_handle); + if (ret != ZE_RESULT_SUCCESS) return nullptr; + return module_handle; +} + +bool mayiuse_microkernels(const ze_device_handle_t device, + const ze_context_handle_t context, const std::string &code) { + ze_module_handle_t module_handle + = compile_ocl_module(device, context, code, ""); + if (module_handle) { + l0::zeModuleDestroy(module_handle); + return true; + } + return false; +} + +status_t compile_ocl_module_to_binary(const ze_device_handle_t device, + const ze_context_handle_t context, const std::string &code, + const std::string &options, xpu::binary_t &binary) { + ze_module_handle_t module_handle + = compile_ocl_module(device, context, code, options); + if (!module_handle) { return status::runtime_error; } + CHECK(l0::get_module_binary(module_handle, binary)); + CHECK(l0::zeModuleDestroy(module_handle)); + + return status::success; +} + +status_t create_kernels(const ze_device_handle_t device, + const ze_context_handle_t context, + const std::vector &kernel_names, + const xpu::binary_t &binary, ze_module_handle_t *module, + std::vector &kernels) { + ze_module_desc_t module_desc; + module_desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; + module_desc.pNext = nullptr; + module_desc.format = ZE_MODULE_FORMAT_NATIVE; + module_desc.inputSize = binary.size(); + module_desc.pInputModule = binary.data(); + module_desc.pBuildFlags = ""; + module_desc.pConstants = nullptr; + + CHECK(l0::zeModuleCreate(context, device, &module_desc, module, nullptr)); + + kernels.resize(kernel_names.size(), nullptr); + for (size_t i = 0; i < kernel_names.size(); i++) { + if (kernel_names[i] == nullptr) { + kernels[i] = nullptr; + continue; + } + + ze_kernel_desc_t kernel_desc = {}; + kernel_desc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC; + kernel_desc.pNext = nullptr; + kernel_desc.flags = 0; + kernel_desc.pKernelName = kernel_names[i]; + + ze_kernel_handle_t kernel; + CHECK(l0::zeKernelCreate(*module, &kernel_desc, &kernel)); + + kernels[i] = kernel; + } + + return status::success; +} + +ze_memory_type_t get_pointer_type( + const ze_context_handle_t context, const void *ptr) { + ze_memory_allocation_properties_t memory_allocation_properties; + memory_allocation_properties.stype + = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES; + memory_allocation_properties.pNext = 
nullptr; + + l0::zeMemGetAllocProperties( + context, ptr, &memory_allocation_properties, nullptr); + + return memory_allocation_properties.type; +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/utils/utils.hpp b/src/gpu/intel/l0/utils/utils.hpp new file mode 100644 index 00000000000..efd47be85dc --- /dev/null +++ b/src/gpu/intel/l0/utils/utils.hpp @@ -0,0 +1,234 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_L0_UTILS_UTILS_HPP +#define GPU_INTEL_L0_UTILS_UTILS_HPP + +#if defined(__linux__) +#include +#elif defined(_WIN32) +#include "windows.h" +#else +#error "Level Zero is supported on Linux and Windows only" +#endif + +#include "gpu/intel/compute/kernel.hpp" + +#include "level_zero/ze_api.h" +#include "level_zero/ze_intel_gpu.h" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +static inline std::string to_string(ze_result_t r) { +#define ZE_STATUS_CASE(status) \ + case status: return #status + switch (r) { + ZE_STATUS_CASE(ZE_RESULT_SUCCESS); + ZE_STATUS_CASE(ZE_RESULT_NOT_READY); + ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_LOST); + ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY); + ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY); + ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET); + ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); + ZE_STATUS_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNINITIALIZED); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SIZE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION); + 
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNKNOWN); + ZE_STATUS_CASE(ZE_RESULT_FORCE_UINT32); + default: return std::to_string((int)r); + } +#undef ZE_STATUS_CASE +} + +#define ZE_CHECK(f) \ + do { \ + ze_result_t res_ = (f); \ + if (res_ != ZE_RESULT_SUCCESS) { \ + std::string err_str_ = to_string(res_); \ + VERROR(common, level_zero, "errcode %s", err_str_.c_str()); \ + return status::runtime_error; \ + } \ + } while (false) + +#if defined(_WIN32) +#define L0_LIB_NAME "ze_loader.dll" +#elif defined(__linux__) +#define L0_LIB_NAME "libze_loader.so.1" +#endif + +template +F find_ze_symbol(const char *symbol) { + return (F)xpu::find_symbol(L0_LIB_NAME, symbol); +} +#undef L0_LIB_NAME + +#define INDIRECT_L0_CALL(f) \ + template \ + status_t f(Args &&...args) { \ + const ze_init_flags_t default_ze_flags = 0; \ + static auto init_ = find_ze_symbol("zeInit"); \ + if (!init_) return status::runtime_error; \ + ZE_CHECK(init_(default_ze_flags)); \ + static auto f_ = find_ze_symbol(#f); \ + if (!f_) return status::runtime_error; \ + ZE_CHECK(f_(std::forward(args)...)); \ + return status::success; \ + } +INDIRECT_L0_CALL(zeDriverGet) +INDIRECT_L0_CALL(zeDriverGetProperties) +INDIRECT_L0_CALL(zeDeviceGet) +INDIRECT_L0_CALL(zeDeviceGetProperties) +INDIRECT_L0_CALL(zeDeviceGetComputeProperties) +INDIRECT_L0_CALL(zeDeviceGetModuleProperties) +INDIRECT_L0_CALL(zeDeviceGetMemoryAccessProperties) +INDIRECT_L0_CALL(zeDeviceGetCacheProperties) +INDIRECT_L0_CALL(zeContextCreate) +INDIRECT_L0_CALL(zeContextDestroy) +INDIRECT_L0_CALL(zeCommandListCreateImmediate) +INDIRECT_L0_CALL(zeCommandListDestroy) +INDIRECT_L0_CALL(zeCommandListHostSynchronize) +INDIRECT_L0_CALL(zeCommandListGetContextHandle) +INDIRECT_L0_CALL(zeCommandListAppendBarrier) +INDIRECT_L0_CALL(zeCommandListAppendMemoryCopy) +INDIRECT_L0_CALL(zeCommandListAppendMemoryFill) +INDIRECT_L0_CALL(zeEventPoolCreate) +INDIRECT_L0_CALL(zeEventPoolDestroy) +INDIRECT_L0_CALL(zeEventCreate) +INDIRECT_L0_CALL(zeEventDestroy) +INDIRECT_L0_CALL(zeEventHostSynchronize) +INDIRECT_L0_CALL(zeEventQueryKernelTimestamp) +INDIRECT_L0_CALL(zeMemAllocShared) +INDIRECT_L0_CALL(zeMemAllocDevice) +INDIRECT_L0_CALL(zeMemAllocHost) +INDIRECT_L0_CALL(zeMemFree) +INDIRECT_L0_CALL(zeMemGetAllocProperties) +INDIRECT_L0_CALL(zeModuleCreate) +INDIRECT_L0_CALL(zeModuleDestroy) +INDIRECT_L0_CALL(zeModuleBuildLogGetString) +INDIRECT_L0_CALL(zeModuleBuildLogDestroy) +INDIRECT_L0_CALL(zeModuleGetNativeBinary) +INDIRECT_L0_CALL(zeKernelCreate) +INDIRECT_L0_CALL(zeKernelDestroy) +INDIRECT_L0_CALL(zeKernelSetArgumentValue) +INDIRECT_L0_CALL(zeKernelGetName) +INDIRECT_L0_CALL(zeKernelGetBinaryExp) +INDIRECT_L0_CALL(zeKernelSetGroupSize) +INDIRECT_L0_CALL(zeKernelSuggestGroupSize) +INDIRECT_L0_CALL(zeCommandListAppendLaunchKernel) +#undef INDIRECT_L0_CALL + +class event_wrapper_t { +public: + event_wrapper_t(ze_event_handle_t event); + ~event_wrapper_t(); + operator ze_event_handle_t() const; + +private: + ze_event_handle_t event_; + + event_wrapper_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(event_wrapper_t); +}; + +class event_pool_wrapper_t { +public: + event_pool_wrapper_t(ze_event_pool_handle_t event_pool); + 
~event_pool_wrapper_t(); + operator ze_event_pool_handle_t() const; + +private: + ze_event_pool_handle_t event_pool_; + + event_pool_wrapper_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(event_pool_wrapper_t); +}; + +class module_wrapper_t { +public: + module_wrapper_t(ze_module_handle_t module); + ~module_wrapper_t(); + operator ze_module_handle_t() const; + +private: + ze_module_handle_t module_; + + module_wrapper_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(module_wrapper_t); +}; + +status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device, + ze_context_handle_t context, uint32_t &ip_version, + compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product, + uint64_t &native_extensions, bool &mayiuse_systolic, + bool &mayiuse_ngen_kernels); +xpu::device_uuid_t get_device_uuid(const ze_device_handle_t device); +status_t get_device_index(const ze_device_handle_t device, size_t *index); +std::string get_kernel_name(const ze_kernel_handle_t kernel); +status_t get_module_binary( + const ze_module_handle_t module, xpu::binary_t &binary); +status_t get_kernel_binary( + const ze_kernel_handle_t kernel, xpu::binary_t &binary); +bool mayiuse_microkernels(const ze_device_handle_t device, + const ze_context_handle_t context, const std::string &code); +status_t compile_ocl_module_to_binary(const ze_device_handle_t device, + const ze_context_handle_t context, const std::string &code, + const std::string &options, xpu::binary_t &binary); +status_t create_kernels(const ze_device_handle_t device, + const ze_context_handle_t context, + const std::vector &kernel_names, + const xpu::binary_t &binary, ze_module_handle_t *module, + std::vector &kernels); +ze_memory_type_t get_pointer_type(const ze_context_handle_t, const void *ptr); + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_UTILS_UTILS_HPP diff --git a/src/gpu/intel/ocl/CMakeLists.txt b/src/gpu/intel/ocl/CMakeLists.txt index b5e057a405e..2423aee56c0 100644 --- a/src/gpu/intel/ocl/CMakeLists.txt +++ b/src/gpu/intel/ocl/CMakeLists.txt @@ -15,9 +15,7 @@ #=============================================================================== file(GLOB_RECURSE SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/*.h ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.c ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ) diff --git a/src/gpu/intel/ocl/device_info.cpp b/src/gpu/intel/ocl/device_info.cpp index 7273a058c81..e55be6d92f5 100644 --- a/src/gpu/intel/ocl/device_info.cpp +++ b/src/gpu/intel/ocl/device_info.cpp @@ -92,19 +92,11 @@ status_t device_info_t::init_runtime_version(impl::engine_t *engine) { } status_t device_info_t::init_extensions(impl::engine_t *engine) { - cl_int err = CL_SUCCESS; auto device = utils::downcast(engine)->device(); // query device for extensions - size_t param_size = 0; - err = xpu::ocl::clGetDeviceInfo( - device, CL_DEVICE_EXTENSIONS, 0, nullptr, ¶m_size); - OCL_CHECK(err); - - std::string extension_string(param_size, '\0'); - err = xpu::ocl::clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, param_size, - &extension_string[0], ¶m_size); - OCL_CHECK(err); + std::string extension_string; + CHECK(xpu::ocl::get_extensions(device, extension_string)); // convert to ours using namespace compute; diff --git a/src/gpu/intel/ocl/device_info.hpp b/src/gpu/intel/ocl/device_info.hpp index eb43584810d..eeb55b70951 100644 --- a/src/gpu/intel/ocl/device_info.hpp +++ b/src/gpu/intel/ocl/device_info.hpp @@ -22,7 +22,7 @@ #include #include 
"gpu/intel/compute/device_info.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/intel/ocl/engine.cpp b/src/gpu/intel/ocl/engine.cpp index c3bf7b991b3..a3cbbdfbd7a 100644 --- a/src/gpu/intel/ocl/engine.cpp +++ b/src/gpu/intel/ocl/engine.cpp @@ -31,7 +31,7 @@ #include "gpu/intel/ocl/device_info.hpp" #include "gpu/intel/ocl/kernel.hpp" #include "gpu/intel/ocl/stream.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { @@ -39,33 +39,6 @@ namespace gpu { namespace intel { namespace ocl { -status_t preprocess_headers(stringstream_t &pp_code, const char *code, - const compute::kernel_ctx_t &kernel_ctx) { - stringstream_t code_stream(code); - - for (std::string line; std::getline(code_stream, line);) { - const size_t include_pos = line.find("#include"); - if (include_pos != std::string::npos) { - static constexpr size_t include_len = 8; - const size_t first_quote_pos - = line.find("\"", include_pos + include_len); - const size_t second_quote_pos - = line.find("\"", first_quote_pos + 1); - const size_t kernel_name_len - = second_quote_pos - first_quote_pos - 1; - const auto header_name - = line.substr(first_quote_pos + 1, kernel_name_len); - const char *header_source - = kernel_ctx.get_custom_header(header_name); - if (!header_source) header_source = get_kernel_header(header_name); - CHECK(preprocess_headers(pp_code, header_source, kernel_ctx)); - } else { - pp_code << line << std::endl; - } - } - return status::success; -} - status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind, cl_device_id dev, cl_context ctx, size_t index, const std::vector &cache_blob) { @@ -331,14 +304,14 @@ status_t engine_t::build_program_from_source( // `clCompileProgram` `clBuildProgram` doesn't take headers. Because of // that, a manual preprocessing of `include` header directives in the // OpenCL kernels is required. 
- CHECK(preprocess_headers(pp_code, code_string, kernel_ctx)); + CHECK(compute::preprocess_headers(pp_code, code_string, kernel_ctx)); std::string pp_code_str = pp_code.str(); const char *pp_code_str_ptr = pp_code_str.c_str(); src = {pp_code_str}; if (src) { options += " -g -s " + std::string(src.name()); } - debugdump_processed_source( + compute::debugdump_processed_source( pp_code_str, options, dev_info->get_cl_ext_options()); auto ctx = context(); diff --git a/src/gpu/intel/ocl/engine.hpp b/src/gpu/intel/ocl/engine.hpp index 7d4273dc7c9..00543c476f7 100644 --- a/src/gpu/intel/ocl/engine.hpp +++ b/src/gpu/intel/ocl/engine.hpp @@ -21,6 +21,7 @@ #include "common/utils.hpp" #include "gpu/gpu_impl_list.hpp" #include "gpu/intel/engine.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "xpu/ocl/engine_impl.hpp" #include "xpu/utils.hpp" @@ -30,9 +31,6 @@ namespace gpu { namespace intel { namespace ocl { -status_t preprocess_headers(stringstream_t &pp_code, const char *code, - const compute::kernel_ctx_t &kernel_ctx); - status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind, cl_device_id dev, cl_context ctx, size_t index, const std::vector &cache_blob); diff --git a/src/gpu/intel/ocl/hw_info.cpp b/src/gpu/intel/ocl/hw_info.cpp index 7872e1b2abb..187687276dd 100644 --- a/src/gpu/intel/ocl/hw_info.cpp +++ b/src/gpu/intel/ocl/hw_info.cpp @@ -15,7 +15,7 @@ *******************************************************************************/ #include "gpu/intel/ocl/hw_info.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "gpu/intel/jit/binary_format.hpp" #include "gpu/intel/jit/generator.hpp" diff --git a/src/gpu/intel/ocl/kernel.cpp b/src/gpu/intel/ocl/kernel.cpp index 7304338d01f..17bac4daaa3 100644 --- a/src/gpu/intel/ocl/kernel.cpp +++ b/src/gpu/intel/ocl/kernel.cpp @@ -31,7 +31,7 @@ #include "gpu/intel/ocl/engine.hpp" #include "gpu/intel/ocl/stream.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/intel/ocl/mdapi_utils.cpp b/src/gpu/intel/ocl/mdapi_utils.cpp index 888f1475acd..732a1897bee 100644 --- a/src/gpu/intel/ocl/mdapi_utils.cpp +++ b/src/gpu/intel/ocl/mdapi_utils.cpp @@ -28,7 +28,7 @@ #include #include -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "mdapi/metrics_discovery_api.h" #ifndef CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL diff --git a/src/gpu/intel/ocl/stream.cpp b/src/gpu/intel/ocl/stream.cpp index a086bd20084..213fdd76732 100644 --- a/src/gpu/intel/ocl/stream.cpp +++ b/src/gpu/intel/ocl/stream.cpp @@ -26,7 +26,7 @@ #include "gpu/intel/ocl/engine.hpp" #include "gpu/intel/ocl/stream.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/intel/ocl/stream.hpp b/src/gpu/intel/ocl/stream.hpp index 56530fbf072..1da17c0b553 100644 --- a/src/gpu/intel/ocl/stream.hpp +++ b/src/gpu/intel/ocl/stream.hpp @@ -26,9 +26,11 @@ #include "xpu/ocl/context.hpp" #include "xpu/ocl/stream_impl.hpp" -#include "gpu/intel/ocl/mdapi_utils.hpp" #include "gpu/intel/stream.hpp" +#include "gpu/intel/ocl/mdapi_utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" + namespace dnnl { namespace impl { namespace gpu { diff --git a/src/gpu/intel/ocl/utils/CMakeLists.txt b/src/gpu/intel/ocl/utils/CMakeLists.txt new file mode 100644 index 00000000000..9cca3dd28ac --- /dev/null +++ b/src/gpu/intel/ocl/utils/CMakeLists.txt @@ -0,0 +1,24 @@ 
+#=============================================================================== +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +file(GLOB_RECURSE SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp + ) + +set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_ocl_utils) +add_library(${OBJ_LIB} OBJECT ${SOURCES}) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/gpu/intel/ocl/utils.cpp b/src/gpu/intel/ocl/utils/utils.cpp similarity index 82% rename from src/gpu/intel/ocl/utils.cpp rename to src/gpu/intel/ocl/utils/utils.cpp index 5460334e45e..57e456fddc3 100644 --- a/src/gpu/intel/ocl/utils.cpp +++ b/src/gpu/intel/ocl/utils/utils.cpp @@ -23,7 +23,7 @@ #include "gpu/intel/ocl/engine.hpp" #include "gpu/intel/ocl/hw_info.hpp" #include "gpu/intel/ocl/kernel.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "xpu/ocl/utils.hpp" #ifndef CL_KERNEL_BINARY_PROGRAM_INTEL @@ -219,70 +219,6 @@ status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary) { return status::success; } -void debugdump_processed_source(const std::string &source, - const std::string &options, const std::string &cl_options) { -#if defined(__linux__) && defined(DNNL_DEV_MODE) - if (get_verbose(verbose_t::debuginfo) >= 10) { - auto get_defines = [](const std::string &from) { - std::string ret; - size_t pos = 0; - while (pos < from.length()) { - // Find next define argument - pos = from.find("-D", pos); - - // Generate argument, quotes are interpreted literally, but - // other special shell characters need escaped. Does not - // currently handle quotes with the ' character or nested quotes - char quote_parity = true; - while (pos < from.length()) { - if (quote_parity - && utils::one_of(from[pos], '~', '#', '$', '&', '*', - '(', ')', '\\', '|', '[', ']', '{', '}', - ';', '\'', '<', '>', '/', '?', '!')) { - ret += '\\'; - } - ret += from[pos]; - if (from[pos] == '"') quote_parity ^= true; - if (from[pos] == ' ' && quote_parity) break; - - pos++; - } - } - return ret; - }; - auto execute_command - = [](const std::string &cmd, const std::string &stdin) { - std::string result; - std::array buffer; - FILE *pipe = popen(cmd.c_str(), "w"); - fputs(stdin.c_str(), pipe); - if (pipe) { - while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { - result += buffer.data(); - } - } - pclose(pipe); - return result; - }; - - // Run utilities to evaluate preprocessor defines and format the file - // Theoretically, we can accomplish this task with libclang, but it - // seems more work than it is worth. Instead, wrapping this in OCL_DEBUG - // so that calls to the system are not included in the default build. 
- - // Due to the use of a different C preprocessor, warnings should not be - // ignored, as they may correspond to a different behavior in the OpenCL - // C preprocessor - auto o = get_defines(options) + get_defines(cl_options); - std::string preprocess_cmd - = std::string() + "cpp -P " + o + " | clang-format"; - execute_command(preprocess_cmd, source); - std::cout << "OCL_ARCH_OPTIONS: " << cl_options << std::endl; - std::cout << "OCL_OPTIONS: " << options << std::endl; - } -#endif -} - status_t get_kernel_arg_types(cl_kernel ocl_kernel, std::vector *arg_types) { cl_uint nargs; diff --git a/src/gpu/intel/ocl/utils.hpp b/src/gpu/intel/ocl/utils/utils.hpp similarity index 92% rename from src/gpu/intel/ocl/utils.hpp rename to src/gpu/intel/ocl/utils/utils.hpp index 09e4354536b..1746f2bb889 100644 --- a/src/gpu/intel/ocl/utils.hpp +++ b/src/gpu/intel/ocl/utils/utils.hpp @@ -14,8 +14,8 @@ * limitations under the License. *******************************************************************************/ -#ifndef GPU_INTEL_OCL_UTILS_HPP -#define GPU_INTEL_OCL_UTILS_HPP +#ifndef GPU_INTEL_OCL_UTILS_UTILS_HPP +#define GPU_INTEL_OCL_UTILS_UTILS_HPP #include #include @@ -55,9 +55,6 @@ status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary); status_t get_ocl_program_binary_size( cl_kernel kernel, cl_device_id device, size_t *size); -void debugdump_processed_source(const std::string &source, - const std::string &options, const std::string &ocl_options); - status_t get_kernel_arg_types(cl_kernel ocl_kernel, std::vector *arg_types); @@ -76,4 +73,4 @@ status_t get_ocl_device_enabled_native_float_atomics( } // namespace impl } // namespace dnnl -#endif +#endif // GPU_INTEL_OCL_UTILS_UTILS_HPP diff --git a/src/gpu/intel/sycl/compat.cpp b/src/gpu/intel/sycl/compat.cpp index 4227c23058c..d3c6c0613e8 100644 --- a/src/gpu/intel/sycl/compat.cpp +++ b/src/gpu/intel/sycl/compat.cpp @@ -35,7 +35,7 @@ #include "gpu/intel/compute/device_info.hpp" #include "gpu/intel/sycl/compat.hpp" #include "gpu/intel/sycl/engine.hpp" -#include "gpu/intel/sycl/l0/utils.hpp" +#include "gpu/intel/sycl/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/intel/sycl/device_info.cpp b/src/gpu/intel/sycl/device_info.cpp index 333db8b6372..99ac9da8706 100644 --- a/src/gpu/intel/sycl/device_info.cpp +++ b/src/gpu/intel/sycl/device_info.cpp @@ -19,11 +19,11 @@ #include "gpu/intel/sycl/compat.hpp" #include "gpu/intel/sycl/device_info.hpp" #include "gpu/intel/sycl/engine.hpp" -#include "gpu/intel/sycl/l0/utils.hpp" -#include "gpu/intel/sycl/utils.hpp" #include "gpu/intel/ocl/hw_info.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" + +#include "gpu/intel/l0/utils/utils.hpp" namespace dnnl { namespace impl { @@ -56,7 +56,7 @@ status_t device_info_t::init_arch(impl::engine_t *engine) { auto ze_dev = xpu::sycl::compat::get_native(device); auto ze_ctx = xpu::sycl::compat::get_native(ctx); - status = gpu::intel::sycl::init_gpu_hw_info(engine, ze_dev, ze_ctx, + status = gpu::intel::l0::init_gpu_hw_info(engine, ze_dev, ze_ctx, ip_version_, gpu_arch_, gpu_product_, native_extensions_, mayiuse_systolic_, mayiuse_ngen_kernels_); } else { diff --git a/src/gpu/intel/sycl/engine.cpp b/src/gpu/intel/sycl/engine.cpp index 19a6e92e4b2..ee0fa18798a 100644 --- a/src/gpu/intel/sycl/engine.cpp +++ b/src/gpu/intel/sycl/engine.cpp @@ -119,13 +119,20 @@ status_t engine_t::create_kernels( "No OpenCL source was found for kernel"); stringstream_t pp_code; - 
CHECK(gpu::intel::ocl::preprocess_headers(pp_code, source, kernel_ctx)); + CHECK(compute::preprocess_headers(pp_code, source, kernel_ctx)); + std::string code_str = pp_code.str(); std::string build_options = kernel_ctx.options(); build_options += " " + device_info()->get_cl_ext_options(); + gpu::intel::compute::program_src_t src(code_str); + if (src) { build_options += " -g -s " + std::string(src.name()); } + + compute::debugdump_processed_source( + code_str, build_options, device_info()->get_cl_ext_options()); + auto kb_src = syclex::create_kernel_bundle_from_source( - context(), syclex::source_language::opencl, pp_code.str()); + context(), syclex::source_language::opencl, code_str); auto kb_exe = syclex::build( kb_src, syclex::properties {syclex::build_options(build_options)}); *kernels = std::vector(kernel_names.size()); @@ -133,8 +140,7 @@ status_t engine_t::create_kernels( if (!kernel_names[i]) continue; CHECK(interop_kernel_t::make((*kernels)[i], - kb_exe.ext_oneapi_get_kernel(kernel_names[i]), - gpu::intel::compute::program_src_t(pp_code.str()))); + kb_exe.ext_oneapi_get_kernel(kernel_names[i]), src)); } return status::success; diff --git a/src/gpu/intel/sycl/engine.hpp b/src/gpu/intel/sycl/engine.hpp index 5a9ddd78874..58b42f6453c 100644 --- a/src/gpu/intel/sycl/engine.hpp +++ b/src/gpu/intel/sycl/engine.hpp @@ -31,7 +31,7 @@ #include "gpu/intel/ocl/engine.hpp" #include "gpu/intel/ocl/kernel.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "gpu/intel/sycl/compat.hpp" #include "gpu/intel/sycl/utils.hpp" diff --git a/src/gpu/intel/sycl/interop_kernel.cpp b/src/gpu/intel/sycl/interop_kernel.cpp index 960b2412265..eedef84aacc 100644 --- a/src/gpu/intel/sycl/interop_kernel.cpp +++ b/src/gpu/intel/sycl/interop_kernel.cpp @@ -19,8 +19,7 @@ #include "common/verbose.hpp" #include "gpu/intel/compute/types_interop.hpp" #include "gpu/intel/compute/utils.hpp" -#include "gpu/intel/ocl/utils.hpp" -#include "gpu/intel/sycl/l0/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "gpu/intel/sycl/stream.hpp" #include "gpu/intel/sycl/utils.hpp" #include "xpu/sycl/c_types_map.hpp" diff --git a/src/gpu/intel/sycl/l0/utils.cpp b/src/gpu/intel/sycl/l0/utils.cpp deleted file mode 100644 index 28a7a44f09b..00000000000 --- a/src/gpu/intel/sycl/l0/utils.cpp +++ /dev/null @@ -1,370 +0,0 @@ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "gpu/intel/sycl/l0/utils.hpp" -#include "oneapi/dnnl/dnnl_config.h" - -#include "gpu/intel/jit/binary_format.hpp" -#include "gpu/intel/jit/utils/type_bridge.hpp" -#include "ngen_level_zero.hpp" - -#include "level_zero/ze_api.h" -#include "level_zero/ze_intel_gpu.h" - -#if !defined(__SYCL_COMPILER_VERSION) -#error "Unsupported compiler" -#endif - -#if (__SYCL_COMPILER_VERSION < 20200818) -#error "Level Zero is not supported with this compiler version" -#endif - -#include "common/c_types_map.hpp" -#include "common/verbose.hpp" - -#include "gpu/intel/sycl/utils.hpp" -#include - -#include "gpu/intel/sycl/engine.hpp" - -namespace dnnl { -namespace impl { -namespace gpu { -namespace intel { -namespace sycl { -namespace l0 { - -std::string to_string(ze_result_t r) { -#define ZE_STATUS_CASE(status) \ - case status: return #status - switch (r) { - ZE_STATUS_CASE(ZE_RESULT_SUCCESS); - ZE_STATUS_CASE(ZE_RESULT_NOT_READY); - ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_LOST); - ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY); - ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY); - ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET); - ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); - ZE_STATUS_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNINITIALIZED); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SIZE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNKNOWN); - ZE_STATUS_CASE(ZE_RESULT_FORCE_UINT32); - default: return std::to_string((int)r); - } -#undef ZE_STATUS_CASE -} - -#define ZE_CHECK(f) \ - do { \ - ze_result_t res_ = (f); \ - if (res_ != ZE_RESULT_SUCCESS) { \ - std::string err_str_ = to_string(res_); \ - VERROR(common, level_zero, "errcode %s", err_str_.c_str()); \ - return status::runtime_error; \ - } \ - } while (false) - -#if defined(_WIN32) -#define L0_LIB_NAME "ze_loader.dll" 
-#elif defined(__linux__) -#define L0_LIB_NAME "libze_loader.so.1" -#endif - -template -F find_ze_symbol(const char *symbol) { - return (F)xpu::find_symbol(L0_LIB_NAME, symbol); -} -#undef L0_LIB_NAME - -#define INDIRECT_L0_CALL(f) \ - template \ - status_t f(Args &&...args) { \ - const ze_init_flags_t default_ze_flags = 0; \ - static auto init_ = find_ze_symbol("zeInit"); \ - if (!init_) return status::runtime_error; \ - ZE_CHECK(init_(default_ze_flags)); \ - static auto f_ = find_ze_symbol(#f); \ - if (!f_) return status::runtime_error; \ - ZE_CHECK(f_(std::forward(args)...)); \ - return status::success; \ - } -INDIRECT_L0_CALL(zeModuleCreate) -INDIRECT_L0_CALL(zeDeviceGetProperties) -INDIRECT_L0_CALL(zeDeviceGetModuleProperties) -INDIRECT_L0_CALL(zeKernelCreate) -INDIRECT_L0_CALL(zeKernelGetBinaryExp) -INDIRECT_L0_CALL(zeModuleGetNativeBinary) -#undef INDIRECT_L0_CALL - -} // namespace l0 - -// FIXME: Currently SYCL doesn't provide any API to get device UUID so -// we query it directly from Level0 with the zeDeviceGetProperties function. -// The `get_device_uuid` function packs 128 bits of the device UUID, which are -// represented as an uint8_t array of size 16, to 2 uint64_t values. -xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) { - static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16, - "ZE_MAX_DEVICE_UUID_SIZE is expected to be 16"); - - auto ze_device_properties = ze_device_properties_t(); - ze_device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - - auto ze_device = xpu::sycl::compat::get_native(dev); - auto status = l0::zeDeviceGetProperties(ze_device, &ze_device_properties); - MAYBE_UNUSED(status); - assert(status == status::success); - - const auto &ze_device_id = ze_device_properties.uuid.id; - - uint64_t uuid[ZE_MAX_DEVICE_UUID_SIZE / sizeof(uint64_t)] = {}; - for (size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) { - size_t shift = i % sizeof(uint64_t) * CHAR_BIT; - uuid[i / sizeof(uint64_t)] |= (((uint64_t)ze_device_id[i]) << shift); - } - return xpu::device_uuid_t(uuid[0], uuid[1]); -} - -status_t sycl_create_kernels_with_level_zero( - std::vector> &sycl_kernels, - const std::vector &kernel_names, - const gpu::intel::sycl::engine_t *sycl_engine, - const xpu::binary_t &binary) { - auto desc = ze_module_desc_t(); - desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; - desc.format = ZE_MODULE_FORMAT_NATIVE; - desc.inputSize = binary.size(); - desc.pInputModule = binary.data(); - desc.pBuildFlags = ""; - desc.pConstants = nullptr; - - ze_module_handle_t ze_module; - - auto ze_device = xpu::sycl::compat::get_native( - sycl_engine->device()); - auto ze_ctx = xpu::sycl::compat::get_native( - sycl_engine->context()); - - CHECK(l0::zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr)); - ::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle - = ::sycl::make_kernel_bundle<::sycl::backend::ext_oneapi_level_zero, - ::sycl::bundle_state::executable>( - {ze_module}, sycl_engine->context()); - - sycl_kernels.resize(kernel_names.size()); - for (size_t i = 0; i < kernel_names.size(); i++) { - if (kernel_names[i] == nullptr) continue; - ze_kernel_handle_t ze_kernel; - ze_kernel_desc_t ze_kernel_desc { - ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernel_names[i]}; - CHECK(l0::zeKernelCreate(ze_module, &ze_kernel_desc, &ze_kernel)); - auto k = ::sycl::make_kernel<::sycl::backend::ext_oneapi_level_zero>( - {kernel_bundle, ze_kernel}, sycl_engine->context()); - sycl_kernels[i] = utils::make_unique<::sycl::kernel>(k); - } - - return status::success; -} - -status_t 
get_l0_kernel_binary( - const ::sycl::kernel &kernel, xpu::binary_t &binary) { -#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER - auto l0_kernel = ::sycl::get_native<::sycl::backend::ext_oneapi_level_zero>( - kernel); - size_t binary_size = 0; - CHECK(l0::zeKernelGetBinaryExp(l0_kernel, &binary_size, nullptr)); - binary.resize(binary_size); - CHECK(l0::zeKernelGetBinaryExp(l0_kernel, &binary_size, binary.data())); -#else - auto bundle = kernel.get_kernel_bundle(); - auto module_vec - = ::sycl::get_native<::sycl::backend::ext_oneapi_level_zero>( - bundle); - auto module = module_vec[0]; - size_t module_binary_size; - CHECK(l0::zeModuleGetNativeBinary(module, &module_binary_size, nullptr)); - binary.resize(module_binary_size); - CHECK(l0::zeModuleGetNativeBinary( - module, &module_binary_size, binary.data())); -#endif - return status::success; -} - -bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) { - auto lhs_ze_handle = xpu::sycl::compat::get_native(lhs); - auto rhs_ze_handle = xpu::sycl::compat::get_native(rhs); - - return lhs_ze_handle == rhs_ze_handle; -} - -status_t get_device_ip(ze_device_handle_t device, uint32_t &ip_version) { - auto devicePropsIP = ze_device_ip_version_ext_t(); - devicePropsIP.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT; - - auto deviceProps = ze_device_properties_t(); - deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - deviceProps.pNext = &devicePropsIP; - - CHECK(l0::zeDeviceGetProperties(device, &deviceProps)); - ip_version = devicePropsIP.ipVersion; - return status::success; -} - -status_t get_l0_device_enabled_systolic_intel( - ze_device_handle_t device, bool &mayiuse_systolic) { - // Note: supported by Intel Driver 24.05 and onwards - auto deviceModPropsExt = ze_intel_device_module_dp_exp_properties_t(); - deviceModPropsExt.stype - = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES; - - auto deviceModProps = ze_device_module_properties_t(); - deviceModProps.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES; - deviceModProps.pNext = &deviceModPropsExt; - - CHECK(l0::zeDeviceGetModuleProperties(device, &deviceModProps)); - mayiuse_systolic - = deviceModPropsExt.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS; - return status::success; -} - -status_t get_l0_device_enabled_native_float_atomics( - ze_device_handle_t device, uint64_t &native_extensions) { - using namespace gpu::intel::compute; - - auto fltAtom = ze_float_atomic_ext_properties_t(); - fltAtom.stype = ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES; - - auto deviceProps = ze_device_module_properties_t(); - deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES; - deviceProps.pNext = &fltAtom; - - CHECK(l0::zeDeviceGetModuleProperties(device, &deviceProps)); - - ze_device_fp_atomic_ext_flags_t atomic_load_store - = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE - | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE; - ze_device_fp_atomic_ext_flags_t atomic_add - = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD - | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD; - ze_device_fp_atomic_ext_flags_t atomic_min_max - = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX - | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX; - - if ((fltAtom.fp16Flags & atomic_load_store) == atomic_load_store) - native_extensions |= (uint64_t)native_ext_t::fp16_atomic_load_store; - if ((fltAtom.fp16Flags & atomic_add) == atomic_add) - native_extensions |= (uint64_t)native_ext_t::fp16_atomic_add; - if ((fltAtom.fp16Flags & atomic_min_max) == atomic_min_max) - native_extensions |= 
(uint64_t)native_ext_t::fp16_atomic_min_max; - - if ((fltAtom.fp32Flags & atomic_load_store) == atomic_load_store) - native_extensions |= (uint64_t)native_ext_t::fp32_atomic_load_store; - if ((fltAtom.fp32Flags & atomic_add) == atomic_add) - native_extensions |= (uint64_t)native_ext_t::fp32_atomic_add; - if ((fltAtom.fp32Flags & atomic_min_max) == atomic_min_max) - native_extensions |= (uint64_t)native_ext_t::fp32_atomic_min_max; - - if ((fltAtom.fp64Flags & atomic_load_store) == atomic_load_store) - native_extensions |= (uint64_t)native_ext_t::fp64_atomic_load_store; - if ((fltAtom.fp64Flags & atomic_add) == atomic_add) - native_extensions |= (uint64_t)native_ext_t::fp64_atomic_add; - if ((fltAtom.fp64Flags & atomic_min_max) == atomic_min_max) - native_extensions |= (uint64_t)native_ext_t::fp64_atomic_min_max; - - return status::success; -} - -status_t get_l0_device_eu_count(ze_device_handle_t device, int &eu_count) { - auto eucnt = ze_eu_count_ext_t(); - auto deviceProps = ze_device_properties_t(); - deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - deviceProps.pNext = &eucnt; - - CHECK(l0::zeDeviceGetProperties(device, &deviceProps)); - eu_count = eucnt.numTotalEUs; - return status::success; -} - -status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device, - ze_context_handle_t context, uint32_t &ip_version, - compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product_, - uint64_t &native_extensions, bool &mayiuse_systolic, - bool &mayiuse_ngen_kernels) { - using namespace ngen; - ngen::Product product = LevelZeroCodeGenerator::detectHWInfo( - context, device); - - gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family)); - std::memcpy(&product_, &product, sizeof(ngen::Product)); - - mayiuse_systolic = false; - if (get_l0_device_enabled_systolic_intel(device, mayiuse_systolic) - != status::success) - mayiuse_systolic = false; - - /* Some old drivers do not report systolic availability. Manually override - systolic availability based on product family. */ - switch (product.family) { - case ProductFamily::DG2: - case ProductFamily::ARL: - case ProductFamily::PVC: mayiuse_systolic = true; - default: break; - } - - CHECK(get_l0_device_enabled_native_float_atomics( - device, native_extensions)); - - auto status - = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine); - if (status != status::success) mayiuse_ngen_kernels = false; - - ip_version = 0; - return get_device_ip(device, ip_version); -} - -} // namespace sycl -} // namespace intel -} // namespace gpu -} // namespace impl -} // namespace dnnl diff --git a/src/gpu/intel/sycl/l0/utils.hpp b/src/gpu/intel/sycl/l0/utils.hpp deleted file mode 100644 index e65932fdb1e..00000000000 --- a/src/gpu/intel/sycl/l0/utils.hpp +++ /dev/null @@ -1,60 +0,0 @@ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef GPU_INTEL_SYCL_L0_UTILS_HPP -#define GPU_INTEL_SYCL_L0_UTILS_HPP - -#include -#include -#include - -#include "gpu/intel/compute/kernel.hpp" -#include "gpu/intel/sycl/compat.hpp" - -namespace dnnl { -namespace impl { -namespace gpu { -namespace intel { -namespace sycl { - -class engine_t; - -xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev); - -status_t sycl_create_kernels_with_level_zero( - std::vector> &sycl_kernels, - const std::vector &kernel_names, - const gpu::intel::sycl::engine_t *sycl_engine, - const xpu::binary_t &binary); - -status_t get_l0_kernel_binary( - const ::sycl::kernel &kernel, xpu::binary_t &binary); - -bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); - -status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device, - ze_context_handle_t context, uint32_t &ip_version, - compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product, - uint64_t &native_extensions, bool &mayiuse_systolic, - bool &mayiuse_ngen_kernels); - -} // namespace sycl -} // namespace intel -} // namespace gpu -} // namespace impl -} // namespace dnnl - -#endif // GPU_INTEL_SYCL_L0_UTILS_HPP diff --git a/src/gpu/intel/sycl/stream.cpp b/src/gpu/intel/sycl/stream.cpp index 9312e35c7d6..6b1ce0c5a61 100644 --- a/src/gpu/intel/sycl/stream.cpp +++ b/src/gpu/intel/sycl/stream.cpp @@ -24,7 +24,7 @@ #include "gpu/intel/sycl/stream.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/intel/sycl/stream.hpp b/src/gpu/intel/sycl/stream.hpp index 6331884c1fc..c9c1a8b196a 100644 --- a/src/gpu/intel/sycl/stream.hpp +++ b/src/gpu/intel/sycl/stream.hpp @@ -35,6 +35,8 @@ #include "gpu/intel/engine.hpp" #include "gpu/intel/stream.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" + namespace dnnl { namespace impl { namespace gpu { diff --git a/src/gpu/intel/sycl/utils.cpp b/src/gpu/intel/sycl/utils.cpp index 2bbf1e424ef..c16b0056098 100644 --- a/src/gpu/intel/sycl/utils.cpp +++ b/src/gpu/intel/sycl/utils.cpp @@ -17,9 +17,9 @@ #include "gpu/intel/sycl/utils.hpp" #include "gpu/intel/compute/ukernels.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/l0/utils/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "gpu/intel/sycl/engine.hpp" -#include "gpu/intel/sycl/l0/utils.hpp" #include "xpu/ocl/engine_factory.hpp" #include "xpu/ocl/utils.hpp" #include "xpu/sycl/compat.hpp" @@ -32,6 +32,53 @@ namespace gpu { namespace intel { namespace sycl { +// FIXME: Currently SYCL doesn't provide any API to get device UUID so +// we query it directly from Level0 with the zeDeviceGetProperties function. +// The `get_device_uuid` function packs 128 bits of the device UUID, which are +// represented as an uint8_t array of size 16, to 2 uint64_t values. 
+xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) {
+    return gpu::intel::l0::get_device_uuid(
+            xpu::sycl::compat::get_native<ze_device_handle_t>(dev));
+}
+
+bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) {
+    auto lhs_ze_handle = xpu::sycl::compat::get_native<ze_device_handle_t>(lhs);
+    auto rhs_ze_handle = xpu::sycl::compat::get_native<ze_device_handle_t>(rhs);
+
+    return lhs_ze_handle == rhs_ze_handle;
+}
+
+status_t sycl_create_kernels_with_level_zero(
+        std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
+        const std::vector<const char *> &kernel_names,
+        const gpu::intel::sycl::engine_t *sycl_engine,
+        const xpu::binary_t &binary) {
+    auto ze_device = xpu::sycl::compat::get_native<ze_device_handle_t>(
+            sycl_engine->device());
+    auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(
+            sycl_engine->context());
+    ze_module_handle_t ze_module = nullptr;
+    std::vector<ze_kernel_handle_t> ze_kernels;
+
+    gpu::intel::l0::create_kernels(
+            ze_device, ze_ctx, kernel_names, binary, &ze_module, ze_kernels);
+
+    ::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle
+            = ::sycl::make_kernel_bundle<::sycl::backend::ext_oneapi_level_zero,
+                    ::sycl::bundle_state::executable>(
+                    {ze_module}, sycl_engine->context());
+
+    sycl_kernels.resize(kernel_names.size());
+    for (size_t i = 0; i < kernel_names.size(); i++) {
+        if (kernel_names[i] == nullptr) continue;
+        auto k = ::sycl::make_kernel<::sycl::backend::ext_oneapi_level_zero>(
+                {kernel_bundle, ze_kernels[i]}, sycl_engine->context());
+        sycl_kernels[i] = utils::make_unique<::sycl::kernel>(k);
+    }
+
+    return status::success;
+}
+
 ::sycl::nd_range<3> to_sycl_nd_range(
         const gpu::intel::compute::nd_range_t &range) {
     const auto &local_range = range.local_range();
@@ -150,7 +197,6 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) {
     }
 
     *ocl_dev = d;
-
     return status::success;
 }
@@ -204,6 +250,31 @@ status_t create_ocl_engine(
     const auto &sycl_ctx = engine->context();
     return create_ocl_engine(ocl_engine, engine->device(), &sycl_ctx);
 }
+
+static status_t get_l0_kernel_binary(
+        const ::sycl::kernel &kernel, xpu::binary_t &binary) {
+    auto bundle = kernel.get_kernel_bundle();
+    auto module_vec
+            = ::sycl::get_native<::sycl::backend::ext_oneapi_level_zero>(
+                    bundle);
+    auto l0_module = module_vec[0];
+    CHECK(l0::get_module_binary(l0_module, binary));
+
+    std::unique_ptr<gpu::intel::ocl::engine_t, engine_deleter_t> ocl_engine;
+    const auto &devs = kernel.get_context().get_devices();
+    CHECK(create_ocl_engine(&ocl_engine, devs[0]));
+    xpu::ocl::wrapper_t<cl_program> ocl_program;
+    CHECK(xpu::ocl::create_program(
+            ocl_program, ocl_engine->device(), ocl_engine->context(), binary));
+
+    cl_int err;
+    auto name = kernel.get_info<::sycl::info::kernel::function_name>();
+    auto ocl_kernel = xpu::ocl::make_wrapper(
+            xpu::ocl::clCreateKernel(ocl_program, name.c_str(), &err));
+    OCL_CHECK(err);
+    CHECK(gpu::intel::ocl::get_ocl_kernel_binary(ocl_kernel, binary));
+    return status::success;
+}
 #endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
 
 status_t get_kernel_binary(
@@ -212,37 +283,25 @@
     assert(!devs.empty());
     switch (xpu::sycl::get_backend(devs[0])) {
         case xpu::sycl::backend_t::level0: {
+#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
+            auto l0_kernel = ::sycl::get_native<
+                    ::sycl::backend::ext_oneapi_level_zero>(kernel);
+            CHECK(l0::get_kernel_binary(l0_kernel, binary));
+#else
             CHECK(get_l0_kernel_binary(kernel, binary));
-#ifndef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
-            {
-                std::unique_ptr<gpu::intel::ocl::engine_t, engine_deleter_t>
-                        ocl_engine;
-                CHECK(create_ocl_engine(&ocl_engine, devs[0]));
-                xpu::ocl::wrapper_t<cl_program> ocl_program;
-                CHECK(xpu::ocl::create_program(ocl_program,
-
ocl_engine->device(), ocl_engine->context(), binary)); - - cl_int err; - auto name = kernel.get_info< - ::sycl::info::kernel::function_name>(); - auto ocl_kernel - = xpu::ocl::make_wrapper(xpu::ocl::clCreateKernel( - ocl_program, name.c_str(), &err)); - OCL_CHECK(err); - CHECK(gpu::intel::ocl::get_ocl_kernel_binary( - ocl_kernel, binary)); - } -#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER - return status::success; +#endif + break; } case xpu::sycl::backend_t::opencl: { auto ocl_kernel = ::sycl::get_native<::sycl::backend::opencl>(kernel); CHECK(gpu::intel::ocl::get_ocl_kernel_binary(ocl_kernel, binary)); - return status::success; + break; } default: return status::runtime_error; } + + return status::success; } gpu_utils::device_id_t device_id(const ::sycl::device &dev) { diff --git a/src/gpu/intel/sycl/utils.hpp b/src/gpu/intel/sycl/utils.hpp index 84c6d00bbe2..919fd13677c 100644 --- a/src/gpu/intel/sycl/utils.hpp +++ b/src/gpu/intel/sycl/utils.hpp @@ -29,6 +29,16 @@ namespace sycl { class engine_t; +xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev); + +status_t sycl_create_kernels_with_level_zero( + std::vector> &sycl_kernels, + const std::vector &kernel_names, + const gpu::intel::sycl::engine_t *sycl_engine, + const xpu::binary_t &binary); + +bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); + ::sycl::nd_range<3> to_sycl_nd_range( const gpu::intel::compute::nd_range_t &range); diff --git a/src/graph/backend/dnnl/scratchpad.hpp b/src/graph/backend/dnnl/scratchpad.hpp index b797db0b0a9..0b3cbe25c0c 100644 --- a/src/graph/backend/dnnl/scratchpad.hpp +++ b/src/graph/backend/dnnl/scratchpad.hpp @@ -93,7 +93,6 @@ class temporary_scratchpad_t : public scratchpad_t { #ifdef DNNL_WITH_SYCL void set_deps(::sycl::event event) { e_ = std::move(event); } #endif - #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL void set_deps(cl_event event) { ocl_e_ = event; } #endif @@ -106,7 +105,6 @@ class temporary_scratchpad_t : public scratchpad_t { #ifdef DNNL_WITH_SYCL ::sycl::event e_; #endif - #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL cl_event ocl_e_; #endif diff --git a/src/graph/utils/ocl_check.hpp b/src/graph/utils/ocl_check.hpp index 7c146c34df0..92d41793412 100644 --- a/src/graph/utils/ocl_check.hpp +++ b/src/graph/utils/ocl_check.hpp @@ -31,6 +31,6 @@ #endif #endif -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #endif diff --git a/src/xpu/ocl/buffer_memory_storage.hpp b/src/xpu/ocl/buffer_memory_storage.hpp index 42b4f1732d2..e4e6cd0c528 100644 --- a/src/xpu/ocl/buffer_memory_storage.hpp +++ b/src/xpu/ocl/buffer_memory_storage.hpp @@ -24,7 +24,7 @@ #include "xpu/ocl/memory_storage_base.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/xpu/ocl/utils.cpp b/src/xpu/ocl/utils.cpp index f23e112e910..312c60cf9f9 100644 --- a/src/xpu/ocl/utils.cpp +++ b/src/xpu/ocl/utils.cpp @@ -14,14 +14,8 @@ * limitations under the License. *******************************************************************************/ -// Include for: -// - CL_PLATFORM_NOT_FOUND_KHR -// - CL_UUID_SIZE_KHR -// - CL_DEVICE_UUID_KHR -#include - -#include "xpu/ocl/engine_impl.hpp" #include "xpu/ocl/utils.hpp" +#include "xpu/ocl/engine_impl.hpp" // XXX: Include this header for VERROR_ENGINE. // TODO: Move VERROR_ENGINE and other similar macros to a separate file. 
@@ -32,133 +26,6 @@ namespace impl { namespace xpu { namespace ocl { -status_t convert_to_dnnl(cl_int cl_status) { - switch (cl_status) { - case CL_SUCCESS: return status::success; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - case CL_OUT_OF_RESOURCES: - case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory; - case CL_DEVICE_NOT_FOUND: - case CL_DEVICE_NOT_AVAILABLE: - case CL_COMPILER_NOT_AVAILABLE: - case CL_PROFILING_INFO_NOT_AVAILABLE: - case CL_MEM_COPY_OVERLAP: - case CL_IMAGE_FORMAT_MISMATCH: - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - case CL_BUILD_PROGRAM_FAILURE: - case CL_MAP_FAILURE: - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - case CL_COMPILE_PROGRAM_FAILURE: - case CL_LINKER_NOT_AVAILABLE: - case CL_LINK_PROGRAM_FAILURE: - case CL_DEVICE_PARTITION_FAILED: - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - case CL_INVALID_PLATFORM: - case CL_INVALID_DEVICE: - case CL_SYMBOL_NOT_FOUND: return status::runtime_error; - case CL_INVALID_VALUE: - case CL_INVALID_DEVICE_TYPE: - case CL_INVALID_CONTEXT: - case CL_INVALID_QUEUE_PROPERTIES: - case CL_INVALID_COMMAND_QUEUE: - case CL_INVALID_HOST_PTR: - case CL_INVALID_MEM_OBJECT: - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - case CL_INVALID_IMAGE_SIZE: - case CL_INVALID_SAMPLER: - case CL_INVALID_BINARY: - case CL_INVALID_BUILD_OPTIONS: - case CL_INVALID_PROGRAM: - case CL_INVALID_PROGRAM_EXECUTABLE: - case CL_INVALID_KERNEL_NAME: - case CL_INVALID_KERNEL_DEFINITION: - case CL_INVALID_KERNEL: - case CL_INVALID_ARG_INDEX: - case CL_INVALID_ARG_VALUE: - case CL_INVALID_ARG_SIZE: - case CL_INVALID_KERNEL_ARGS: - case CL_INVALID_WORK_DIMENSION: - case CL_INVALID_WORK_GROUP_SIZE: - case CL_INVALID_WORK_ITEM_SIZE: - case CL_INVALID_GLOBAL_OFFSET: - case CL_INVALID_EVENT_WAIT_LIST: - case CL_INVALID_EVENT: - case CL_INVALID_OPERATION: - case CL_INVALID_GL_OBJECT: - case CL_INVALID_BUFFER_SIZE: - case CL_INVALID_MIP_LEVEL: - case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments; - - default: return status::runtime_error; - } -} - -// Ordered by value as defined by opencl -const char *convert_cl_int_to_str(cl_int cl_status) { -#define CL_STATUS_CASE(status) \ - case status: return #status - switch (cl_status) { - CL_STATUS_CASE(CL_SUCCESS); - CL_STATUS_CASE(CL_DEVICE_NOT_FOUND); - CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE); - CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE); - CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE); - CL_STATUS_CASE(CL_OUT_OF_RESOURCES); - CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY); - CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE); - CL_STATUS_CASE(CL_MEM_COPY_OVERLAP); - CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH); - CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED); - CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE); - CL_STATUS_CASE(CL_MAP_FAILURE); - CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET); - CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); - CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE); - CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE); - CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE); - CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED); - CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE); - CL_STATUS_CASE(CL_INVALID_VALUE); - CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE); - CL_STATUS_CASE(CL_INVALID_PLATFORM); - CL_STATUS_CASE(CL_INVALID_DEVICE); - CL_STATUS_CASE(CL_INVALID_CONTEXT); - CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES); - CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE); - CL_STATUS_CASE(CL_INVALID_HOST_PTR); - CL_STATUS_CASE(CL_INVALID_MEM_OBJECT); - 
CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); - CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE); - CL_STATUS_CASE(CL_INVALID_SAMPLER); - CL_STATUS_CASE(CL_INVALID_BINARY); - CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS); - CL_STATUS_CASE(CL_INVALID_PROGRAM); - CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE); - CL_STATUS_CASE(CL_INVALID_KERNEL_NAME); - CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION); - CL_STATUS_CASE(CL_INVALID_KERNEL); - CL_STATUS_CASE(CL_INVALID_ARG_INDEX); - CL_STATUS_CASE(CL_INVALID_ARG_VALUE); - CL_STATUS_CASE(CL_INVALID_ARG_SIZE); - CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS); - CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION); - CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE); - CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE); - CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET); - CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST); - CL_STATUS_CASE(CL_INVALID_EVENT); - CL_STATUS_CASE(CL_INVALID_OPERATION); - CL_STATUS_CASE(CL_INVALID_GL_OBJECT); - CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE); - CL_STATUS_CASE(CL_INVALID_MIP_LEVEL); - CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE); - CL_STATUS_CASE(CL_SYMBOL_NOT_FOUND); -#undef CL_STATUS_CASE - default: return "unknown macro name"; - } -} - std::string get_kernel_name(cl_kernel kernel) { size_t name_size; cl_int err = xpu::ocl::clGetKernelInfo( @@ -179,75 +46,6 @@ std::string get_kernel_name(cl_kernel kernel) { return name; } -static std::string get_platform_name(cl_platform_id platform) { - size_t name_size; - cl_int err = xpu::ocl::clGetPlatformInfo( - platform, CL_PLATFORM_NAME, 0, nullptr, &name_size); - // Ignore error. - UNUSED_OCL_RESULT(err); - - // Include null terminator explicitly - to safely overwrite it in - // clGetPlatformInfo - std::string name(name_size, 0); - err = xpu::ocl::clGetPlatformInfo( - platform, CL_PLATFORM_NAME, name_size, &name[0], nullptr); - // Ignore error. - UNUSED_OCL_RESULT(err); - - // Remove the null terminator as std::string already includes it - name.resize(name_size - 1); - return name; -} - -static bool is_intel_platform(cl_platform_id platform) { - auto name = get_platform_name(platform); - return name.find("Intel") != std::string::npos; -} - -status_t get_devices(std::vector *devices, - cl_device_type device_type, cl_uint vendor_id /* = 0x8086 */) { - cl_uint num_platforms = 0; - - cl_int err = xpu::ocl::clGetPlatformIDs(0, nullptr, &num_platforms); - // No platforms - a valid scenario - if (err == CL_PLATFORM_NOT_FOUND_KHR) return status::success; - - OCL_CHECK(err); - - std::vector platforms(num_platforms); - OCL_CHECK( - xpu::ocl::clGetPlatformIDs(num_platforms, &platforms[0], nullptr)); - - for (size_t i = 0; i < platforms.size(); ++i) { - if (!is_intel_platform(platforms[i])) continue; - - cl_uint num_devices = 0; - cl_int err = xpu::ocl::clGetDeviceIDs( - platforms[i], device_type, 0, nullptr, &num_devices); - - if (!utils::one_of(err, CL_SUCCESS, CL_DEVICE_NOT_FOUND)) { - return status::runtime_error; - } - - if (num_devices != 0) { - std::vector plat_devices; - plat_devices.resize(num_devices); - OCL_CHECK(xpu::ocl::clGetDeviceIDs(platforms[i], device_type, - num_devices, &plat_devices[0], nullptr)); - - // Use the devices for the requested vendor only. 
- for (size_t j = 0; j < plat_devices.size(); ++j) { - cl_uint v_id; - OCL_CHECK(xpu::ocl::clGetDeviceInfo(plat_devices[j], - CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &v_id, nullptr)); - if (v_id == vendor_id) { devices->push_back(plat_devices[j]); } - } - } - } - // No devices found but still return success - return status::success; -} - status_t get_devices(std::vector *devices, std::vector> *sub_devices, cl_device_type device_type) { @@ -335,31 +133,6 @@ status_t create_program(ocl::wrapper_t &ocl_program, return status::success; } -#ifndef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER -status_t get_device_uuid(xpu::device_uuid_t &uuid, cl_device_id ocl_dev) { - // This function is used only with SYCL that works with OpenCL 3.0 - // that supports `cl_khr_device_uuid` extension. -#if defined(cl_khr_device_uuid) - static_assert( - CL_UUID_SIZE_KHR == 16, "CL_UUID_SIZE_KHR is expected to be 16"); - - cl_uchar ocl_dev_uuid[CL_UUID_SIZE_KHR] = {}; - OCL_CHECK(xpu::ocl::clGetDeviceInfo(ocl_dev, CL_DEVICE_UUID_KHR, - CL_UUID_SIZE_KHR, ocl_dev_uuid, nullptr)); - - uint64_t uuid_packed[CL_UUID_SIZE_KHR / sizeof(uint64_t)] = {}; - for (size_t i = 0; i < CL_UUID_SIZE_KHR; ++i) { - size_t shift = i % sizeof(uint64_t) * CHAR_BIT; - uuid_packed[i / sizeof(uint64_t)] - |= (((uint64_t)ocl_dev_uuid[i]) << shift); - } - uuid = xpu::device_uuid_t(uuid_packed[0], uuid_packed[1]); - return status::success; -#endif - return status::runtime_error; -} -#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER - status_t check_device( engine_kind_t eng_kind, cl_device_id dev, cl_context ctx) { assert(dev && ctx); diff --git a/src/xpu/ocl/utils.hpp b/src/xpu/ocl/utils.hpp index 540d5224ad8..ececb6fcd3d 100644 --- a/src/xpu/ocl/utils.hpp +++ b/src/xpu/ocl/utils.hpp @@ -18,6 +18,11 @@ #define XPU_OCL_UTILS_HPP #include +// Include for: +// - CL_PLATFORM_NOT_FOUND_KHR +// - CL_UUID_SIZE_KHR +// - CL_DEVICE_UUID_KHR +#include #include "oneapi/dnnl/dnnl_config.h" @@ -38,8 +43,134 @@ namespace impl { namespace xpu { namespace ocl { -status_t convert_to_dnnl(cl_int cl_status); -const char *convert_cl_int_to_str(cl_int cl_status); +enum { CL_SYMBOL_NOT_FOUND = -128 }; + +inline status_t convert_to_dnnl(cl_int cl_status) { + switch (cl_status) { + case CL_SUCCESS: return status::success; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + case CL_OUT_OF_RESOURCES: + case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory; + case CL_DEVICE_NOT_FOUND: + case CL_DEVICE_NOT_AVAILABLE: + case CL_COMPILER_NOT_AVAILABLE: + case CL_PROFILING_INFO_NOT_AVAILABLE: + case CL_MEM_COPY_OVERLAP: + case CL_IMAGE_FORMAT_MISMATCH: + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + case CL_BUILD_PROGRAM_FAILURE: + case CL_MAP_FAILURE: + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + case CL_COMPILE_PROGRAM_FAILURE: + case CL_LINKER_NOT_AVAILABLE: + case CL_LINK_PROGRAM_FAILURE: + case CL_DEVICE_PARTITION_FAILED: + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + case CL_INVALID_PLATFORM: + case CL_INVALID_DEVICE: + case CL_SYMBOL_NOT_FOUND: return status::runtime_error; + case CL_INVALID_VALUE: + case CL_INVALID_DEVICE_TYPE: + case CL_INVALID_CONTEXT: + case CL_INVALID_QUEUE_PROPERTIES: + case CL_INVALID_COMMAND_QUEUE: + case CL_INVALID_HOST_PTR: + case CL_INVALID_MEM_OBJECT: + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + case CL_INVALID_IMAGE_SIZE: + case CL_INVALID_SAMPLER: + case CL_INVALID_BINARY: + case CL_INVALID_BUILD_OPTIONS: + case CL_INVALID_PROGRAM: + case CL_INVALID_PROGRAM_EXECUTABLE: + case CL_INVALID_KERNEL_NAME: + case 
CL_INVALID_KERNEL_DEFINITION: + case CL_INVALID_KERNEL: + case CL_INVALID_ARG_INDEX: + case CL_INVALID_ARG_VALUE: + case CL_INVALID_ARG_SIZE: + case CL_INVALID_KERNEL_ARGS: + case CL_INVALID_WORK_DIMENSION: + case CL_INVALID_WORK_GROUP_SIZE: + case CL_INVALID_WORK_ITEM_SIZE: + case CL_INVALID_GLOBAL_OFFSET: + case CL_INVALID_EVENT_WAIT_LIST: + case CL_INVALID_EVENT: + case CL_INVALID_OPERATION: + case CL_INVALID_GL_OBJECT: + case CL_INVALID_BUFFER_SIZE: + case CL_INVALID_MIP_LEVEL: + case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments; + + default: return status::runtime_error; + } +} + +// Ordered by value as defined by opencl +inline const char *convert_cl_int_to_str(cl_int cl_status) { +#define CL_STATUS_CASE(status) \ + case status: return #status + switch (cl_status) { + CL_STATUS_CASE(CL_SUCCESS); + CL_STATUS_CASE(CL_DEVICE_NOT_FOUND); + CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE); + CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE); + CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE); + CL_STATUS_CASE(CL_OUT_OF_RESOURCES); + CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY); + CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE); + CL_STATUS_CASE(CL_MEM_COPY_OVERLAP); + CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH); + CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED); + CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE); + CL_STATUS_CASE(CL_MAP_FAILURE); + CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET); + CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE); + CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE); + CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE); + CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED); + CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE); + CL_STATUS_CASE(CL_INVALID_VALUE); + CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE); + CL_STATUS_CASE(CL_INVALID_PLATFORM); + CL_STATUS_CASE(CL_INVALID_DEVICE); + CL_STATUS_CASE(CL_INVALID_CONTEXT); + CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES); + CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE); + CL_STATUS_CASE(CL_INVALID_HOST_PTR); + CL_STATUS_CASE(CL_INVALID_MEM_OBJECT); + CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); + CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE); + CL_STATUS_CASE(CL_INVALID_SAMPLER); + CL_STATUS_CASE(CL_INVALID_BINARY); + CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS); + CL_STATUS_CASE(CL_INVALID_PROGRAM); + CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE); + CL_STATUS_CASE(CL_INVALID_KERNEL_NAME); + CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION); + CL_STATUS_CASE(CL_INVALID_KERNEL); + CL_STATUS_CASE(CL_INVALID_ARG_INDEX); + CL_STATUS_CASE(CL_INVALID_ARG_VALUE); + CL_STATUS_CASE(CL_INVALID_ARG_SIZE); + CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS); + CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION); + CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE); + CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE); + CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET); + CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST); + CL_STATUS_CASE(CL_INVALID_EVENT); + CL_STATUS_CASE(CL_INVALID_OPERATION); + CL_STATUS_CASE(CL_INVALID_GL_OBJECT); + CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE); + CL_STATUS_CASE(CL_INVALID_MIP_LEVEL); + CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE); + CL_STATUS_CASE(CL_SYMBOL_NOT_FOUND); +#undef CL_STATUS_CASE + default: return "unknown macro name"; + } +} #define MAYBE_REPORT_ERROR(msg) \ do { \ @@ -91,8 +222,6 @@ F find_ocl_symbol(const char *symbol) { } #undef OCL_LIB_NAME -enum { CL_SYMBOL_NOT_FOUND = -128 }; - // In case the OCL symbol is not found: // - if the return value of OCL function is cl_int, return CL_SYMBOL_NOT_FOUND // - if the return value of OCL function is a 
pointer, return nullptr @@ -394,8 +523,74 @@ struct ext_func_t { std::string get_kernel_name(cl_kernel kernel); -status_t get_devices(std::vector *devices, - cl_device_type device_type, cl_uint vendor_id = 0x8086); +static inline std::string get_platform_name(cl_platform_id platform) { + size_t name_size; + cl_int err = xpu::ocl::clGetPlatformInfo( + platform, CL_PLATFORM_NAME, 0, nullptr, &name_size); + // Ignore error. + UNUSED_OCL_RESULT(err); + + // Include null terminator explicitly - to safely overwrite it in + // clGetPlatformInfo + std::string name(name_size, 0); + err = xpu::ocl::clGetPlatformInfo( + platform, CL_PLATFORM_NAME, name_size, &name[0], nullptr); + // Ignore error. + UNUSED_OCL_RESULT(err); + + // Remove the null terminator as std::string already includes it + name.resize(name_size - 1); + return name; +} + +static inline bool is_intel_platform(cl_platform_id platform) { + auto name = get_platform_name(platform); + return name.find("Intel") != std::string::npos; +} + +inline status_t get_devices(std::vector *devices, + cl_device_type device_type, cl_uint vendor_id = 0x8086) { + cl_uint num_platforms = 0; + + cl_int err = xpu::ocl::clGetPlatformIDs(0, nullptr, &num_platforms); + // No platforms - a valid scenario + if (err == CL_PLATFORM_NOT_FOUND_KHR) return status::success; + + OCL_CHECK(err); + + std::vector platforms(num_platforms); + OCL_CHECK( + xpu::ocl::clGetPlatformIDs(num_platforms, &platforms[0], nullptr)); + + for (size_t i = 0; i < platforms.size(); ++i) { + if (!is_intel_platform(platforms[i])) continue; + + cl_uint num_devices = 0; + cl_int err = xpu::ocl::clGetDeviceIDs( + platforms[i], device_type, 0, nullptr, &num_devices); + + if (!utils::one_of(err, CL_SUCCESS, CL_DEVICE_NOT_FOUND)) { + return status::runtime_error; + } + + if (num_devices != 0) { + std::vector plat_devices; + plat_devices.resize(num_devices); + OCL_CHECK(xpu::ocl::clGetDeviceIDs(platforms[i], device_type, + num_devices, &plat_devices[0], nullptr)); + + // Use the devices for the requested vendor only. + for (size_t j = 0; j < plat_devices.size(); ++j) { + cl_uint v_id; + OCL_CHECK(xpu::ocl::clGetDeviceInfo(plat_devices[j], + CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &v_id, nullptr)); + if (v_id == vendor_id) { devices->push_back(plat_devices[j]); } + } + } + } + // No devices found but still return success + return status::success; +} status_t get_devices(std::vector *devices, std::vector> *sub_devices, @@ -409,9 +604,29 @@ cl_platform_id get_platform(engine_t *engine); status_t create_program(ocl::wrapper_t &ocl_program, cl_device_id dev, cl_context ctx, const xpu::binary_t &binary); -#ifndef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER -status_t get_device_uuid(xpu::device_uuid_t &uuid, cl_device_id ocl_dev); -#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER +inline status_t get_device_uuid( + xpu::device_uuid_t &uuid, cl_device_id ocl_dev) { + // This function is used only with SYCL that works with OpenCL 3.0 + // that supports `cl_khr_device_uuid` extension. 
+#if defined(cl_khr_device_uuid)
+    static_assert(
+            CL_UUID_SIZE_KHR == 16, "CL_UUID_SIZE_KHR is expected to be 16");
+
+    cl_uchar ocl_dev_uuid[CL_UUID_SIZE_KHR] = {};
+    OCL_CHECK(xpu::ocl::clGetDeviceInfo(ocl_dev, CL_DEVICE_UUID_KHR,
+            CL_UUID_SIZE_KHR, ocl_dev_uuid, nullptr));
+
+    uint64_t uuid_packed[CL_UUID_SIZE_KHR / sizeof(uint64_t)] = {};
+    for (size_t i = 0; i < CL_UUID_SIZE_KHR; ++i) {
+        size_t shift = i % sizeof(uint64_t) * CHAR_BIT;
+        uuid_packed[i / sizeof(uint64_t)]
+                |= (((uint64_t)ocl_dev_uuid[i]) << shift);
+    }
+    uuid = xpu::device_uuid_t(uuid_packed[0], uuid_packed[1]);
+    return status::success;
+#endif
+    return status::runtime_error;
+}
 
 // Check for three conditions:
 // 1. Device and context are compatible, i.e. the device belongs to
@@ -430,9 +645,25 @@ cl_mem clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags,
         size_t size, void *host_ptr, cl_int *errcode_ret);
 #endif
 
+inline status_t get_extensions(cl_device_id dev, std::string &ext) {
+    cl_int err = CL_SUCCESS;
+
+    size_t param_size = 0;
+    err = xpu::ocl::clGetDeviceInfo(
+            dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &param_size);
+    OCL_CHECK(err);
+
+    ext.resize(param_size, '\0');
+    err = xpu::ocl::clGetDeviceInfo(
+            dev, CL_DEVICE_EXTENSIONS, param_size, &ext[0], &param_size);
+    OCL_CHECK(err);
+
+    return status::success;
+}
+
 } // namespace ocl
 } // namespace xpu
 } // namespace impl
 } // namespace dnnl
 
-#endif
+#endif // XPU_OCL_UTILS_HPP
diff --git a/src/xpu/sycl/utils.cpp b/src/xpu/sycl/utils.cpp
index 22d27e90691..fa4782d137f 100644
--- a/src/xpu/sycl/utils.cpp
+++ b/src/xpu/sycl/utils.cpp
@@ -24,7 +24,7 @@
 #include "common/engine.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
-#include "gpu/intel/sycl/l0/utils.hpp"
+#include "gpu/intel/sycl/utils.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
diff --git a/third_party/ngen/ngen_level_zero.hpp b/third_party/ngen/ngen_level_zero.hpp
index 8ebb4a751c7..7f5f2efb8e6 100644
--- a/third_party/ngen/ngen_level_zero.hpp
+++ b/third_party/ngen/ngen_level_zero.hpp
@@ -92,12 +92,11 @@ class LevelZeroCodeGenerator : public ELFCodeGenerator<hw>
     explicit LevelZeroCodeGenerator(DebugConfig debugConfig) : LevelZeroCodeGenerator({genericProductFamily(hw), 0}, debugConfig) {}
     LevelZeroCodeGenerator(LevelZeroCodeGenerator&&) = default;
 
-    inline ze_module_handle_t getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options = "");
+    inline std::pair<ze_module_handle_t, ze_kernel_handle_t> getKernel(ze_context_handle_t context, ze_device_handle_t device, const std::string &options = "");
+    bool binaryIsZebin() { return true; }
+
     static inline HW detectHW(ze_context_handle_t context, ze_device_handle_t device);
     static inline Product detectHWInfo(ze_context_handle_t context, ze_device_handle_t device);
-
-    static bool binaryIsZebin() { return true; }
-
 };
 
 #define NGEN_FORWARD_LEVEL_ZERO(hw) NGEN_FORWARD_ELF(hw)
@@ -140,7 +139,7 @@ static inline std::vector<uint8_t> getDummyModuleBinary(ze_context_handle_t cont
 }; /* namespace detail */
 
 template <HW hw>
-ze_module_handle_t LevelZeroCodeGenerator<hw>::getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options)
+std::pair<ze_module_handle_t, ze_kernel_handle_t> LevelZeroCodeGenerator<hw>::getKernel(ze_context_handle_t context, ze_device_handle_t device, const std::string &options)
 {
     using super = ELFCodeGenerator<hw>;
 
@@ -162,7 +161,12 @@ ze_module_handle_t LevelZeroCodeGenerator<hw>::getModule(ze_context_handle_t con
     if (module == nullptr)
         throw level_zero_error{};
 
-    return module;
+    auto kernelName = ELFCodeGenerator<hw>::interface_.getExternalName().c_str();
+    ze_kernel_handle_t
kernelL0; + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernelName}; + detail::handleL0(dynamic::zeKernelCreate(module, &kernelDesc, &kernelL0)); + + return std::make_pair(module, kernelL0); } template