From ba12ecb18c53baa55bfeaa2b59cb77f49a169e83 Mon Sep 17 00:00:00 2001 From: "Palicki, Stefan" Date: Tue, 22 Jul 2025 08:34:47 -0700 Subject: [PATCH] gpu: intel: add Level Zero backend --- README.md | 5 +- cmake/OpenCL.cmake | 2 +- cmake/options.cmake | 2 +- doc/build/build_options.md | 2 +- examples/CMakeLists.txt | 5 +- examples/CMakeLists.txt.in | 2 +- examples/example_utils.h | 5 +- examples/example_utils.hpp | 10 +- include/dnnl_l0.h | 22 + include/dnnl_l0.hpp | 22 + include/oneapi/dnnl/dnnl_config.h.in | 9 +- include/oneapi/dnnl/dnnl_l0.h | 203 +++++++++ include/oneapi/dnnl/dnnl_l0.hpp | 260 +++++++++++ src/CMakeLists.txt | 2 +- src/common/c_types_map.hpp | 2 + src/common/engine.cpp | 22 +- src/common/engine.hpp | 2 + src/common/utils.cpp | 3 +- src/gpu/intel/CMakeLists.txt | 3 + src/gpu/intel/compute/ukernels.cpp | 26 +- src/gpu/intel/compute/utils.cpp | 119 +++++ src/gpu/intel/compute/utils.hpp | 14 +- .../intel/gemm/jit/dsl/ir/codegen/codegen.cpp | 51 +++ src/gpu/intel/gemm/jit/dsl/runtime.cpp | 14 + .../jit/include/gemmstone/dsl/runtime.hpp | 5 + .../gemm/jit/include/gemmstone/generator.hpp | 8 + .../gemm/jit/include/gemmstone/runtime.hpp | 5 + src/gpu/intel/jit/binary_format.cpp | 12 +- src/gpu/intel/jit/config/gemmstone_config.hpp | 3 + src/gpu/intel/jit/generator.hpp | 20 + src/gpu/intel/l0/CMakeLists.txt | 24 ++ src/gpu/intel/l0/capi/engine.cpp | 73 ++++ src/gpu/intel/l0/capi/memory.cpp | 128 ++++++ src/gpu/intel/l0/capi/primitive.cpp | 67 +++ src/gpu/intel/l0/capi/stream.cpp | 57 +++ src/gpu/intel/l0/compiler.hpp | 80 ++++ src/gpu/intel/l0/context.hpp | 103 +++++ src/gpu/intel/l0/device_info.cpp | 148 +++++++ src/gpu/intel/l0/device_info.hpp | 44 ++ src/gpu/intel/l0/engine.cpp | 325 ++++++++++++++ src/gpu/intel/l0/engine.hpp | 119 +++++ src/gpu/intel/l0/engine_factory.cpp | 88 ++++ src/gpu/intel/l0/engine_factory.hpp | 59 +++ src/gpu/intel/l0/kernel.cpp | 215 +++++++++ src/gpu/intel/l0/kernel.hpp | 76 ++++ src/gpu/intel/l0/memory_storage.cpp | 215 +++++++++ src/gpu/intel/l0/memory_storage.hpp | 91 ++++ src/gpu/intel/l0/stream.cpp | 233 ++++++++++ src/gpu/intel/l0/stream.hpp | 136 ++++++ src/gpu/intel/l0/stream_profiler.hpp | 140 ++++++ src/gpu/intel/l0/utils/CMakeLists.txt | 24 ++ src/gpu/intel/l0/utils/utils.cpp | 408 ++++++++++++++++++ src/gpu/intel/l0/utils/utils.hpp | 234 ++++++++++ src/gpu/intel/ocl/CMakeLists.txt | 2 - src/gpu/intel/ocl/device_info.cpp | 12 +- src/gpu/intel/ocl/device_info.hpp | 2 +- src/gpu/intel/ocl/engine.cpp | 33 +- src/gpu/intel/ocl/engine.hpp | 4 +- src/gpu/intel/ocl/hw_info.cpp | 2 +- src/gpu/intel/ocl/kernel.cpp | 2 +- src/gpu/intel/ocl/mdapi_utils.cpp | 2 +- src/gpu/intel/ocl/stream.cpp | 2 +- src/gpu/intel/ocl/stream.hpp | 4 +- src/gpu/intel/ocl/utils/CMakeLists.txt | 24 ++ src/gpu/intel/ocl/{ => utils}/utils.cpp | 66 +-- src/gpu/intel/ocl/{ => utils}/utils.hpp | 9 +- src/gpu/intel/sycl/compat.cpp | 2 +- src/gpu/intel/sycl/device_info.cpp | 8 +- src/gpu/intel/sycl/engine.cpp | 14 +- src/gpu/intel/sycl/engine.hpp | 2 +- src/gpu/intel/sycl/interop_kernel.cpp | 3 +- src/gpu/intel/sycl/l0/utils.cpp | 370 ---------------- src/gpu/intel/sycl/l0/utils.hpp | 60 --- src/gpu/intel/sycl/stream.cpp | 2 +- src/gpu/intel/sycl/stream.hpp | 2 + src/gpu/intel/sycl/utils.cpp | 109 +++-- src/gpu/intel/sycl/utils.hpp | 10 + src/graph/backend/dnnl/scratchpad.hpp | 2 - src/graph/utils/ocl_check.hpp | 2 +- src/xpu/ocl/buffer_memory_storage.hpp | 2 +- src/xpu/ocl/utils.cpp | 229 +--------- src/xpu/ocl/utils.hpp | 251 ++++++++++- src/xpu/sycl/utils.cpp | 2 +- 
third_party/ngen/ngen_level_zero.hpp | 16 +- 84 files changed, 4318 insertions(+), 879 deletions(-) create mode 100644 include/dnnl_l0.h create mode 100644 include/dnnl_l0.hpp create mode 100644 include/oneapi/dnnl/dnnl_l0.h create mode 100644 include/oneapi/dnnl/dnnl_l0.hpp create mode 100644 src/gpu/intel/compute/utils.cpp create mode 100644 src/gpu/intel/l0/CMakeLists.txt create mode 100644 src/gpu/intel/l0/capi/engine.cpp create mode 100644 src/gpu/intel/l0/capi/memory.cpp create mode 100644 src/gpu/intel/l0/capi/primitive.cpp create mode 100644 src/gpu/intel/l0/capi/stream.cpp create mode 100644 src/gpu/intel/l0/compiler.hpp create mode 100644 src/gpu/intel/l0/context.hpp create mode 100644 src/gpu/intel/l0/device_info.cpp create mode 100644 src/gpu/intel/l0/device_info.hpp create mode 100644 src/gpu/intel/l0/engine.cpp create mode 100644 src/gpu/intel/l0/engine.hpp create mode 100644 src/gpu/intel/l0/engine_factory.cpp create mode 100644 src/gpu/intel/l0/engine_factory.hpp create mode 100644 src/gpu/intel/l0/kernel.cpp create mode 100644 src/gpu/intel/l0/kernel.hpp create mode 100644 src/gpu/intel/l0/memory_storage.cpp create mode 100644 src/gpu/intel/l0/memory_storage.hpp create mode 100644 src/gpu/intel/l0/stream.cpp create mode 100644 src/gpu/intel/l0/stream.hpp create mode 100644 src/gpu/intel/l0/stream_profiler.hpp create mode 100644 src/gpu/intel/l0/utils/CMakeLists.txt create mode 100644 src/gpu/intel/l0/utils/utils.cpp create mode 100644 src/gpu/intel/l0/utils/utils.hpp create mode 100644 src/gpu/intel/ocl/utils/CMakeLists.txt rename src/gpu/intel/ocl/{ => utils}/utils.cpp (82%) rename src/gpu/intel/ocl/{ => utils}/utils.hpp (92%) delete mode 100644 src/gpu/intel/sycl/l0/utils.cpp delete mode 100644 src/gpu/intel/sycl/l0/utils.hpp diff --git a/README.md b/README.md index a00612e7097..a2cce3fe2c2 100644 --- a/README.md +++ b/README.md @@ -279,9 +279,9 @@ Runtime-specific dependencies: | `ONEDNN_CPU_RUNTIME=OMP` | Intel C/C++ Compiler | Intel OpenMP runtime (`libiomp5.so`) | `ONEDNN_CPU_RUNTIME=OMP` | Clang | Intel OpenMP runtime (`libiomp5.so`) | `ONEDNN_CPU_RUNTIME=TBB` | any | TBB (`libtbb.so`) -| `ONEDNN_CPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`libsycl.so`), TBB (`libtbb.so`), OpenCL loader (`libOpenCL.so`) +| `ONEDNN_CPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`libsycl.so`), TBB (`libtbb.so`), OpenCL loader (`libOpenCL.so`), oneAPI Level Zero loader (`libze_loader.so`) | `ONEDNN_GPU_RUNTIME=OCL` | any | OpenCL loader (`libOpenCL.so`) -| `ONEDNN_GPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`libsycl.so`), OpenCL loader (`libOpenCL.so`), oneAPI Level Zero loader (`libze_loader.so`) +| `ONEDNN_GPU_RUNTIME=L0` | any | oneAPI Level Zero loader (`libze_loader.so`) #### Windows @@ -298,6 +298,7 @@ Runtime-specific dependencies: | `ONEDNN_CPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), TBB (`tbb.dll`), OpenCL loader (`OpenCL.dll`) | `ONEDNN_GPU_RUNTIME=OCL` | any | OpenCL loader (`OpenCL.dll`) | `ONEDNN_GPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), OpenCL loader (`OpenCL.dll`), oneAPI Level Zero loader (`ze_loader.dll`) +| `ONEDNN_GPU_RUNTIME=L0` | any | oneAPI Level Zero loader (`ze_loader.dll`) #### macOS diff --git a/cmake/OpenCL.cmake b/cmake/OpenCL.cmake index 3ab84a1bf03..23bcec0254d 100644 --- a/cmake/OpenCL.cmake +++ b/cmake/OpenCL.cmake @@ 
-25,7 +25,7 @@ set(OpenCL_cmake_included true) if(DNNL_GPU_SYCL AND DNNL_GPU_VENDOR STREQUAL "INTEL") add_definitions_with_host_compiler(-DCL_TARGET_OPENCL_VERSION=300) else() - add_definitions(-DCL_TARGET_OPENCL_VERSION=120) + add_definitions(-DCL_TARGET_OPENCL_VERSION=300) endif() if(OpenCL_INCLUDE_DIR) diff --git a/cmake/options.cmake b/cmake/options.cmake index a707c670e2a..bc5c36ca450 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -287,7 +287,7 @@ set(DNNL_GPU_RUNTIME "NONE" CACHE STRING Using OpenCL for GPU requires setting OPENCLROOT if the libraries are installed in a non-standard location.") -if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL)$") +if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL|L0)$") message(FATAL_ERROR "Unsupported GPU runtime: ${DNNL_GPU_RUNTIME}") endif() diff --git a/doc/build/build_options.md b/doc/build/build_options.md index fabe37c55e6..a29adbe5c4f 100644 --- a/doc/build/build_options.md +++ b/doc/build/build_options.md @@ -7,7 +7,7 @@ oneDNN supports the following build-time options. |:--------------------------------|:----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------| | ONEDNN_LIBRARY_TYPE | **SHARED**, STATIC | Defines the resulting library type | | ONEDNN_CPU_RUNTIME | NONE, **OMP**, TBB, SEQ, THREADPOOL, SYCL | Defines the threading runtime for CPU engines | -| ONEDNN_GPU_RUNTIME | **NONE**, OCL, SYCL | Defines the offload runtime for GPU engines | +| ONEDNN_GPU_RUNTIME | **NONE**, OCL, SYCL, L0 | Defines the offload runtime for GPU engines | | ONEDNN_BUILD_DOC | **ON**, OFF | Controls building the documentation | | ONEDNN_DOC_VERSIONS_JSON | **""**, *string* | Location of JSON file for [PyData Sphinx Theme version switcher]. Enables documentation version switcher when set. 
| | ONEDNN_BUILD_EXAMPLES | **ON**, OFF | Controls building the examples | diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ddb7f745177..b0b6728ef73 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -147,6 +147,9 @@ foreach(f ${sources}) if(NOT DNNL_WITH_SYCL AND ${f_name} MATCHES "^sycl") list(REMOVE_ITEM sources "${f}") endif() + if(NOT DNNL_GPU_RUNTIME STREQUAL "L0" AND ${f_name} MATCHES ".*level_zero") + list(REMOVE_ITEM sources "${f}") + endif() endforeach() # In case of SYCL, skip CPU examples that directly work with raw pointers @@ -202,7 +205,7 @@ foreach(src ${sources}) endif() else() set(cpu_rt_pattern "(SEQ|OMP|TBB|SYCL|DPCPP)") - set(gpu_rt_pattern "(OCL|SYCL|DPCPP)") + set(gpu_rt_pattern "(OCL|L0|SYCL|DPCPP)") if(${example_name} MATCHES "sycl.*") set(cpu_rt_pattern "(SYCL|DPCPP)") set(gpu_rt_pattern "(SYCL|DPCPP)") diff --git a/examples/CMakeLists.txt.in b/examples/CMakeLists.txt.in index 0eec7cdaf41..a5af8163729 100644 --- a/examples/CMakeLists.txt.in +++ b/examples/CMakeLists.txt.in @@ -214,7 +214,7 @@ foreach(src ${sources}) endif() else() set(cpu_rt_pattern "(SEQ|OMP|TBB|SYCL|DPCPP)") - set(gpu_rt_pattern "(OCL|SYCL|DPCPP)") + set(gpu_rt_pattern "(OCL|L0|SYCL|DPCPP)") if(${example_name} MATCHES "sycl.*") set(cpu_rt_pattern "(SYCL|DPCPP)") set(gpu_rt_pattern "(SYCL|DPCPP)") diff --git a/examples/example_utils.h b/examples/example_utils.h index eefd64025bb..c445cdeff68 100644 --- a/examples/example_utils.h +++ b/examples/example_utils.h @@ -29,6 +29,9 @@ #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #include "dnnl_ocl.h" #endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#include "dnnl_l0.h" +#endif #define COMPLAIN_DNNL_ERROR_AND_EXIT(what, status) \ do { \ @@ -160,7 +163,7 @@ static inline void write_to_dnnl_memory(void *handle, dnnl_memory_t mem) { } #endif -#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 if (eng_kind == dnnl_gpu) { void *mapped_ptr = NULL; CHECK(dnnl_memory_map_data(mem, &mapped_ptr)); diff --git a/examples/example_utils.hpp b/examples/example_utils.hpp index 07a7951f912..8791fd17926 100644 --- a/examples/example_utils.hpp +++ b/examples/example_utils.hpp @@ -35,7 +35,11 @@ #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #include "dnnl_ocl.hpp" -#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#include "dnnl_l0.hpp" +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL #include "dnnl_sycl.hpp" #endif @@ -228,7 +232,7 @@ inline void read_from_dnnl_memory(void *handle, dnnl::memory &mem) { return; } #endif -#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 if (eng.get_kind() == dnnl::engine::kind::gpu) { void *mapped_ptr = mem.map_data(); if (mapped_ptr) std::memcpy(handle, mapped_ptr, size); @@ -287,7 +291,7 @@ inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) { return; } #endif -#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 if (eng.get_kind() == dnnl::engine::kind::gpu) { void *mapped_ptr = mem.map_data(); if (mapped_ptr) std::memcpy(mapped_ptr, handle, size); diff --git a/include/dnnl_l0.h b/include/dnnl_l0.h new file mode 100644 index 00000000000..21ac50517c0 --- /dev/null +++ b/include/dnnl_l0.h @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under 
the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_L0_H +#define DNNL_L0_H + +#include "oneapi/dnnl/dnnl_l0.h" + +#endif /* DNNL_L0_H */ diff --git a/include/dnnl_l0.hpp b/include/dnnl_l0.hpp new file mode 100644 index 00000000000..db2d3137b03 --- /dev/null +++ b/include/dnnl_l0.hpp @@ -0,0 +1,22 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef DNNL_L0_HPP +#define DNNL_L0_HPP + +#include "oneapi/dnnl/dnnl_l0.hpp" + +#endif /* DNNL_L0_HPP */ diff --git a/include/oneapi/dnnl/dnnl_config.h.in b/include/oneapi/dnnl/dnnl_config.h.in index 11e1b6d98ad..29e3246b52d 100644 --- a/include/oneapi/dnnl/dnnl_config.h.in +++ b/include/oneapi/dnnl/dnnl_config.h.in @@ -82,6 +82,9 @@ /// DPC++ runtime #define DNNL_RUNTIME_DPCPP DNNL_RUNTIME_SYCL +/// L0 runtime +#define DNNL_RUNTIME_L0 1024u + /// No vendor (corresponding runtime is disabled) #define DNNL_VENDOR_NONE 0u @@ -119,7 +122,8 @@ #endif #if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \ && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL) \ - && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL) + && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL) \ + && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_L0) #error "Unexpected DNNL_GPU_RUNTIME" #endif #if (DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \ @@ -145,9 +149,6 @@ // When defined, DPCPP is supported. #cmakedefine DNNL_WITH_SYCL -// When defined, Level Zero is supported. -#cmakedefine DNNL_WITH_LEVEL_ZERO - // When defined, SYCL CUDA backend is used. #cmakedefine DNNL_SYCL_CUDA diff --git a/include/oneapi/dnnl/dnnl_l0.h b/include/oneapi/dnnl/dnnl_l0.h new file mode 100644 index 00000000000..543453a87e6 --- /dev/null +++ b/include/oneapi/dnnl/dnnl_l0.h @@ -0,0 +1,203 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_L0_H +#define ONEAPI_DNNL_DNNL_L0_H + +#include "oneapi/dnnl/dnnl.h" + +/// @cond DO_NOT_DOCUMENT_THIS +#include "level_zero/ze_api.h" +/// @endcond + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_l0_interop +/// @{ + +/// Creates an engine associated with a Level Zero device and a Level Zero context. +/// +/// @param engine Output engine. +/// @param driver Pointer to the Level Zero driver to use for the engine. +/// @param device Pointer to the Level Zero device to use for the engine. +/// @param context Pointer to the Level Zero context to use for the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_engine_create(dnnl_engine_t *engine, + const ze_driver_handle_t adriver, const ze_device_handle_t adevice, + const ze_context_handle_t acontext); + +/// Returns the Level Zero context associated with an engine. +/// +/// @param engine Engine to query. +/// @param context Pointer to the underlying Level Zero context of the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_context( + dnnl_engine_t engine, ze_context_handle_t context); + +/// Returns the Level Zero device associated with an engine. +/// +/// @param engine Engine to query. +/// @param device Pointer to the underlying Level Zero device of the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_device( + dnnl_engine_t engine, ze_device_handle_t device); + +/// Returns the Level Zero driver associated with an engine. +/// +/// @param engine Engine to query. +/// @param device Pointer to the underlying Level Zero driver of the engine. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_driver( + dnnl_engine_t engine, ze_driver_handle_t driver); + +/// Creates an execution stream for a given engine associated with a Level Zero +/// command list. +/// +/// @param stream Output execution stream. +/// @param engine Engine to create the execution stream on. +/// @param list Level Zero command list to use. +/// @param profiling Flag enabling GPU kernel profiling. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_stream_create(dnnl_stream_t *stream, + dnnl_engine_t engine, ze_command_list_handle_t list, bool profiling); + +/// Returns the Level Zero command list associated with an execution stream. +/// +/// @param stream Execution stream to query. +/// @param list Output Level Zero command list. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_stream_get_list( + dnnl_stream_t stream, ze_command_list_handle_t list); + +/// Creates a memory object. +/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the +/// constructed memory object will have the underlying buffer set. 
In this +/// case, the buffer will be initialized as if: +/// - dnnl_memory_set_data_handle() had been called, if @p memory_kind is equal +/// to dnnl_l0_interop_usm, or +/// - dnnl_l0_interop_memory_set_buffer() has been called, if @p memory_kind +/// is equal to dnnl_l0_interop_buffer. +/// +/// @param memory Output memory object. +/// @param memory_desc Memory descriptor. +/// @param engine Engine to use. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::l0_interop::memory_kind::usm. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_memory_create(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + void *handle); + +/// Creates a memory object with multiple handles. +/// +/// @param memory Output memory object. +/// @param memory_desc Memory descriptor. +/// @param engine Engine to use. +/// @param memory_kind Memory allocation kind to specify the type of handles. +/// @param nhandles Number of handles. +/// @param handles Handles of the memory buffers to use as underlying storages. +/// For each element of the @p handles array the following applies: +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::l0_interop::memory_kind::usm. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_memory_create_v2(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + int nhandles, void **handles); + +/// Returns an Level Zero memory object associated with a memory object. +/// +/// @param memory Memory object. +/// @param mem_object Output Level Zero memory object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_memory_get_mem_object( + const_dnnl_memory_t memory, void **mem_object); + +/// Sets Level Zero memory object associated with a memory object. +/// +/// For behavioral details, see dnnl_memory_set_data_handle(). +/// +/// @param memory Memory object. +/// @param mem_object Level Zero memory object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_memory_set_mem_object( + dnnl_memory_t memory, void *mem_object); + +/// Executes computations specified by the primitive in a specified stream and +/// returns a Level Zero event. +/// +/// @param primitive Primitive to execute. +/// @param stream Stream to use. +/// @param nargs Number of arguments. +/// @param args Array of arguments. Each argument is an +/// pair. 
The index is one of the `DNNL_ARG_*` +/// values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see +/// #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory +/// descriptor as that returned by +/// #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index). +/// @param ndeps Number of dependencies. +/// @param deps A pointer to a vector of size @p ndeps that contains +/// dependencies. +/// @param return_event Output event. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_l0_interop_primitive_execute( + const_dnnl_primitive_t primitive, dnnl_stream_t stream, int nargs, + const dnnl_exec_arg_t *args, int ndeps, const ze_event_handle_t *deps, + ze_event_handle_t *return_event); + +/// @} dnnl_api_l0_interop + +/// @} dnnl_api_interop + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // ONEAPI_DNNL_DNNL_L0_H diff --git a/include/oneapi/dnnl/dnnl_l0.hpp b/include/oneapi/dnnl/dnnl_l0.hpp new file mode 100644 index 00000000000..00762b4af03 --- /dev/null +++ b/include/oneapi/dnnl/dnnl_l0.hpp @@ -0,0 +1,260 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_L0_HPP +#define ONEAPI_DNNL_DNNL_L0_HPP + +#include "oneapi/dnnl/dnnl.hpp" + +/// @cond DO_NOT_DOCUMENT_THIS +#include +#include + +#include "oneapi/dnnl/dnnl_l0.h" +/// @endcond + +/// @addtogroup dnnl_api +/// @{ + +namespace dnnl { + +/// @addtogroup dnnl_api_interop +/// @{ + +/// @addtogroup dnnl_api_l0_interop Level Zero interoperability API +/// API extensions to interact with the underlying Level Zero run-time. +/// +/// @sa @ref dev_guide_dpcpp_interoperability in developer guide +/// @{ + +/// Level Zero interoperability namespace +namespace l0_interop { + +/// Constructs an engine from Level Zero device and context objects. +/// +/// @param adriver Level Zero driver. +/// @param adevice Level Zero device. +/// @param acontext Level Zero context. +/// +/// @returns Created engine. +inline engine make_engine(const ze_driver_handle_t adriver, + const ze_device_handle_t adevice, const ze_context_handle_t acontext) { + dnnl_engine_t aengine; + error::wrap_c_api( + dnnl_l0_interop_engine_create(&aengine, adriver, adevice, acontext), + "could not create an engine"); + return engine(aengine); +} + +/// Returns the Level Zero context associated with an engine. +/// +/// @param aengine Engine to query. +/// +/// @returns The underlying Level Zero device of the engine. +inline ze_context_handle_t get_context(const engine &aengine) { + ze_context_handle_t ctx = nullptr; + error::wrap_c_api(dnnl_l0_interop_engine_get_context(aengine.get(), ctx), + "could not get a context handle"); + return ctx; +} + +/// Returns the Level Zero device associated with an engine. +/// +/// @param aengine Engine to query. 
+/// +/// @returns The underlying Level Zero device of the engine. +inline ze_device_handle_t get_device(const engine &aengine) { + ze_device_handle_t dev = nullptr; + error::wrap_c_api(dnnl_l0_interop_engine_get_device(aengine.get(), dev), + "could not get a device handle"); + return dev; +} + +/// Returns the Level Zero driver associated with an engine. +/// +/// @param aengine Engine to query. +/// +/// @returns The underlying Level Zero driver of the engine. +inline ze_driver_handle_t get_driver(const engine &aengine) { + ze_driver_handle_t dri = nullptr; + error::wrap_c_api(dnnl_l0_interop_engine_get_driver(aengine.get(), dri), + "could not get a driver handle"); + return dri; +} + +/// Creates an execution stream for a given engine associated with a Level Zero +/// command list. +/// +/// @param aengine Engine object to use for the stream. +/// @param alist Level Zero immediate command list to use for the stream. +/// @param aprofiling Flag enabling GPU kernel profiling. +/// +/// @returns An execution stream. +inline stream make_stream(const engine &aengine, ze_command_list_handle_t alist, + bool aprofiling = false) { + dnnl_stream_t astream; + error::wrap_c_api(dnnl_l0_interop_stream_create( + &astream, aengine.get(), alist, aprofiling), + "could not create a stream"); + return stream(astream); +} + +/// Returns the Level Zero immediate command list associated with an execution stream. +/// +/// @param astream Execution stream to query. +/// +/// @returns Level Zero immediate command list object. +inline ze_command_list_handle_t get_list(const stream &astream) { + ze_command_list_handle_t list = nullptr; + error::wrap_c_api(dnnl_l0_interop_stream_get_list(astream.get(), list), + "could not get a stream handle"); + return list; +} + +/// Creates a memory object with multiple handles. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param handles Handles of the memory buffers to use as underlying storages. +/// For each element of the @p handles array the following applies: +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::l0_interop::memory_kind::usm. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// +/// If the @p handles vector is not provided the library will allocate all +/// buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE. +/// +/// @returns Created memory object. +inline memory make_memory(const memory::desc &memory_desc, + const engine &aengine, std::vector<void *> handles = {}) { + if (handles.empty()) { + const int nhandles = memory_desc.get_num_handles(); + handles.resize(nhandles, DNNL_MEMORY_ALLOCATE); + } + + dnnl_memory_t c_memory; + error::wrap_c_api( + dnnl_l0_interop_memory_create_v2(&c_memory, memory_desc.get(), + aengine.get(), handles.size(), handles.data()), + "could not create a memory"); + return memory(c_memory); +} + +/// Creates a memory object. +/// +/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the +/// constructed memory object will have the underlying buffer set.
In this +/// case, the buffer will be initialized as if: +/// - dnnl::memory::set_data_handle() had been called, if @p memory_kind is +/// equal to dnnl::l0_interop::memory_kind::usm, or +/// - dnnl::l0_interop::set_buffer() has been called, if @p memory_kind is +/// equal to dnnl::l0_interop::memory_kind::buffer. +/// +/// @param memory_desc Memory descriptor. +/// @param aengine Engine to use. +/// @param handle Handle of the memory buffer to use as an underlying storage. +/// - A USM pointer to the user-allocated buffer. In this case the library +/// doesn't own the buffer. Requires @p memory_kind to be equal to +/// dnnl::l0_interop::memory_kind::usm. +/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to +/// allocate the buffer for the memory object. In this case the library +/// owns the buffer. +/// - The DNNL_MEMORY_NONE specific value. Instructs the library to +/// create memory object without an underlying buffer. +/// +/// @returns Created memory object. +inline memory make_memory( + const memory::desc &memory_desc, const engine &aengine, void *handle) { + return make_memory(memory_desc, aengine, std::vector<void *> {handle}); +} + +/// Returns the Level Zero memory object associated with the memory object. +/// +/// @param amemory A memory object. +/// @returns Underlying Level Zero memory object. +inline void *get_mem_object(const memory &amemory) { + void *mem_object; + error::wrap_c_api( + dnnl_l0_interop_memory_get_mem_object(amemory.get(), &mem_object), + "could not get Level Zero buffer object from a memory object"); + return mem_object; +} + +/// Sets the Level Zero memory object associated with the memory object. +/// +/// For behavioral details see memory::set_data_handle(). +/// +/// @param amemory A memory object. +/// @param mem_object Level Zero memory object to use as the underlying +/// storage. It must have at least get_desc().get_size() bytes +/// allocated. +inline void set_mem_object(memory &amemory, void *mem_object) { + error::wrap_c_api( + dnnl_l0_interop_memory_set_mem_object(amemory.get(), mem_object), + "could not set Level Zero buffer object from a memory object"); +} + +/// Executes computations specified by the primitive in a specified stream and +/// returns a Level Zero event. +/// +/// Arguments are passed via an arguments map containing +/// <index, memory object> pairs. The index must be one of the `DNNL_ARG_*` +/// values such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor +/// matching the one returned by +/// #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless using +/// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL). +/// +/// @param aprimitive Primitive to execute. +/// @param astream Stream object. The stream must belong to the same engine +/// as the primitive. +/// @param args Arguments map. +/// @param deps Optional vector with `ze_event_handle_t` dependencies. +/// +/// @returns Output event. +inline ze_event_handle_t execute(const dnnl::primitive &aprimitive, + const stream &astream, const std::unordered_map<int, memory> &args, + const std::vector<ze_event_handle_t> &deps = {}) { + std::vector<dnnl_exec_arg_t> c_args; + c_args.reserve(args.size()); + for (const auto &a : args) + c_args.push_back({a.first, a.second.get()}); + + const ze_event_handle_t *c_deps = deps.empty() ?
nullptr : deps.data(); + + ze_event_handle_t return_event; + error::wrap_c_api(dnnl_l0_interop_primitive_execute(aprimitive.get(), + astream.get(), c_args.size(), c_args.data(), + deps.size(), c_deps, &return_event), + "could not execute a primitive"); + return return_event; +} + +} // namespace l0_interop + +/// @} dnnl_api_l0_interop + +/// @} dnnl_api_interop + +} // namespace dnnl + +/// @} dnnl_api + +#endif // ONEAPI_DNNL_DNNL_L0_HPP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 54b3627675c..e392c435c05 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -135,7 +135,7 @@ if(NOT DNNL_GPU_RUNTIME STREQUAL "NONE") add_subdirectory(gpu) endif() -if(DNNL_WITH_SYCL OR DNNL_GPU_RUNTIME STREQUAL "OCL") +if(DNNL_WITH_SYCL OR DNNL_GPU_RUNTIME MATCHES "OCL") add_subdirectory(xpu) endif() diff --git a/src/common/c_types_map.hpp b/src/common/c_types_map.hpp index 0d48d18d946..f29fe78d7e2 100644 --- a/src/common/c_types_map.hpp +++ b/src/common/c_types_map.hpp @@ -1969,6 +1969,7 @@ enum runtime_kind_t { dnnl_runtime_threadpool, dnnl_runtime_ocl, dnnl_runtime_sycl, + dnnl_runtime_l0, }; namespace runtime_kind { @@ -1979,6 +1980,7 @@ const runtime_kind_t tbb = dnnl_runtime_tbb; const runtime_kind_t threadpool = dnnl_runtime_threadpool; const runtime_kind_t ocl = dnnl_runtime_ocl; const runtime_kind_t sycl = dnnl_runtime_sycl; +const runtime_kind_t l0 = dnnl_runtime_l0; } // namespace runtime_kind using primitive_kind_t = dnnl_primitive_kind_t; diff --git a/src/common/engine.cpp b/src/common/engine.cpp index a2f5edae841..647a2b34f74 100644 --- a/src/common/engine.cpp +++ b/src/common/engine.cpp @@ -29,12 +29,16 @@ #include "cpu/cpu_engine.hpp" #endif +#ifdef DNNL_WITH_SYCL +#include "xpu/sycl/engine_factory.hpp" +#endif + #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #include "xpu/ocl/engine_factory.hpp" #endif -#ifdef DNNL_WITH_SYCL -#include "xpu/sycl/engine_factory.hpp" +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#include "gpu/intel/l0/engine_factory.hpp" #endif namespace dnnl { @@ -42,23 +46,27 @@ namespace impl { static inline std::unique_ptr get_engine_factory( engine_kind_t kind, runtime_kind_t runtime_kind) { - #if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE if (kind == engine_kind::cpu && is_native_runtime(runtime_kind)) { return std::unique_ptr( new cpu::cpu_engine_factory_t()); } #endif - +#ifdef DNNL_WITH_SYCL + if (runtime_kind == runtime_kind::sycl) { + return xpu::sycl::get_engine_factory(kind); + } +#endif #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL if (kind == engine_kind::gpu && runtime_kind == runtime_kind::ocl) { return std::unique_ptr( new xpu::ocl::engine_factory_t(kind)); } #endif -#ifdef DNNL_WITH_SYCL - if (runtime_kind == runtime_kind::sycl) - return xpu::sycl::get_engine_factory(kind); +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 + if (kind == engine_kind::gpu && runtime_kind == runtime_kind::l0) { + return gpu::intel::l0::get_engine_factory(kind); + } #endif return nullptr; } diff --git a/src/common/engine.hpp b/src/common/engine.hpp index 3fa59ce22b2..330df8d5370 100644 --- a/src/common/engine.hpp +++ b/src/common/engine.hpp @@ -189,6 +189,8 @@ inline runtime_kind_t get_default_runtime(engine_kind_t kind) { if (kind == engine_kind::gpu) return runtime_kind::ocl; #elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL if (kind == engine_kind::gpu) return runtime_kind::sycl; +#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 + if (kind == engine_kind::gpu) return runtime_kind::l0; #endif #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SEQ return runtime_kind::seq; diff --git a/src/common/utils.cpp 
b/src/common/utils.cpp index 3d5a3d2e25f..aaab751b31d 100644 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -291,7 +291,8 @@ std::string get_jit_profiling_jitdumpdir() { bool is_destroying_cache_safe() { #if defined(_WIN32) \ - && (defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL) + && (defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 \ + || DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL) // The ntdll.dll library is located in system32, therefore setting // additional environment is not required. HMODULE handle = LoadLibraryExA( diff --git a/src/gpu/intel/CMakeLists.txt b/src/gpu/intel/CMakeLists.txt index 61ae11dfcf2..52feedc6b67 100644 --- a/src/gpu/intel/CMakeLists.txt +++ b/src/gpu/intel/CMakeLists.txt @@ -58,9 +58,12 @@ add_subdirectory(jit) if(DNNL_GPU_RUNTIME STREQUAL "OCL") add_subdirectory(ocl) +elseif(DNNL_GPU_RUNTIME STREQUAL "L0") + add_subdirectory(l0) elseif(DNNL_WITH_SYCL) add_subdirectory(sycl) add_subdirectory(ocl) + add_subdirectory(l0/utils) endif() set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel) diff --git a/src/gpu/intel/compute/ukernels.cpp b/src/gpu/intel/compute/ukernels.cpp index ae0947c5752..98f03b3d6d1 100644 --- a/src/gpu/intel/compute/ukernels.cpp +++ b/src/gpu/intel/compute/ukernels.cpp @@ -16,14 +16,19 @@ #include "gpu/intel/compute/ukernels.hpp" +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL +#include "gpu/intel/sycl/engine.hpp" +#include "gpu/intel/sycl/utils.hpp" +#endif + #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #include "gpu/intel/ocl/engine.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #endif -#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL -#include "gpu/intel/sycl/engine.hpp" -#include "gpu/intel/sycl/utils.hpp" +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/utils/utils.hpp" #endif namespace dnnl { @@ -51,6 +56,11 @@ bool mayiuse_microkernels(const engine_t *engine) { auto mayiuse_mk = [](const engine_t *engine) { switch (engine->runtime_kind()) { +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL + case runtime_kind::sycl: + return sycl::mayiuse_microkernels( + utils::downcast(engine)); +#endif #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL case runtime_kind::ocl: { auto *ocl_engine @@ -60,10 +70,10 @@ bool mayiuse_microkernels(const engine_t *engine) { cl_microkernels_check_kernel_code); } #endif -#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL - case runtime_kind::sycl: - return sycl::mayiuse_microkernels( - utils::downcast(engine)); +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 + case runtime_kind::l0: + return utils::downcast(engine) + ->mayiuse_microkernels(); #endif default: return false; } diff --git a/src/gpu/intel/compute/utils.cpp b/src/gpu/intel/compute/utils.cpp new file mode 100644 index 00000000000..27fddb7154e --- /dev/null +++ b/src/gpu/intel/compute/utils.cpp @@ -0,0 +1,119 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "gpu/intel/compute/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace compute { + +status_t preprocess_headers(stringstream_t &pp_code, const char *code, + const compute::kernel_ctx_t &kernel_ctx) { + stringstream_t code_stream(code); + + for (std::string line; std::getline(code_stream, line);) { + const size_t include_pos = line.find("#include"); + if (include_pos != std::string::npos) { + static constexpr size_t include_len = 8; + const size_t first_quote_pos + = line.find("\"", include_pos + include_len); + const size_t second_quote_pos + = line.find("\"", first_quote_pos + 1); + const size_t kernel_name_len + = second_quote_pos - first_quote_pos - 1; + const auto header_name + = line.substr(first_quote_pos + 1, kernel_name_len); + const char *header_source + = kernel_ctx.get_custom_header(header_name); + if (!header_source) header_source = get_kernel_header(header_name); + CHECK(preprocess_headers(pp_code, header_source, kernel_ctx)); + } else { + pp_code << line << std::endl; + } + } + return status::success; +} + +void debugdump_processed_source(const std::string &source, + const std::string &options, const std::string &cl_options) { +#if defined(__linux__) && defined(DNNL_DEV_MODE) + if (get_verbose(verbose_t::debuginfo) >= 10) { + auto get_defines = [](const std::string &from) { + std::string ret; + size_t pos = 0; + while (pos < from.length()) { + // Find next define argument + pos = from.find("-D", pos); + + // Generate argument, quotes are interpreted literally, but + // other special shell characters need escaped. Does not + // currently handle quotes with the ' character or nested quotes + char quote_parity = true; + while (pos < from.length()) { + if (quote_parity + && utils::one_of(from[pos], '~', '#', '$', '&', '*', + '(', ')', '\\', '|', '[', ']', '{', '}', + ';', '\'', '<', '>', '/', '?', '!')) { + ret += '\\'; + } + ret += from[pos]; + if (from[pos] == '"') quote_parity ^= true; + if (from[pos] == ' ' && quote_parity) break; + + pos++; + } + } + return ret; + }; + auto execute_command + = [](const std::string &cmd, const std::string &stdin) { + std::string result; + std::array buffer; + FILE *pipe = popen(cmd.c_str(), "w"); + fputs(stdin.c_str(), pipe); + if (pipe) { + while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { + result += buffer.data(); + } + } + pclose(pipe); + return result; + }; + + // Run utilities to evaluate preprocessor defines and format the file + // Theoretically, we can accomplish this task with libclang, but it + // seems more work than it is worth. Instead, wrapping this in OCL_DEBUG + // so that calls to the system are not included in the default build. 
+ + // Due to the use of a different C preprocessor, warnings should not be + // ignored, as they may correspond to a different behavior in the OpenCL + // C preprocessor + auto o = get_defines(options) + get_defines(cl_options); + std::string preprocess_cmd + = std::string() + "cpp -P " + o + " | clang-format"; + execute_command(preprocess_cmd, source); + std::cout << "OCL_ARCH_OPTIONS: " << cl_options << std::endl; + } +#endif +} + +} // namespace compute +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/compute/utils.hpp b/src/gpu/intel/compute/utils.hpp index 564308ad0b3..dc29eb20985 100644 --- a/src/gpu/intel/compute/utils.hpp +++ b/src/gpu/intel/compute/utils.hpp @@ -17,14 +17,8 @@ #ifndef GPU_INTEL_COMPUTE_UTILS_HPP #define GPU_INTEL_COMPUTE_UTILS_HPP -#include -#include -#include -#include -#include - #include "common/utils.hpp" -#include "gpu/intel/compute/device_info.hpp" +#include "gpu/intel/compute/kernel_ctx.hpp" #include "gpu/intel/utils.hpp" namespace dnnl { @@ -155,6 +149,12 @@ class nd_range_t { range_t local_range_; }; +status_t preprocess_headers(stringstream_t &pp_code, const char *code, + const compute::kernel_ctx_t &kernel_ctx); + +void debugdump_processed_source(const std::string &source, + const std::string &options, const std::string &ocl_options); + } // namespace compute } // namespace intel } // namespace gpu diff --git a/src/gpu/intel/gemm/jit/dsl/ir/codegen/codegen.cpp b/src/gpu/intel/gemm/jit/dsl/ir/codegen/codegen.cpp index f1e90255b48..bfca1e9e4fa 100644 --- a/src/gpu/intel/gemm/jit/dsl/ir/codegen/codegen.cpp +++ b/src/gpu/intel/gemm/jit/dsl/ir/codegen/codegen.cpp @@ -40,6 +40,9 @@ #ifdef GEMMSTONE_WITH_OPENCL_RUNTIME #include "ngen_opencl.hpp" #endif +#ifdef GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME +#include "ngen_level_zero.hpp" +#endif GEMMSTONE_NAMESPACE_START namespace dsl { @@ -1862,6 +1865,54 @@ cl_kernel make_kernel( return {}; } #endif +#ifdef GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME +template +using l0_gen_t = ir::ir_to_ngen_generator_t>; +GEMMSTONE_XELP_ISA( + template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); +GEMMSTONE_XEHP_ISA( + template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); +GEMMSTONE_XEHPG_ISA( + template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); +GEMMSTONE_XEHPC_ISA( + template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); +GEMMSTONE_XE2_ISA(template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); +GEMMSTONE_XE3_ISA(template void ir::convert_ir_to_ngen>( + const stmt_t &body, l0_gen_t &host, + const walk_order_t *kernel_grid_walk_order)); + +std::pair make_kernel( + const kernel_t &ir_kernel, ze_context_handle_t ctx, + ze_device_handle_t dev) { + auto &iface = ir_kernel.iface; + auto &options = ir_kernel.options; + auto &body = ir_kernel.body; + auto &debug_cfg = ir_kernel.debug_cfg; + + ngen::NEOInterfaceHandler interface = generate_ngen_interface( + iface, options, body); + +#define GPU_HW_CASE(hw) \ + l0_gen_t<(hw)> g(iface, options, debug_cfg); \ + g.setInterface(std::move(interface)); \ + convert_ir_to_ngen(body, g); \ + return g.getKernel(ctx, dev); + + GPU_HW_SWITCH(options.hw().ngen_hw()); +#undef GPU_HW_CASE + return {}; +} 
+#endif } // namespace dsl GEMMSTONE_NAMESPACE_END diff --git a/src/gpu/intel/gemm/jit/dsl/runtime.cpp b/src/gpu/intel/gemm/jit/dsl/runtime.cpp index cc1f5c35587..3fdc564b285 100644 --- a/src/gpu/intel/gemm/jit/dsl/runtime.cpp +++ b/src/gpu/intel/gemm/jit/dsl/runtime.cpp @@ -154,4 +154,18 @@ cl_kernel make_kernel( } #endif +#ifdef GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME +std::pair make_kernel( + const GEMMKernelDesc &desc, ze_context_handle_t context, + ze_device_handle_t device) { + if (desc.strategy.isDSLGenerator) { + generator_dsl_desc_t dsl_desc( + desc.problem, desc.strategy, desc.iface, desc.options); + auto dsl_kernel = make_kernel(dsl_desc); + return dsl::make_kernel(dsl_kernel, context, device); + } + stub(); +} +#endif + GEMMSTONE_NAMESPACE_END diff --git a/src/gpu/intel/gemm/jit/include/gemmstone/dsl/runtime.hpp b/src/gpu/intel/gemm/jit/include/gemmstone/dsl/runtime.hpp index ff70527a6d8..7ba9db8e24a 100644 --- a/src/gpu/intel/gemm/jit/include/gemmstone/dsl/runtime.hpp +++ b/src/gpu/intel/gemm/jit/include/gemmstone/dsl/runtime.hpp @@ -48,6 +48,11 @@ ::sycl::kernel make_kernel( #ifdef GEMMSTONE_WITH_OPENCL_RUNTIME cl_kernel make_kernel(const kernel_t &kernel, cl_context ctx, cl_device_id dev); #endif +#ifdef GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME +std::pair make_kernel( + const kernel_t &kernel, ze_context_handle_t ctx, + ze_device_handle_t dev); +#endif } // namespace dsl GEMMSTONE_NAMESPACE_END diff --git a/src/gpu/intel/gemm/jit/include/gemmstone/generator.hpp b/src/gpu/intel/gemm/jit/include/gemmstone/generator.hpp index 4803dcec586..4576dccac05 100644 --- a/src/gpu/intel/gemm/jit/include/gemmstone/generator.hpp +++ b/src/gpu/intel/gemm/jit/include/gemmstone/generator.hpp @@ -46,7 +46,15 @@ GEMMSTONE_NAMESPACE_START #ifndef GENERATOR_BASE #define GENERATOR_SUPER(hw) ngen::OpenCLCodeGenerator +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL +#define FORWARD(hw) NGEN_FORWARD_SYCL(hw); +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #define FORWARD(hw) NGEN_FORWARD_OPENCL(hw) +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#define FORWARD(hw) NGEN_FORWARD_LEVEL_ZERO(hw); +#endif #define GENERATOR_DEBUGINFO {__FILE__, __LINE__} #define GENERATOR_BASE(hw) GENERATOR_SUPER(hw) diff --git a/src/gpu/intel/gemm/jit/include/gemmstone/runtime.hpp b/src/gpu/intel/gemm/jit/include/gemmstone/runtime.hpp index 4d30d8b12d3..760a7eb3127 100644 --- a/src/gpu/intel/gemm/jit/include/gemmstone/runtime.hpp +++ b/src/gpu/intel/gemm/jit/include/gemmstone/runtime.hpp @@ -70,5 +70,10 @@ std::vector make_binary(const GEMMKernelDesc &desc, cl_device_id device cl_kernel make_kernel(const GEMMKernelDesc &desc, cl_device_id device, cl_context context); #endif +#ifdef GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME +std::vector make_binary(const GEMMKernelDesc &desc, ze_device_handle_t device, ze_context_handle_t context); +std::pair make_kernel(const GEMMKernelDesc &desc, ze_device_handle_t device, ze_context_handle_t context); +#endif + GEMMSTONE_NAMESPACE_END #endif diff --git a/src/gpu/intel/jit/binary_format.cpp b/src/gpu/intel/jit/binary_format.cpp index 6460b2e8261..62b03b78afe 100644 --- a/src/gpu/intel/jit/binary_format.cpp +++ b/src/gpu/intel/jit/binary_format.cpp @@ -39,6 +39,16 @@ #define MAGICSIZEY 2 #define MAGICSIZEZ 1 +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL +#define FORWARD(hw) NGEN_FORWARD_SYCL(hw); +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL +#define FORWARD(hw) NGEN_FORWARD_OPENCL(hw) +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#define FORWARD(hw) NGEN_FORWARD_LEVEL_ZERO(hw); +#endif + namespace dnnl 
{ namespace impl { namespace gpu { @@ -49,7 +59,7 @@ using namespace ngen; template class binary_format_kernel_t : public generator_t { - NGEN_FORWARD_OPENCL(hw); + FORWARD(hw); public: binary_format_kernel_t() diff --git a/src/gpu/intel/jit/config/gemmstone_config.hpp b/src/gpu/intel/jit/config/gemmstone_config.hpp index 30b2373eaef..6c96640f44f 100644 --- a/src/gpu/intel/jit/config/gemmstone_config.hpp +++ b/src/gpu/intel/jit/config/gemmstone_config.hpp @@ -40,6 +40,9 @@ #elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL #define OPENCL_OUTPUT #define GEMMSTONE_WITH_OPENCL_RUNTIME +#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#define ZEBIN_OUTPUT +#define GEMMSTONE_WITH_LEVEL_ZERO_RUNTIME #endif #if !defined(NDEBUG) || defined(DNNL_DEV_MODE) diff --git a/src/gpu/intel/jit/generator.hpp b/src/gpu/intel/jit/generator.hpp index 778db6d2515..58cab363db6 100644 --- a/src/gpu/intel/jit/generator.hpp +++ b/src/gpu/intel/jit/generator.hpp @@ -43,6 +43,12 @@ #include "ngen_opencl.hpp" #endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/kernel.hpp" +#include "ngen_level_zero.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -90,6 +96,11 @@ template using ngen_code_generator_t = ngen::OpenCLCodeGenerator; #endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 +template +using ngen_code_generator_t = ngen::LevelZeroCodeGenerator; +#endif + void check_kernel_size(const std::string &kernel_name, size_t kernel_size, const intel::engine_t *engine); @@ -125,6 +136,15 @@ class generator_t : public ngen_code_generator_t, public generator_base_t { auto ocl_kernel = ngen_code_generator_t::getKernel( ocl_engine->context(), ocl_engine->device()); return ocl::kernel_t::make(kernel, ocl_kernel, {}); +#endif +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 + auto *l0_engine = utils::downcast(engine); + auto l0_module_kernel = ngen_code_generator_t::getKernel( + l0_engine->context(), l0_engine->device()); + auto l0_module = std::make_shared( + l0_module_kernel.first); + return l0::kernel_t::make( + kernel, l0_module, l0_module_kernel.second, kernel_name()); #endif } }; diff --git a/src/gpu/intel/l0/CMakeLists.txt b/src/gpu/intel/l0/CMakeLists.txt new file mode 100644 index 00000000000..76df8e68982 --- /dev/null +++ b/src/gpu/intel/l0/CMakeLists.txt @@ -0,0 +1,24 @@ +#=============================================================================== +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +file(GLOB_RECURSE SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp + ) + +set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_l0) +add_library(${OBJ_LIB} OBJECT ${SOURCES}) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/gpu/intel/l0/capi/engine.cpp b/src/gpu/intel/l0/capi/engine.cpp new file mode 100644 index 00000000000..907c3dbbb51 --- /dev/null +++ b/src/gpu/intel/l0/capi/engine.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dnnl/dnnl_l0.h" + +#include "common/utils.hpp" +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/engine_factory.hpp" + +using namespace dnnl::impl; + +dnnl_status_t dnnl_l0_interop_engine_create(dnnl_engine_t *engine, + const ze_driver_handle_t adriver, const ze_device_handle_t adevice, + const ze_context_handle_t acontext) { + bool args_ok = !utils::any_null(engine, adriver, adevice, acontext); + if (!args_ok) return status::invalid_arguments; + + gpu::intel::l0::engine_factory_t f(engine_kind::gpu); + + size_t index; + CHECK(gpu::intel::l0::get_device_index(adevice, &index)); + + return f.engine_create(engine, adriver, adevice, acontext, index); +} + +dnnl_status_t dnnl_l0_interop_engine_get_context( + dnnl_engine_t engine, ze_context_handle_t context) { + bool args_ok = !utils::any_null(engine, context) + && (engine->runtime_kind() == runtime_kind::l0); + if (!args_ok) return status::invalid_arguments; + + auto *l0_engine = utils::downcast(engine); + context = l0_engine->context(); + + return status::success; +} + +dnnl_status_t dnnl_l0_interop_engine_get_device( + dnnl_engine_t engine, ze_device_handle_t device) { + bool args_ok = !utils::any_null(engine, device) + && (engine->runtime_kind() == runtime_kind::l0); + if (!args_ok) return status::invalid_arguments; + + auto *l0_engine = utils::downcast(engine); + device = l0_engine->device(); + + return status::success; +} + +dnnl_status_t dnnl_l0_interop_engine_get_driver( + dnnl_engine_t engine, ze_driver_handle_t driver) { + bool args_ok = !utils::any_null(engine, driver) + && (engine->runtime_kind() == runtime_kind::l0); + if (!args_ok) return status::invalid_arguments; + + auto *l0_engine = utils::downcast(engine); + driver = l0_engine->driver(); + + return status::success; +} diff --git a/src/gpu/intel/l0/capi/memory.cpp b/src/gpu/intel/l0/capi/memory.cpp new file mode 100644 index 00000000000..5eacebb6a3d --- /dev/null +++ b/src/gpu/intel/l0/capi/memory.cpp @@ -0,0 +1,128 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dnnl/dnnl_l0.h" + +#include "common/utils.hpp" +#include "gpu/intel/l0/memory_storage.hpp" + +using namespace dnnl::impl; + +dnnl_status_t DNNL_API dnnl_l0_interop_memory_create(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + void *handle) { + bool ok = !utils::any_null(memory, memory_desc, engine) + && engine->runtime_kind() == runtime_kind::l0; + if (!ok) return status::invalid_arguments; + + auto *l0_engine = utils::downcast(engine); + auto kind = gpu::intel::l0::get_memory_storage_kind( + gpu::intel::l0::get_pointer_type(l0_engine->context(), handle)); + if (handle != DNNL_MEMORY_NONE && handle != DNNL_MEMORY_ALLOCATE + && kind == gpu::intel::l0::memory_storage_kind_t::unknown + && !engine->mayiuse_system_memory_allocators()) + return status::invalid_arguments; + + const auto mdw = memory_desc_wrapper(memory_desc); + if (mdw.format_any() || mdw.has_runtime_dims_or_strides()) + return status::invalid_arguments; + + unsigned flags = (handle == DNNL_MEMORY_ALLOCATE) + ? memory_flags_t::alloc + : memory_flags_t::use_runtime_ptr; + handle = (handle == DNNL_MEMORY_ALLOCATE) ? nullptr : handle; + + std::unique_ptr mem_storage; + mem_storage.reset(new gpu::intel::l0::memory_storage_t( + engine, gpu::intel::l0::memory_storage_kind_t::device)); + if (!mem_storage) return status::out_of_memory; + + CHECK(mem_storage->init( + flags, dnnl_memory_desc_get_size(memory_desc), handle)); + + return safe_ptr_assign( + *memory, new memory_t(engine, memory_desc, std::move(mem_storage))); +} + +dnnl_status_t DNNL_API dnnl_l0_interop_memory_create_v2(dnnl_memory_t *memory, + const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, + int nhandles, void **handles) { + bool ok = !utils::any_null(memory, memory_desc, engine, handles) + && nhandles > 0 && engine->runtime_kind() == runtime_kind::l0; + if (!ok) return status::invalid_arguments; + + const auto mdw = memory_desc_wrapper(memory_desc); + if (mdw.format_any() || mdw.has_runtime_dims_or_strides()) + return status::invalid_arguments; + + std::vector flags_vec(nhandles); + std::vector handles_vec(nhandles); + for (int i = 0; i < nhandles; i++) { + unsigned f = (handles[i] == DNNL_MEMORY_ALLOCATE) + ? memory_flags_t::alloc + : memory_flags_t::use_runtime_ptr; + void *h = (handles[i] == DNNL_MEMORY_ALLOCATE) ? 
nullptr : handles[i];
+        flags_vec[i] = f;
+        handles_vec[i] = h;
+    }
+
+    auto *l0_engine = utils::downcast<gpu::intel::l0::engine_t *>(engine);
+    std::vector<std::unique_ptr<memory_storage_t>> mem_storages(nhandles);
+    for (int i = 0; i < nhandles; i++) {
+        auto kind = gpu::intel::l0::get_memory_storage_kind(
+                gpu::intel::l0::get_pointer_type(
+                        l0_engine->context(), handles[i]));
+        if (handles[i] != DNNL_MEMORY_NONE && handles[i] != DNNL_MEMORY_ALLOCATE
+                && kind == gpu::intel::l0::memory_storage_kind_t::unknown
+                && !engine->mayiuse_system_memory_allocators()) {
+            return status::invalid_arguments;
+        }
+        size_t sz = dnnl_memory_desc_get_size_v2(memory_desc, i);
+        mem_storages[i].reset(new gpu::intel::l0::memory_storage_t(
+                engine, gpu::intel::l0::memory_storage_kind_t::device));
+        if (!mem_storages[i]) return status::out_of_memory;
+        CHECK(mem_storages[i]->init(flags_vec[i], sz, handles_vec[i]));
+    }
+
+    return safe_ptr_assign(*memory,
+            new memory_t(engine, memory_desc, std::move(mem_storages)));
+}
+
+dnnl_status_t DNNL_API dnnl_l0_interop_memory_get_mem_object(
+        const memory_t *memory, void **mem_object) {
+    if (utils::any_null(mem_object)) return status::invalid_arguments;
+
+    if (!memory) {
+        *mem_object = nullptr;
+        return status::success;
+    }
+    bool args_ok = (memory->engine()->runtime_kind() == runtime_kind::l0);
+    if (!args_ok) return status::invalid_arguments;
+
+    void *handle;
+    status_t status = memory->get_data_handle(&handle);
+    if (status == status::success) *mem_object = handle;
+
+    return status;
+}
+
+dnnl_status_t DNNL_API dnnl_l0_interop_memory_set_mem_object(
+        memory_t *memory, void *mem_object) {
+    bool args_ok = (memory->engine()->runtime_kind() == runtime_kind::l0);
+    if (!args_ok) return status::invalid_arguments;
+
+    return memory->set_data_handle(mem_object);
+}
diff --git a/src/gpu/intel/l0/capi/primitive.cpp b/src/gpu/intel/l0/capi/primitive.cpp
new file mode 100644
index 00000000000..e48ab655d1d
--- /dev/null
+++ b/src/gpu/intel/l0/capi/primitive.cpp
@@ -0,0 +1,67 @@
+/*******************************************************************************
+* Copyright 2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/ + +#include "oneapi/dnnl/dnnl_l0.h" + +#include "common/primitive_desc_iface.hpp" +#include "common/primitive_iface.hpp" +#include "common/utils.hpp" +#include "gpu/intel/l0/stream.hpp" + +using namespace dnnl::impl; + +dnnl_status_t dnnl_l0_interop_primitive_execute( + const primitive_iface_t *primitive_iface, dnnl_stream_t stream, + int nargs, const dnnl_exec_arg_t *args, int ndeps, + const ze_event_handle_t *deps, ze_event_handle_t *return_event) { + const bool ok = !utils::any_null(primitive_iface, stream) + && primitive_iface->engine() == stream->engine() + && primitive_iface->engine()->runtime_kind() == runtime_kind::l0 + && IMPLICATION(nargs > 0, args != nullptr) + && IMPLICATION(ndeps > 0, deps != nullptr); + if (!ok) return status::invalid_arguments; + + auto *l0_stream = utils::downcast(stream); + stream->before_exec_hook(); + + if (deps != nullptr) { + std::vector events(ndeps); + for (int i = 0; i < ndeps; i++) + events[i] = deps[i]; + l0_stream->l0_ctx().set_deps(events); + } + + // run primitive + exec_args_t exec_args; + CHECK(cvt_primitive_args(primitive_iface->pd()->impl().get(), + static_cast(nargs), args, exec_args)); + + exec_ctx_t ctx(stream, std::move(exec_args)); + CHECK(primitive_execute(primitive_iface, ctx)); + + // return output event + if (return_event != nullptr) { + if (l0_stream->impl()->flags() & stream_flags::in_order) { + *return_event = nullptr; + } else { + *return_event = l0_stream->get_output_event(); + } + } + + stream->after_exec_hook(); + + return status::success; +} diff --git a/src/gpu/intel/l0/capi/stream.cpp b/src/gpu/intel/l0/capi/stream.cpp new file mode 100644 index 00000000000..44a62a29d87 --- /dev/null +++ b/src/gpu/intel/l0/capi/stream.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#include "oneapi/dnnl/dnnl_l0.h"
+
+#include "common/utils.hpp"
+#include "gpu/intel/l0/stream.hpp"
+
+using namespace dnnl::impl;
+
+dnnl_status_t dnnl_l0_interop_stream_create(dnnl_stream_t *stream,
+        dnnl_engine_t engine, ze_command_list_handle_t list, bool profiling) {
+    bool args_ok = !utils::any_null(stream, engine, list)
+            && engine->runtime_kind() == runtime_kind::l0;
+    if (!args_ok) return status::invalid_arguments;
+
+    unsigned flags = stream_flags::default_flags;
+    if (profiling) {
+#ifdef DNNL_EXPERIMENTAL_PROFILING
+        flags |= stream_flags::profiling;
+#endif
+    }
+
+    std::unique_ptr<gpu::intel::l0::stream_impl_t> stream_impl(
+            new gpu::intel::l0::stream_impl_t(flags, list));
+    if (!stream_impl) return status::out_of_memory;
+
+    CHECK(engine->create_stream(stream, stream_impl.get()));
+    stream_impl.release();
+
+    return status::success;
+}
+
+dnnl_status_t dnnl_l0_interop_stream_get_list(
+        dnnl_stream_t stream, ze_command_list_handle_t *list) {
+    bool args_ok = !utils::any_null(list, stream)
+            && stream->engine()->runtime_kind() == runtime_kind::l0;
+    if (!args_ok) return status::invalid_arguments;
+
+    auto *l0_stream = utils::downcast<gpu::intel::l0::stream_t *>(stream);
+    *list = l0_stream->list();
+
+    return status::success;
+}
diff --git a/src/gpu/intel/l0/compiler.hpp b/src/gpu/intel/l0/compiler.hpp
new file mode 100644
index 00000000000..f49659cdca3
--- /dev/null
+++ b/src/gpu/intel/l0/compiler.hpp
@@ -0,0 +1,80 @@
+/*******************************************************************************
+* Copyright 2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/ + +#ifndef GPU_INTEL_L0_COMPILER_HPP +#define GPU_INTEL_L0_COMPILER_HPP + +#include "xpu/ocl/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +static inline cl_program ocl_compile(const cl_device_id device, + const cl_context context, const char *kernel_code, + const char *options) { + cl_int err; + cl_program program = xpu::ocl::clCreateProgramWithSource( + context, 1, &kernel_code, nullptr, &err); + if (err != CL_SUCCESS) return nullptr; + err = xpu::ocl::clBuildProgram( + program, 1, &device, options, nullptr, nullptr); + if (err != CL_SUCCESS) { + xpu::ocl::clReleaseProgram(program); + return nullptr; + } + return program; +} + +inline bool ocl_mayiuse_microkernels(const cl_device_id device, + const cl_context context, const char *kernel_code) { + cl_program program = ocl_compile(device, context, kernel_code, ""); + if (program) { + xpu::ocl::clReleaseProgram(program); + return true; + } + return false; +} + +inline status_t ocl_build_kernels(const cl_device_id device, + const cl_context context, const char *kernel_code, const char *options, + xpu::binary_t &binary) { + cl_program program = ocl_compile(device, context, kernel_code, options); + if (!program) return status::runtime_error; + + size_t binary_size = 0; + OCL_CHECK(xpu::ocl::clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, + sizeof(binary_size), &binary_size, nullptr)); + + binary.resize(binary_size); + auto binary_data = binary.data(); + OCL_CHECK(xpu::ocl::clGetProgramInfo(program, CL_PROGRAM_BINARIES, + sizeof(binary_data), &binary_data, nullptr)); + + OCL_CHECK(xpu::ocl::clReleaseProgram(program)); + + return status::success; +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_COMPILER_HPP diff --git a/src/gpu/intel/l0/context.hpp b/src/gpu/intel/l0/context.hpp new file mode 100644 index 00000000000..902fc4ff778 --- /dev/null +++ b/src/gpu/intel/l0/context.hpp @@ -0,0 +1,103 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_INTEL_L0_CONTEXT_HPP +#define GPU_INTEL_L0_CONTEXT_HPP + +#include "gpu/intel/l0/utils/utils.hpp" +#include "xpu/context.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +struct event_t : public xpu::event_t { + event_t() = default; + event_t(const event_t &) = default; + event_t(const std::vector &event) : events_(event) {} + event_t(std::vector &&event) + : events_(std::move(event)) {} + event_t(ze_event_handle_t &&event) { + events_.emplace_back(std::move(event)); + } + ~event_t() override = default; + + event_t &operator=(event_t &&other) { + std::swap(events_, other.events_); + return *this; + } + event_t &operator=(const event_t &other) { + events_ = other.events_; + return *this; + } + + const ze_event_handle_t &operator[](size_t i) const { return events_[i]; } + ze_event_handle_t &operator[](size_t i) { return events_[i]; } + size_t size() const { return events_.size(); } + + static event_t &from(xpu::event_t &event) { + return *utils::downcast(&event); + } + static const event_t &from(const xpu::event_t &event) { + return *utils::downcast(&event); + } + std::unique_ptr clone() const override { + return std::unique_ptr(new event_t(*this)); + } + void append(const xpu::event_t &event) { + auto &other = *utils::downcast(&event); + events_.insert( + events_.end(), other.events_.begin(), other.events_.end()); + } + + std::vector events_; +}; + +class context_t final : public xpu::context_t { +public: + context_t() = default; + ~context_t() override = default; + + context_t &operator=(const context_t &other) { + events_ = other.events_; + return *this; + } + void set_deps(std::vector &&event) { + events_ = event_t(event); + } + void set_deps(event_t &&events) { events_ = std::move(events); } + + xpu::event_t &get_deps() override { return events_; } + const xpu::event_t &get_deps() const override { return events_; } + void append_deps(const xpu::event_t &event) override { + events_.append(event); + } + + status_t get_event(ze_event_handle_t *new_event); + +private: + event_t events_; +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_CONTEXT_HPP diff --git a/src/gpu/intel/l0/device_info.cpp b/src/gpu/intel/l0/device_info.cpp new file mode 100644 index 00000000000..42b9e042363 --- /dev/null +++ b/src/gpu/intel/l0/device_info.cpp @@ -0,0 +1,148 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#include "gpu/intel/l0/device_info.hpp"
+#include "gpu/intel/l0/compiler.hpp"
+#include "gpu/intel/l0/engine.hpp"
+#include "ngen_level_zero.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace l0 {
+
+status_t device_info_t::init_arch(impl::engine_t *engine) {
+    auto *l0_engine = utils::downcast<l0::engine_t *>(engine);
+    auto context = l0_engine->context();
+    auto device = l0_engine->device();
+
+    return init_gpu_hw_info(engine, device, context, ip_version_, gpu_arch_,
+            gpu_product_, native_extensions_, mayiuse_systolic_,
+            mayiuse_ngen_kernels_);
+}
+
+status_t device_info_t::init_device_name(impl::engine_t *engine) {
+    auto *l0_engine = utils::downcast<l0::engine_t *>(engine);
+    auto device = l0_engine->device();
+
+    ze_device_properties_t device_properties = {};
+    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    device_properties.pNext = nullptr;
+
+    CHECK(l0::zeDeviceGetProperties(device, &device_properties));
+    name_ = std::string(device_properties.name);
+
+    return status::success;
+}
+
+status_t device_info_t::init_runtime_version(impl::engine_t *engine) {
+    auto *l0_engine = utils::downcast<l0::engine_t *>(engine);
+    auto driver = l0_engine->driver();
+
+    ze_driver_properties_t driver_properties = {};
+    driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES;
+    driver_properties.pNext = nullptr;
+
+    l0::zeDriverGetProperties(driver, &driver_properties);
+
+    runtime_version_.major
+            = (driver_properties.driverVersion & 0xFF000000) >> 24;
+    runtime_version_.minor
+            = (driver_properties.driverVersion & 0x00FF0000) >> 16;
+    runtime_version_.build = driver_properties.driverVersion & 0x0000FFFF;
+
+    return status::success;
+}
+
+status_t device_info_t::init_extensions(impl::engine_t *engine) {
+    std::string extension_string;
+    // TODO: using OpenCL runtime because Level Zero runtime does not provide
+    // this information.
+ auto *l0_engine = utils::downcast(engine); + CHECK(xpu::ocl::get_extensions(l0_engine->ocl_device(), extension_string)); + + for (uint64_t i_ext = 1; i_ext < (uint64_t)compute::device_ext_t::last; + i_ext <<= 1) { + const char *s_ext = ext2cl_str((compute::device_ext_t)i_ext); + + if (s_ext && extension_string.find(s_ext) != std::string::npos) { + extensions_ |= i_ext; + } + } + + extensions_ + |= (uint64_t)get_future_extensions(gpu_arch(), mayiuse_systolic()); + + return status::success; +} + +status_t device_info_t::init_attributes(impl::engine_t *engine) { + auto *l0_engine = utils::downcast(engine); + auto device = l0_engine->device(); + + ze_device_properties_t device_properties = {}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + device_properties.pNext = nullptr; + + CHECK(l0::zeDeviceGetProperties(device, &device_properties)); + + eu_count_ = device_properties.numSlices + * device_properties.numSubslicesPerSlice + * device_properties.numEUsPerSubslice; + + ze_device_compute_properties_t device_compute_properties = {}; + device_compute_properties.stype + = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES; + device_compute_properties.pNext = nullptr; + + CHECK(l0::zeDeviceGetComputeProperties(device, &device_compute_properties)); + + max_wg_size_ = device_compute_properties.maxTotalGroupSize; + + uint32_t device_cache_properties_count = 0; + CHECK(l0::zeDeviceGetCacheProperties( + device, &device_cache_properties_count, nullptr)); + + std::vector device_cache_properties( + device_cache_properties_count); + for (ze_device_cache_properties_t &p : device_cache_properties) { + p.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES; + p.pNext = nullptr; + } + + CHECK(l0::zeDeviceGetCacheProperties(device, &device_cache_properties_count, + device_cache_properties.data())); + l3_cache_size_ = device_cache_properties[0].cacheSize; + + ze_device_memory_access_properties_t device_memory_access_properties = {}; + device_memory_access_properties.stype + = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES; + device_memory_access_properties.pNext = nullptr; + + l0::zeDeviceGetMemoryAccessProperties( + device, &device_memory_access_properties); + mayiuse_system_memory_allocators_ + = device_memory_access_properties.sharedSystemAllocCapabilities; + + return status::success; +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/device_info.hpp b/src/gpu/intel/l0/device_info.hpp new file mode 100644 index 00000000000..607ab798a27 --- /dev/null +++ b/src/gpu/intel/l0/device_info.hpp @@ -0,0 +1,44 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_INTEL_L0_DEVICE_INFO_HPP +#define GPU_INTEL_L0_DEVICE_INFO_HPP + +#include "gpu/intel/compute/device_info.hpp" +#include "gpu/intel/l0/utils/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class device_info_t : public compute::device_info_t { +protected: + status_t init_device_name(impl::engine_t *engine) override; + status_t init_arch(impl::engine_t *engine) override; + status_t init_runtime_version(impl::engine_t *engine) override; + status_t init_extensions(impl::engine_t *engine) override; + status_t init_attributes(impl::engine_t *engine) override; +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_DEVICE_INFO_HPP diff --git a/src/gpu/intel/l0/engine.cpp b/src/gpu/intel/l0/engine.cpp new file mode 100644 index 00000000000..043664e132a --- /dev/null +++ b/src/gpu/intel/l0/engine.cpp @@ -0,0 +1,325 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/compiler.hpp" +#include "gpu/intel/l0/device_info.hpp" +#include "gpu/intel/l0/kernel.hpp" +#include "gpu/intel/l0/memory_storage.hpp" +#include "gpu/intel/l0/stream.hpp" + +#include "gemmstone/dsl/runtime.hpp" +#include "gpu/intel/compute/ukernels.hpp" +#include "gpu/intel/jit/generator.hpp" +#include "gpu/intel/microkernels/fuser.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class engine_impl_t : public impl::engine_impl_t { +public: + engine_impl_t(engine_kind_t kind, const ze_driver_handle_t driver, + const ze_device_handle_t device, const ze_context_handle_t context, + size_t index) + : impl::engine_impl_t(kind, runtime_kind::l0, index) + , driver_(driver) + , device_(device) + , context_(context) { + cl_int err; + std::vector ocl_devices; + xpu::ocl::get_devices(&ocl_devices, CL_DEVICE_TYPE_GPU); + + ocl_device_ = nullptr; + ocl_context_ = nullptr; + xpu::device_uuid_t l0_dev_uuid = get_device_uuid(device); + for (const cl_device_id &d : ocl_devices) { + xpu::device_uuid_t ocl_dev_uuid; + xpu::ocl::get_device_uuid(ocl_dev_uuid, d); + if (l0_dev_uuid == ocl_dev_uuid) { + ocl_device_ = xpu::ocl::make_wrapper(d); + ocl_context_ = xpu::ocl::make_wrapper( + xpu::ocl::clCreateContext(nullptr, 1, + &ocl_device_.unwrap(), nullptr, nullptr, &err)); + } + } + } + ~engine_impl_t() override { l0::zeContextDestroy(context_); } + + const ze_driver_handle_t driver() const { return driver_; } + const ze_device_handle_t device() const { return device_; } + const ze_context_handle_t context() const { return context_; } + + const xpu::ocl::wrapper_t ocl_device() const { + return ocl_device_; + } + const 
xpu::ocl::wrapper_t ocl_context() const { + return ocl_context_; + } + + status_t create_stream_impl( + impl::stream_impl_t **stream_impl, unsigned flags) const override { + auto *si = new stream_impl_t(flags, context_, device_); + if (!si) return status::out_of_memory; + + *stream_impl = si; + + return status::success; + } + + status_t create_memory_storage(impl::memory_storage_t **storage, + impl::engine_t *engine, unsigned flags, size_t size, + void *handle) const override { + std::unique_ptr _storage; + _storage.reset( + new memory_storage_t(engine, memory_storage_kind_t::device)); + if (!_storage) return status::out_of_memory; + + status_t status = _storage->init(flags, size, handle); + if (status != status::success) return status; + + *storage = _storage.release(); + + return status::success; + } + + engine_id_t engine_id() const override { + return engine_id_t(new engine_id_impl_t( + device(), context(), kind(), runtime_kind(), index())); + } + + int get_buffer_alignment() const override { return 128; } + +private: + ze_driver_handle_t driver_; + ze_device_handle_t device_; + ze_context_handle_t context_; + + xpu::ocl::wrapper_t ocl_device_; + xpu::ocl::wrapper_t ocl_context_; + + engine_impl_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(engine_impl_t); +}; + +status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind, + const ze_driver_handle_t dri, const ze_device_handle_t dev, + const ze_context_handle_t ctx, size_t index) { + std::unique_ptr e( + (new gpu::intel::l0::engine_t(dri, dev, ctx, index))); + if (!e) return status::out_of_memory; + + CHECK(e->init()); + *engine = e.release(); + + return status::success; +} + +engine_t::engine_t(ze_driver_handle_t driver, ze_device_handle_t device, + ze_context_handle_t context, size_t index) + : gpu::intel::engine_t(new engine_impl_t( + engine_kind::gpu, driver, device, context, index)) {} + +status_t engine_t::init() { + CHECK(init_impl()); + CHECK(gpu::intel::engine_t::init()); + + return status::success; +} + +status_t engine_t::create_stream( + impl::stream_t **stream, impl::stream_impl_t *stream_impl) { + return gpu::intel::l0::stream_t::create_stream(stream, this, stream_impl); +} + +status_t engine_t::create_kernel( + compute::kernel_t *kernel, jit::generator_base_t *jitter) const { + if (kind() != engine_kind::gpu) { + assert(!"not expected"); + return status::invalid_arguments; + } + return jitter->get_kernel(*kernel, this); +} + +status_t engine_t::create_kernel(compute::kernel_t &kernel, + const gemmstone::dsl::kernel_t &kernel_dsl) const { + const auto &module_kernel_pair + = gemmstone::dsl::make_kernel(kernel_dsl, context(), device()); + auto l0_module_ptr + = std::make_shared(module_kernel_pair.first); + + return kernel_t::make(kernel, l0_module_ptr, module_kernel_pair.second, {}); +} + +status_t engine_t::convert_to_l0( + std::vector &kernels, + const std::vector &kernel_names, + xpu::binary_t &binary) const { + ze_module_handle_t l0_module = nullptr; + std::vector l0_kernels; + CHECK(gpu::intel::l0::create_kernels( + device(), context(), kernel_names, binary, &l0_module, l0_kernels)); + auto l0_module_ptr = std::make_shared(l0_module); + + kernels = std::vector(kernel_names.size()); + for (size_t i = 0; i < kernel_names.size(); i++) { + if (!l0_kernels[i]) continue; + CHECK(kernel_t::make( + kernels[i], l0_module_ptr, l0_kernels[i], kernel_names[i])); + } + + return status::success; +} + +status_t engine_t::create_kernels(std::vector *kernels, + const std::vector &kernel_names, + const 
compute::kernel_ctx_t &kernel_ctx) const { + if (kind() != engine_kind::gpu) { + assert(!"not expected"); + return status::invalid_arguments; + } + + const char *source = nullptr; + for (size_t i = 0; source == nullptr && i < kernel_names.size(); i++) + source = intel::get_kernel_source(kernel_names[i]); + + std::string options = kernel_ctx.options(); + auto *dev_info = utils::downcast(device_info()); + options += " " + dev_info->get_cl_ext_options(); + + stringstream_t code_ss; + CHECK(compute::preprocess_headers(code_ss, source, kernel_ctx)); + std::string code = code_ss.str(); + + gpu::intel::compute::program_src_t src(code); + if (src) { options += " -g -s " + std::string(src.name()); } + + compute::debugdump_processed_source( + code, options, dev_info->get_cl_ext_options()); + + const char *code_c = code.c_str(); + xpu::binary_t binary; + if (l0::compile_ocl_module_to_binary( + device(), context(), code, options, binary) + != status::success) + CHECK(ocl_build_kernels( + ocl_device(), ocl_context(), code_c, options.c_str(), binary)); + + if (kernel_ctx.has_custom_headers() && micro::hasMicrokernels(code_c)) { + try { + micro::fuseMicrokernels(binary, code_c); + } catch (...) { return status::runtime_error; } + } + + CHECK(convert_to_l0(*kernels, kernel_names, binary)); + + return status::success; +} + +status_t engine_t::create_kernel_from_binary(compute::kernel_t &kernel, + const xpu::binary_t &binary, const char *kernel_name, + const compute::program_src_t &src) const { + std::vector kernel_names = {kernel_name}; + ze_module_handle_t l0_module = nullptr; + std::vector l0_kernels; + CHECK(gpu::intel::l0::create_kernels( + device(), context(), kernel_names, binary, &l0_module, l0_kernels)); + auto l0_module_ptr = std::make_shared(l0_module); + + CHECK(kernel_t::make(kernel, l0_module_ptr, l0_kernels[0], kernel_name)); + + return status::success; +} + +status_t engine_t::create_kernels_from_cache_blob( + const cache_blob_t &cache_blob, std::vector &kernels, + const std::vector &kernel_names) const { + if (kind() != engine_kind::gpu) { + assert(!"not expected"); + return status::invalid_arguments; + } + + kernels = std::vector(kernel_names.size()); + for (size_t i = 0; i < kernel_names.size(); i++) { + if (!kernel_names[i] && kernel_names.size() > 1) continue; + std::string kernel_name(kernel_names[i] ? 
kernel_names[i] : ""); + + const uint8_t *binary_data = nullptr; + size_t binary_size = 0; + CHECK(cache_blob.get_binary(&binary_data, &binary_size)); + + xpu::binary_t binary(binary_data, binary_data + binary_size); + CHECK(create_kernel_from_binary(kernels[i], binary, kernel_names[i], + gpu::intel::compute::program_src_t())); + } + + return status::success; +} + +gpu_utils::device_id_t engine_t::device_id() const { + return std::tuple_cat( + std::make_tuple(1), gpu::intel::l0::get_device_uuid(device())); +} + +const ze_driver_handle_t engine_t::driver() const { + return static_cast(impl())->driver(); +} + +const ze_device_handle_t engine_t::device() const { + return static_cast(impl())->device(); +} + +const ze_context_handle_t engine_t::context() const { + return static_cast(impl())->context(); +} + +const cl_device_id engine_t::ocl_device() const { + return static_cast(impl())->ocl_device(); +} + +const cl_context engine_t::ocl_context() const { + return static_cast(impl())->ocl_context(); +} + +bool engine_t::mayiuse_microkernels() const { + if (!l0::mayiuse_microkernels(device(), context(), + std::string(compute::cl_microkernels_check_kernel_code))) { + return ocl_mayiuse_microkernels(ocl_device(), ocl_context(), + compute::cl_microkernels_check_kernel_code); + } + return true; +} + +status_t engine_t::init_device_info() { + device_info_ = std::make_shared(); + CHECK(device_info_->init(this)); + + return status::success; +} + +status_t engine_t::init_device_info(const std::vector &cache_blob) { + gpu_assert(false) << "unimplemented function init_device_info() called"; + + return status::runtime_error; +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/engine.hpp b/src/gpu/intel/l0/engine.hpp new file mode 100644 index 00000000000..85bf4fb3538 --- /dev/null +++ b/src/gpu/intel/l0/engine.hpp @@ -0,0 +1,119 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_INTEL_L0_ENGINE_HPP +#define GPU_INTEL_L0_ENGINE_HPP + +#include "gpu/intel/engine.hpp" +#include "gpu/intel/l0/utils/utils.hpp" +#include "xpu/ocl/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +struct engine_id_impl_t : public impl::engine_id_impl_t { + engine_id_impl_t(const ze_device_handle_t device, + const ze_context_handle_t context, engine_kind_t kind, + runtime_kind_t runtime_kind, size_t index) + : impl::engine_id_impl_t(kind, runtime_kind, index) + , device_(device) + , context_(context) {} + ~engine_id_impl_t() override = default; + +private: + bool compare_resource( + const impl::engine_id_impl_t *id_impl) const override { + const auto *typed_id + = utils::downcast(id_impl); + return device_ == typed_id->device_ && context_ == typed_id->context_; + } + + size_t hash_resource() const override { + size_t seed = 0; + seed = hash_combine(seed, device_); + seed = hash_combine(seed, context_); + return seed; + } + + ze_device_handle_t device_; + ze_context_handle_t context_; + + engine_id_impl_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(engine_id_impl_t); +}; + +status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind, + const ze_driver_handle_t dri, const ze_device_handle_t dev, + const ze_context_handle_t ctx, size_t index); + +class engine_t : public intel::engine_t { +public: + engine_t(ze_driver_handle_t driver, ze_device_handle_t device, + ze_context_handle_t context, size_t index); + ~engine_t() override = default; + + status_t init() override; + + status_t create_stream( + impl::stream_t **stream, impl::stream_impl_t *stream_impl) override; + + status_t create_kernel(compute::kernel_t *kernel, + jit::generator_base_t *jitter) const override; + status_t create_kernel(compute::kernel_t &kernel, + const jit::dsl::kernel_t &kernel_ir) const override; + status_t create_kernels(std::vector *kernels, + const std::vector &kernel_names, + const compute::kernel_ctx_t &kernel_ctx) const override; + status_t create_kernel_from_binary(compute::kernel_t &kernel, + const xpu::binary_t &binary, const char *kernel_name, + const compute::program_src_t &src) const override; + status_t create_kernels_from_cache_blob(const cache_blob_t &cache_blob, + std::vector &kernels, + const std::vector &kernel_names) const override; + + gpu::intel::gpu_utils::device_id_t device_id() const override; + + const ze_driver_handle_t driver() const; + const ze_device_handle_t device() const; + const ze_context_handle_t context() const; + + const cl_device_id ocl_device() const; + const cl_context ocl_context() const; + + bool mayiuse_microkernels() const; + +private: + status_t init_device_info() override; + status_t init_device_info(const std::vector &cache_blob) override; + + status_t convert_to_l0(std::vector &kernels, + const std::vector &kernel_names, + xpu::binary_t &binary) const; + + engine_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(engine_t); +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_ENGINE_HPP diff --git a/src/gpu/intel/l0/engine_factory.cpp b/src/gpu/intel/l0/engine_factory.cpp new file mode 100644 index 00000000000..e4a3b67d659 --- /dev/null +++ b/src/gpu/intel/l0/engine_factory.cpp @@ -0,0 +1,88 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under 
the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/intel/l0/engine_factory.hpp" +#include "gpu/intel/l0/engine.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +engine_factory_t::engine_factory_t(engine_kind_t engine_kind) + : engine_kind_(engine_kind) { + assert(utils::one_of(engine_kind_, engine_kind::gpu)); +} + +size_t engine_factory_t::count() const { + uint32_t driver_count = 0; + l0::zeDriverGet(&driver_count, nullptr); + + std::vector drivers(driver_count); + l0::zeDriverGet(&driver_count, drivers.data()); + + uint32_t device_count = 0; + l0::zeDeviceGet(drivers[0], &device_count, nullptr); + + return device_count; +} + +status_t engine_factory_t::engine_create( + impl::engine_t **engine, size_t index) const { + ze_driver_handle_t driver = nullptr; + ze_device_handle_t device = nullptr; + ze_context_handle_t context = nullptr; + + uint32_t driver_count = 0; + CHECK(l0::zeDriverGet(&driver_count, nullptr)); + + std::vector drivers(driver_count); + CHECK(l0::zeDriverGet(&driver_count, drivers.data())); + driver = drivers[0]; + + uint32_t device_count = 0; + CHECK(l0::zeDeviceGet(driver, &device_count, nullptr)); + VERROR_ENGINE(index < device_count, status::invalid_arguments, + "asked for device %zu but only %u devices are found", index, + device_count); + + std::vector devices(device_count); + CHECK(l0::zeDeviceGet(driver, &device_count, devices.data())); + device = devices[index]; + + ze_context_desc_t context_desc = {}; + context_desc.stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC; + context_desc.pNext = nullptr; + context_desc.flags = 0; + + CHECK(l0::zeContextCreate(driver, &context_desc, &context)); + + return engine_create(engine, driver, device, context, index); +} + +status_t engine_factory_t::engine_create(impl::engine_t **engine, + const ze_driver_handle_t driver, const ze_device_handle_t device, + const ze_context_handle_t context, size_t index) const { + return gpu::intel::l0::engine_create( + engine, engine_kind_, driver, device, context, index); +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/engine_factory.hpp b/src/gpu/intel/l0/engine_factory.hpp new file mode 100644 index 00000000000..485eb7c8da1 --- /dev/null +++ b/src/gpu/intel/l0/engine_factory.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_L0_ENGINE_FACTORY_HPP +#define GPU_INTEL_L0_ENGINE_FACTORY_HPP + +#include "common/engine.hpp" +#include "gpu/intel/l0/utils/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class engine_factory_t : public impl::engine_factory_t { +public: + engine_factory_t(engine_kind_t engine_kind); + ~engine_factory_t() override = default; + + size_t count() const override; + status_t engine_create( + impl::engine_t **engine, size_t index) const override; + status_t engine_create(impl::engine_t **engine, + const ze_driver_handle_t adriver, const ze_device_handle_t adevice, + const ze_context_handle_t acontext, size_t index) const; + +private: + engine_kind_t engine_kind_; + + engine_factory_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(engine_factory_t); +}; + +inline std::unique_ptr get_engine_factory( + engine_kind_t engine_kind) { + return std::unique_ptr(new engine_factory_t(engine_kind)); +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_ENGINE_FACTORY_HPP diff --git a/src/gpu/intel/l0/kernel.cpp b/src/gpu/intel/l0/kernel.cpp new file mode 100644 index 00000000000..cb879a5eb13 --- /dev/null +++ b/src/gpu/intel/l0/kernel.cpp @@ -0,0 +1,215 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/intel/l0/kernel.hpp" +#include "gpu/intel/l0/context.hpp" +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/memory_storage.hpp" +#include "gpu/intel/l0/stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +// This class is to get around std::make_shared requirement to have a public +// constructor. We keep the original constructor as private but expose it here +// to use with std::make_shared. +class kernel_compat_t : public kernel_t { +public: + template + kernel_compat_t(Args &&...args) : kernel_t(std::forward(args)...) 
{} +}; + +status_t kernel_t::make(compute::kernel_t &compute_kernel, + const std::shared_ptr module_ptr, + const ze_kernel_handle_t kernel_ptr, const std::string &kernel_name) { + compute_kernel = compute::kernel_t(std::make_shared( + module_ptr, kernel_ptr, kernel_name)); + return status::success; +} + +kernel_t::kernel_t(const std::shared_ptr module_ptr, + const ze_kernel_handle_t kernel_ptr, const std::string &kernel_name) + : module_(module_ptr), kernel_(kernel_ptr), kernel_name_(kernel_name) {} + +kernel_t::~kernel_t() { + l0::zeKernelDestroy(kernel_); +} + +status_t kernel_t::check_alignment( + const compute::kernel_arg_list_t &arg_list) const { + for (int i = 0; i < arg_list.nargs(); ++i) { + auto &arg = arg_list.get(i); + if (!arg.is_global()) continue; + + auto *mem_storage = static_cast(arg.value()); + if (!*mem_storage) continue; + + CHECK(compute::kernel_impl_t::check_alignment( + mem_storage->data_handle(), i)); + } + + return status::success; +} + +status_t kernel_t::set_arg( + int arg_index, size_t arg_size, const void *arg_value) const { + return l0::zeKernelSetArgumentValue( + kernel_, arg_index, arg_size, arg_value); +} + +status_t kernel_t::parallel_for(impl::stream_t &stream, + const compute::nd_range_t &range, + const compute::kernel_arg_list_t &arg_list, const xpu::event_t &deps, + xpu::event_t &out_dep) { + CHECK(check_scalar_arguments(arg_list)); + CHECK(check_alignment(arg_list)); + + auto l0_stream = utils::downcast(&stream); + auto l0_engine = l0_stream->l0_engine(); + auto l0_device_info = l0_engine->device_info(); + + const size_t pointer_size = l0_device_info->device_address_bits() / 8; + + size_t param_bytes = 0; + for (int i = 0; i < arg_list.nargs(); ++i) { + auto &arg = arg_list.get(i); + if (arg.is_global()) { + auto *mem_storage + = static_cast(arg.value()); + if (!mem_storage->is_null()) { + auto memory_storage_ctx + = utils::downcast(mem_storage->engine()) + ->context(); + if (l0_engine->context() != memory_storage_ctx) { + VERROR(primitive, gpu, + "mismatched Level Zero context for " + "primitive/memory"); + return status::invalid_arguments; + } + + void *ptr = mem_storage->ptr(); + CHECK(set_arg(i, pointer_size, &ptr)); + param_bytes += pointer_size; + } else { + CHECK(set_arg(i, pointer_size, nullptr)); + param_bytes += pointer_size; + } + } else if (arg.is_local()) { + CHECK(set_arg(i, arg.size(), arg.value())); + param_bytes += pointer_size; + } else { + CHECK(set_arg(i, arg.size(), arg.value())); + param_bytes += arg.size(); + } + } + if (param_bytes > l0_device_info->max_kernel_param_size()) { + VERROR(primitive, gpu, + "parameter bytes requirements greater than device supports"); + return status::invalid_arguments; + } + + if (range.is_zero()) { return status::success; } + + std::vector global_size(3, 1); + switch (range.global_range().ndims()) { + case 3: global_size[2] = static_cast(range.global_range()[2]); + case 2: global_size[1] = static_cast(range.global_range()[1]); + case 1: + global_size[0] = static_cast(range.global_range()[0]); + break; + default: + VERROR(primitive, gpu, + "incorrect number of global range dimensions"); + return status::invalid_arguments; + } + + std::vector group_size(3, 1); + if (range.local_range()) { + switch (range.local_range().ndims()) { + case 3: + group_size[2] = static_cast(range.local_range()[2]); + case 2: + group_size[1] = static_cast(range.local_range()[1]); + case 1: + group_size[0] = static_cast(range.local_range()[0]); + break; + default: + VERROR(primitive, gpu, + "incorrect number of local range 
dimensions"); + return status::invalid_arguments; + } + } else { + CHECK(l0::zeKernelSuggestGroupSize(kernel_, global_size[0], + global_size[1], global_size[2], &group_size[0], &group_size[1], + &group_size[2])); + } + + for (size_t i = 0; i < global_size.size(); i++) { + if (global_size[i] % group_size[i] != 0) { + VERROR(primitive, gpu, "only uniform work-groups are supported"); + return status::invalid_arguments; + } + } + + CHECK(l0::zeKernelSetGroupSize( + kernel_, group_size[0], group_size[1], group_size[2])); + ze_group_count_t group_count = {global_size[0] / group_size[0], + global_size[1] / group_size[1], global_size[2] / group_size[2]}; + + std::vector l0_deps + = utils::downcast(&deps)->events_; + std::vector l0_out_deps + = utils::downcast(&out_dep)->events_; + + event_ = l0_stream->create_event(); + ze_event_handle_t out_event = *(event_.get()); + + CHECK(l0::zeCommandListAppendLaunchKernel(l0_stream->list(), kernel_, + &group_count, out_event, static_cast(l0_deps.size()), + l0_deps.size() ? l0_deps.data() : nullptr)); + + if (out_event) l0_out_deps.push_back(out_event); + if (stream.is_profiling_enabled()) { + l0_stream->profiler().register_event( + utils::make_unique(std::move(out_event))); + } + + return status::success; +} + +status_t kernel_t::get_kernel_binary(xpu::binary_t &binary) const { + return l0::get_kernel_binary(kernel_, binary); +} + +std::string kernel_t::name() const { + return kernel_name_; +} + +status_t kernel_t::dump() const { + xpu::binary_t binary; + CHECK(get_kernel_binary(binary)); + + return gpu_utils::dump_kernel_binary(binary, kernel_name_); +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/kernel.hpp b/src/gpu/intel/l0/kernel.hpp new file mode 100644 index 00000000000..0a6ef6631cd --- /dev/null +++ b/src/gpu/intel/l0/kernel.hpp @@ -0,0 +1,76 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_INTEL_L0_KERNEL_HPP +#define GPU_INTEL_L0_KERNEL_HPP + +#include + +#include "common/rw_mutex.hpp" +#include "gpu/intel/compute/kernel.hpp" +#include "gpu/intel/l0/utils/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class kernel_t : public compute::kernel_impl_t { +public: + static status_t make(compute::kernel_t &compute_kernel, + const std::shared_ptr module_ptr, + const ze_kernel_handle_t kernel_ptr, + const std::string &kernel_name); + ~kernel_t() override; + + status_t check_alignment( + const compute::kernel_arg_list_t &arg_list) const override; + status_t set_arg( + int arg_index, size_t arg_size, const void *arg_value) const; + status_t parallel_for(impl::stream_t &stream, + const compute::nd_range_t &range, + const compute::kernel_arg_list_t &arg_list, + const xpu::event_t &deps, xpu::event_t &out_dep) override; + + status_t get_kernel_binary(xpu::binary_t &binary) const override; + std::string name() const override; + status_t dump() const override; + +private: + friend class kernel_compat_t; + kernel_t(const std::shared_ptr module_ptr, + const ze_kernel_handle_t kernel_ptr, + const std::string &kernel_name); + + std::shared_ptr module_; + ze_kernel_handle_t kernel_; + std::string kernel_name_; + + std::shared_ptr event_pool_; + std::shared_ptr event_; + + kernel_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(kernel_t); +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_KERNEL_HPP diff --git a/src/gpu/intel/l0/memory_storage.cpp b/src/gpu/intel/l0/memory_storage.cpp new file mode 100644 index 00000000000..e2ff707dc6d --- /dev/null +++ b/src/gpu/intel/l0/memory_storage.cpp @@ -0,0 +1,215 @@ +/******************************************************************************* +* Copyright 2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "gpu/intel/l0/memory_storage.hpp" +#include "common/memory_map_manager.hpp" +#include "gpu/intel/l0/engine.hpp" +#include "gpu/intel/l0/stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +status_t memory_storage_t::get_data_handle(void **handle) const { + *handle = ptr_.get(); + + return status::success; +} + +status_t memory_storage_t::set_data_handle(void *handle) { + ptr_ = decltype(ptr_)(handle, [](void *) {}); + kind_ = get_memory_storage_kind( + get_pointer_type(l0_engine()->context(), handle)); + + return status::success; +} + +bool memory_storage_t::is_host_accessible() const { + return utils::one_of(kind_, memory_storage_kind_t::host, + memory_storage_kind_t::shared, memory_storage_kind_t::unknown); +} + +struct map_usm_tag; + +status_t memory_storage_t::map_data( + void **mapped_ptr, impl::stream_t *stream, size_t size) const { + if (is_host_accessible()) { + *mapped_ptr = ptr(); + return status::success; + } + + if (!ptr() || size == 0) { + *mapped_ptr = nullptr; + return status::success; + } + + if (!stream) CHECK(engine()->get_service_stream(stream)); + + void *host_ptr = malloc_host(size); + if (!host_ptr) return status::out_of_memory; + + auto leak_guard = decltype(ptr_)(host_ptr, [this](void *p) { free(p); }); + CHECK(memcpy(stream, host_ptr, ptr(), size)); + CHECK(stream->wait()); + leak_guard.release(); + + auto *usm_ptr_for_unmap = ptr(); + auto unmap_callback = [size, usm_ptr_for_unmap, this]( + impl::stream_t *stream, void *mapped_ptr) { + CHECK(memcpy(stream, usm_ptr_for_unmap, mapped_ptr, size)); + CHECK(stream->wait()); + free(mapped_ptr); + + return status::success; + }; + + auto &map_manager = memory_map_manager_t::instance(); + + *mapped_ptr = host_ptr; + + return map_manager.map(this, stream, *mapped_ptr, unmap_callback); +} + +status_t memory_storage_t::unmap_data( + void *mapped_ptr, impl::stream_t *stream) const { + if (!mapped_ptr || is_host_accessible()) return status::success; + + if (!stream) CHECK(engine()->get_service_stream(stream)); + + auto &map_manager = memory_map_manager_t::instance(); + + return map_manager.unmap(this, stream, mapped_ptr); +} + +std::unique_ptr memory_storage_t::get_sub_storage( + size_t offset, size_t size) const { + void *sub_ptr + = ptr_ ? reinterpret_cast(ptr_.get()) + offset : nullptr; + + auto storage = utils::make_unique(engine(), kind_); + if (!storage) return nullptr; + + auto status = storage->init(memory_flags_t::use_runtime_ptr, size, sub_ptr); + if (status != status::success) return nullptr; + + // XXX: Clang has a bug that prevents implicit conversion. + return std::unique_ptr(storage.release()); +} + +std::unique_ptr memory_storage_t::clone() const { + auto storage = utils::make_unique(engine(), kind_); + if (!storage) return nullptr; + + auto status = storage->init(memory_flags_t::use_runtime_ptr, 0, nullptr); + if (status != status::success) return nullptr; + + storage->ptr_ = decltype(ptr_)(ptr_.get(), [](void *) {}); + storage->kind_ = kind_; + + // XXX: Clang has a bug that prevents implicit conversion. 
+ return std::unique_ptr(storage.release()); +} + +status_t memory_storage_t::init_allocate(size_t size) { + if (kind_ == memory_storage_kind_t::unknown) + kind_ = memory_storage_kind_t::device; + + void *ptr_alloc = nullptr; + + switch (kind_) { + case memory_storage_kind_t::host: ptr_alloc = malloc_host(size); break; + case memory_storage_kind_t::device: + ptr_alloc = malloc_device(size); + break; + case memory_storage_kind_t::shared: + ptr_alloc = malloc_shared(size); + break; + default: break; + } + if (!ptr_alloc) return status::out_of_memory; + + ptr_ = decltype(ptr_)(ptr_alloc, [&](void *ptr) { free(ptr); }); + + return status::success; +} + +void *memory_storage_t::malloc_host(size_t size) const { + void *pptr = nullptr; + + ze_host_mem_alloc_desc_t host_mem_alloc_desc = {}; + host_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; + host_mem_alloc_desc.pNext = nullptr; + host_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW; + + l0::zeMemAllocHost( + l0_engine()->context(), &host_mem_alloc_desc, size, 0, &pptr); + + return pptr; +} + +void *memory_storage_t::malloc_device(size_t size) const { + void *pptr = nullptr; + + ze_device_mem_alloc_desc_t device_mem_alloc_desc = {}; + device_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + device_mem_alloc_desc.pNext = nullptr; + device_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW; + device_mem_alloc_desc.ordinal = 0; + + l0::zeMemAllocDevice(l0_engine()->context(), &device_mem_alloc_desc, size, + 0, l0_engine()->device(), &pptr); + + return pptr; +} + +void *memory_storage_t::malloc_shared(size_t size) const { + void *pptr = nullptr; + + ze_device_mem_alloc_desc_t device_mem_alloc_desc = {}; + device_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + device_mem_alloc_desc.pNext = nullptr; + device_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW; + device_mem_alloc_desc.ordinal = 0; + + ze_host_mem_alloc_desc_t host_mem_alloc_desc = {}; + host_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; + host_mem_alloc_desc.pNext = nullptr; + host_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW; + + l0::zeMemAllocShared(l0_engine()->context(), &device_mem_alloc_desc, + &host_mem_alloc_desc, size, 0, l0_engine()->device(), &pptr); + + return pptr; +} + +void memory_storage_t::free(void *ptr) const { + l0::zeMemFree(l0_engine()->context(), ptr); +} + +status_t memory_storage_t::memcpy( + impl::stream_t *stream, void *dst, const void *src, size_t size) const { + auto *l0_stream = utils::downcast(stream); + return l0::zeCommandListAppendMemoryCopy( + l0_stream->list(), dst, src, size, nullptr, 0, nullptr); +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/memory_storage.hpp b/src/gpu/intel/l0/memory_storage.hpp new file mode 100644 index 00000000000..ea994e9ce4f --- /dev/null +++ b/src/gpu/intel/l0/memory_storage.hpp @@ -0,0 +1,91 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_L0_MEMORY_STORAGE_HPP +#define GPU_INTEL_L0_MEMORY_STORAGE_HPP + +#include + +#include "common/c_types_map.hpp" +#include "common/memory_storage.hpp" +#include "common/utils.hpp" + +#include "gpu/intel/l0/engine.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +enum class memory_storage_kind_t { unknown, host, device, shared }; +inline memory_storage_kind_t get_memory_storage_kind(ze_memory_type_t type) { + switch (type) { + case ZE_MEMORY_TYPE_HOST: return memory_storage_kind_t::host; + case ZE_MEMORY_TYPE_DEVICE: return memory_storage_kind_t::device; + case ZE_MEMORY_TYPE_SHARED: return memory_storage_kind_t::shared; + default: return memory_storage_kind_t::unknown; + } +} + +class memory_storage_t : public impl::memory_storage_t { +public: + memory_storage_t(impl::engine_t *engine, memory_storage_kind_t kind) + : impl::memory_storage_t(engine), kind_(kind) {} + + void *ptr() const { return ptr_.get(); } + + status_t get_data_handle(void **handle) const override; + status_t set_data_handle(void *handle) override; + + bool is_host_accessible() const override; + + status_t map_data(void **mapped_ptr, impl::stream_t *stream, + size_t size) const override; + status_t unmap_data( + void *mapped_ptr, impl::stream_t *stream) const override; + + std::unique_ptr get_sub_storage( + size_t offset, size_t size) const override; + std::unique_ptr clone() const override; + +private: + status_t init_allocate(size_t size) override; + + gpu::intel::l0::engine_t *l0_engine() const { + return utils::downcast(engine()); + } + + void *malloc_host(size_t size) const; + void *malloc_device(size_t size) const; + void *malloc_shared(size_t size) const; + void free(void *ptr) const; + status_t memcpy(impl::stream_t *stream, void *dst, const void *src, + size_t size) const; + + std::unique_ptr> ptr_; + memory_storage_kind_t kind_ = memory_storage_kind_t::unknown; + + DNNL_DISALLOW_COPY_AND_ASSIGN(memory_storage_t); +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_MEMORY_STORAGE_HPP diff --git a/src/gpu/intel/l0/stream.cpp b/src/gpu/intel/l0/stream.cpp new file mode 100644 index 00000000000..72392751570 --- /dev/null +++ b/src/gpu/intel/l0/stream.cpp @@ -0,0 +1,233 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#include "gpu/intel/l0/stream.hpp"
+#include "gpu/intel/l0/engine.hpp"
+#include "gpu/intel/l0/stream_profiler.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace l0 {
+
+status_t stream_t::create_stream(impl::stream_t **stream,
+        impl::engine_t *engine, impl::stream_impl_t *stream_impl) {
+    std::unique_ptr<intel::l0::stream_t> s(
+            new intel::l0::stream_t(engine, stream_impl));
+    if (!s) return status::out_of_memory;
+
+    *stream = s.release();
+
+    return status::success;
+}
+
+stream_t::stream_t(impl::engine_t *engine, impl::stream_impl_t *stream_impl)
+    : gpu::intel::stream_t(engine, stream_impl) {
+    if (is_profiling_enabled()) {
+        ze_device_properties_t device_properties = {};
+        device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2;
+        device_properties.pNext = nullptr;
+
+        l0::zeDeviceGetProperties(
+                utils::downcast<engine_t *>(engine)->device(),
+                &device_properties);
+        profiler_ = utils::make_unique<stream_profiler_t>(this,
+                1e9 / device_properties.timerResolution,
+                ~(-1L << device_properties.kernelTimestampValidBits));
+    }
+}
+
+void stream_t::before_exec_hook() {
+    if (is_profiling_enabled()) profiler_->start_profiling();
+}
+
+void stream_t::after_exec_hook() {
+    l0_ctx().set_deps(event_t());
+
+    if (is_profiling_enabled()) profiler_->stop_profiling();
+}
+
+status_t stream_t::reset_profiling() {
+    if (!is_profiling_enabled()) return status::invalid_arguments;
+
+    profiler_->reset();
+
+    return status::success;
+}
+
+status_t stream_t::get_profiling_data(profiling_data_kind_t data_kind,
+        int *num_entries, uint64_t *data) const {
+    if (!is_profiling_enabled()) return status::invalid_arguments;
+
+    return profiler_->get_info(data_kind, num_entries, data);
+}
+
+stream_impl_t::stream_impl_t(unsigned flags, ze_command_list_handle_t list)
+    : impl::stream_impl_t(flags)
+    , allocated_(false)
+    , list_(list)
+    , event_pool_(nullptr) {
+    l0::zeCommandListGetContextHandle(list_, &context_);
+    if (flags & stream_flags::out_of_order || is_profiling_enabled())
+        create_event_pool();
+}
+
+stream_impl_t::stream_impl_t(
+        unsigned flags, ze_context_handle_t context, ze_device_handle_t device)
+    : impl::stream_impl_t(flags)
+    , context_(context)
+    , allocated_(true)
+    , event_pool_(nullptr) {
+    ze_command_queue_desc_t command_queue_desc = {};
+    command_queue_desc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
+    command_queue_desc.pNext = nullptr;
+    command_queue_desc.ordinal = 0;
+    command_queue_desc.index = 0;
+    command_queue_desc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
+    command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT;
+    command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
+
+    l0::zeCommandListCreateImmediate(
+            context_, device, &command_queue_desc, &list_);
+
+    if (flags & stream_flags::out_of_order || is_profiling_enabled())
+        create_event_pool();
+}
+
+void stream_impl_t::create_event_pool() {
+    ze_event_pool_desc_t event_pool_desc = {};
+    event_pool_desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
+    event_pool_desc.pNext = nullptr;
+    event_pool_desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+    if (is_profiling_enabled())
+        event_pool_desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
+    event_pool_desc.count = 16384;
+
+    ze_event_pool_handle_t event_pool;
+    l0::zeEventPoolCreate(context_, &event_pool_desc, 0, nullptr, &event_pool);
+    event_pool_ = std::make_shared<event_pool_wrapper_t>(event_pool);
+}
+
+stream_impl_t::~stream_impl_t() {
+    wait();
+    if (allocated_) l0::zeCommandListDestroy(list_);
+}
+
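+// Context and event handling:
+// - ctx_ is kept in thread-local storage, so each application thread that
+//   submits work through this stream tracks its own dependency events.
+// - Events are only required for out-of-order execution or when profiling is
+//   enabled; they come from the single host-visible pool created in
+//   create_event_pool() and are retained in events_ for the lifetime of the
+//   stream. For in-order streams without profiling, create_event() returns an
+//   empty wrapper and synchronization relies on the in-order immediate
+//   command list alone.
+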
+xpu::context_t &stream_impl_t::ctx() {
+    return l0_ctx();
+}
+
+const xpu::context_t &stream_impl_t::ctx() const {
+    return l0_ctx();
+}
+
+context_t &stream_impl_t::l0_ctx() {
+    const context_t &ctx
+            = const_cast<const stream_impl_t *>(this)->l0_ctx();
+    return *const_cast<context_t *>(&ctx);
+}
+
+const context_t &stream_impl_t::l0_ctx() const {
+    static context_t empty_ctx;
+    return ctx_.get(empty_ctx);
+}
+
+ze_event_handle_t stream_impl_t::get_output_event() const {
+    auto &deps = event_t::from(ctx().get_deps()).events_;
+    if (deps.size()) return deps[0];
+
+    return nullptr;
+}
+
+std::shared_ptr<event_wrapper_t> stream_impl_t::create_event() {
+    if (!event_pool_.get()) return std::make_shared<event_wrapper_t>(nullptr);
+
+    ze_event_desc_t event_desc = {};
+    event_desc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
+    event_desc.pNext = nullptr;
+    event_desc.index = static_cast<uint32_t>(events_.size());
+    event_desc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+    event_desc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
+
+    ze_event_handle_t event;
+    l0::zeEventCreate(*(event_pool_.get()), &event_desc, &event);
+
+    std::shared_ptr<event_wrapper_t> event_ptr
+            = std::make_shared<event_wrapper_t>(event);
+    events_.push_back(event_ptr);
+
+    return event_ptr;
+}
+
+std::shared_ptr<event_pool_wrapper_t> stream_impl_t::get_event_pool() {
+    return event_pool_;
+}
+
+ze_command_list_handle_t stream_impl_t::list() {
+    return list_;
+}
+
+status_t stream_impl_t::wait() {
+    CHECK(l0::zeCommandListHostSynchronize(list_, UINT64_MAX));
+
+    return status::success;
+}
+
+status_t stream_impl_t::barrier() {
+    CHECK(l0::zeCommandListAppendBarrier(list_, nullptr, 0, nullptr));
+
+    return status::success;
+}
+
+status_t stream_impl_t::copy(const impl::memory_storage_t &src,
+        const impl::memory_storage_t &dst, size_t size,
+        const xpu::event_t &deps, xpu::event_t &out_dep) {
+    if (size == 0) return status::success;
+    std::vector<ze_event_handle_t> l0_deps
+            = utils::downcast<const event_t *>(&deps)->events_;
+
+    ze_event_handle_t out_event = *(create_event().get());
+    CHECK(l0::zeCommandListAppendMemoryCopy(list_, dst.data_handle(),
+            src.data_handle(), size, out_event,
+            static_cast<uint32_t>(l0_deps.size()),
+            l0_deps.size() ? l0_deps.data() : nullptr));
+    if (out_event)
+        utils::downcast<event_t *>(&out_dep)->events_.push_back(out_event);
+
+    return status::success;
+}
+
+status_t stream_impl_t::fill(const impl::memory_storage_t &dst, uint8_t pattern,
+        size_t size, const xpu::event_t &deps, xpu::event_t &out_dep) {
+    if (size == 0) return status::success;
+    std::vector<ze_event_handle_t> l0_deps
+            = utils::downcast<const event_t *>(&deps)->events_;
+
+    ze_event_handle_t out_event = *(create_event().get());
+    CHECK(l0::zeCommandListAppendMemoryFill(list_, dst.data_handle(), &pattern,
+            sizeof(pattern), size, out_event,
+            static_cast<uint32_t>(l0_deps.size()),
+            l0_deps.size() ? l0_deps.data() : nullptr));
+    if (out_event)
+        utils::downcast<event_t *>(&out_dep)->events_.push_back(out_event);
+
+    return status::success;
+}
+
+} // namespace l0
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/gpu/intel/l0/stream.hpp b/src/gpu/intel/l0/stream.hpp
new file mode 100644
index 00000000000..f3e70457520
--- /dev/null
+++ b/src/gpu/intel/l0/stream.hpp
@@ -0,0 +1,136 @@
+/*******************************************************************************
+* Copyright 2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_L0_STREAM_HPP +#define GPU_INTEL_L0_STREAM_HPP + +#include + +#include "common/thread_local_storage.hpp" +#include "gpu/intel/l0/context.hpp" +#include "gpu/intel/l0/utils/utils.hpp" +#include "gpu/intel/stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class stream_impl_t : public impl::stream_impl_t { +public: + stream_impl_t(unsigned flags, ze_command_list_handle_t list); + stream_impl_t(unsigned flags, ze_context_handle_t context, + ze_device_handle_t device); + ~stream_impl_t(); + + context_t &l0_ctx(); + const context_t &l0_ctx() const; + xpu::context_t &ctx(); + const xpu::context_t &ctx() const; + ze_event_handle_t get_output_event() const; + std::shared_ptr create_event(); + std::shared_ptr get_event_pool(); + + ze_command_list_handle_t list(); + + status_t wait(); + status_t barrier(); + + status_t copy(const impl::memory_storage_t &src, + const impl::memory_storage_t &dst, size_t size, + const xpu::event_t &deps, xpu::event_t &out_dep); + status_t fill(const impl::memory_storage_t &dst, uint8_t pattern, + size_t size, const xpu::event_t &deps, xpu::event_t &out_dep); + +private: + void create_event_pool(); + + ze_context_handle_t context_; + bool allocated_; + ze_command_list_handle_t list_; + + std::shared_ptr event_pool_; + std::list> events_; + + mutable utils::thread_local_storage_t ctx_; + + stream_impl_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(stream_impl_t); +}; + +class stream_t : public intel::stream_t { +public: + static status_t create_stream(impl::stream_t **stream, + impl::engine_t *engine, impl::stream_impl_t *stream_impl); + + stream_impl_t *impl() const { + return static_cast(impl::stream_t::impl_.get()); + } + + engine_t *l0_engine() const { + return utils::downcast(engine()); + } + + context_t &l0_ctx() { return impl()->l0_ctx(); } + const context_t &l0_ctx() const { return impl()->l0_ctx(); } + xpu::context_t &ctx() override { return impl()->ctx(); } + const xpu::context_t &ctx() const override { return impl()->ctx(); } + ze_event_handle_t get_output_event() const { + return impl()->get_output_event(); + } + std::shared_ptr create_event() { + return impl()->create_event(); + } + std::shared_ptr get_event_pool() { + return impl()->get_event_pool(); + } + + const ze_command_list_handle_t list() const { return impl()->list(); } + + status_t wait() override { return impl()->wait(); } + status_t barrier() override { return impl()->barrier(); } + + void before_exec_hook() override; + void after_exec_hook() override; + status_t reset_profiling() override; + status_t get_profiling_data(profiling_data_kind_t data_kind, + int *num_entries, uint64_t *data) const override; + + status_t copy(const impl::memory_storage_t &src, + const impl::memory_storage_t &dst, size_t size, + const xpu::event_t &deps, xpu::event_t &out_dep) override { + return impl()->copy(src, dst, size, deps, out_dep); + } + status_t fill(const impl::memory_storage_t &dst, uint8_t pattern, + size_t size, const xpu::event_t &deps, + 
xpu::event_t &out_dep) override { + return impl()->fill(dst, pattern, size, deps, out_dep); + } + +private: + stream_t(impl::engine_t *engine, impl::stream_impl_t *stream_impl); + stream_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(stream_t); +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_STREAM_HPP diff --git a/src/gpu/intel/l0/stream_profiler.hpp b/src/gpu/intel/l0/stream_profiler.hpp new file mode 100644 index 00000000000..f872732b41c --- /dev/null +++ b/src/gpu/intel/l0/stream_profiler.hpp @@ -0,0 +1,140 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_L0_STREAM_PROFILER_HPP +#define GPU_INTEL_L0_STREAM_PROFILER_HPP + +#include + +#include "gpu/intel/l0/context.hpp" +#include "xpu/stream_profiler.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +class stream_profiler_t : public xpu::stream_profiler_t { +public: + class entry_t { + public: + entry_t() = delete; + + entry_t(ze_kernel_timestamp_result_t &kernel_timestamp_result, + uint64_t max_timestamp_value, double timestamp_freq) + : context_(get_timestamp( + kernel_timestamp_result.context, max_timestamp_value)) + , freq_(timestamp_freq) {} + + uint64_t get_cycles() const { return context_; } + + uint64_t get_nsec() const { return get_cycles() * freq_; } + + private: + uint64_t get_timestamp( + ze_kernel_timestamp_data_t &ts, uint64_t max_timestamp_value) { + return (ts.kernelEnd >= ts.kernelStart) + ? 
(ts.kernelEnd - ts.kernelStart) + : ((max_timestamp_value - ts.kernelStart) + ts.kernelEnd + + 1); + } + + uint64_t context_; + double freq_; + }; + + stream_profiler_t(const impl::stream_t *stream, double timestamp_freq, + uint64_t max_timestamp_value) + : xpu::stream_profiler_t(stream) + , timestamp_freq_(timestamp_freq) + , max_timestamp_value_(max_timestamp_value) {} + + status_t get_info(profiling_data_kind_t data_kind, int *num_entries, + uint64_t *data) const override { + if (!num_entries) return status::invalid_arguments; + + bool is_per_kernel + = (data_kind == profiling_data_kind::time_per_kernel); + if (!data) { + if (is_per_kernel) { + *num_entries = (int)events_.size(); + return status::success; + } + std::unordered_set seen; + for (auto &ev : events_) + seen.insert(ev.stamp); + *num_entries = (int)seen.size(); + return status::success; + } + + std::map stamp2entry; + int idx = 0; + for (auto &ev : events_) { + const l0::event_t &l0_event + = *utils::downcast(ev.event.get()); + + ze_kernel_timestamp_result_t kernel_timestamp_result; + CHECK(l0::zeEventQueryKernelTimestamp( + l0_event[0], &kernel_timestamp_result)); + + entry_t entry(kernel_timestamp_result, max_timestamp_value_, + timestamp_freq_); + if (is_per_kernel) { + data[idx++] = entry.get_nsec(); + continue; + } + stamp2entry.emplace(ev.stamp, entry); + } + if (is_per_kernel) return status::success; + + return get_info_impl(stamp2entry, data_kind, data); + } + +private: + stream_profiler_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(stream_profiler_t); + + status_t get_info_impl(const std::map &stamp2entry, + profiling_data_kind_t data_kind, uint64_t *data) const { + int idx = 0; + for (auto &kv : stamp2entry) { + auto &e = kv.second; + switch ((int)data_kind) { + case profiling_data_kind::time: data[idx] = e.get_nsec(); break; + case profiling_data_kind::cycles: { + data[idx] = e.get_cycles(); + if (callback_) callback_(kv.first, e.get_nsec()); + break; + } + default: assert(!"unexpected data kind"); + } + idx++; + } + return status::success; + } + + double timestamp_freq_; + uint64_t max_timestamp_value_; +}; + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_STREAM_PROFILER_HPP diff --git a/src/gpu/intel/l0/utils/CMakeLists.txt b/src/gpu/intel/l0/utils/CMakeLists.txt new file mode 100644 index 00000000000..a4bd6d8c3a5 --- /dev/null +++ b/src/gpu/intel/l0/utils/CMakeLists.txt @@ -0,0 +1,24 @@ +#=============================================================================== +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +file(GLOB_RECURSE SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp + ) + +set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_l0_utils) +add_library(${OBJ_LIB} OBJECT ${SOURCES}) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/gpu/intel/l0/utils/utils.cpp b/src/gpu/intel/l0/utils/utils.cpp new file mode 100644 index 00000000000..2c2af1ccdd4 --- /dev/null +++ b/src/gpu/intel/l0/utils/utils.cpp @@ -0,0 +1,408 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/intel/l0/utils/utils.hpp" +#include "gpu/intel/jit/binary_format.hpp" +#include "gpu/intel/jit/utils/type_bridge.hpp" +#include "ngen_level_zero.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +event_wrapper_t::event_wrapper_t(ze_event_handle_t event) : event_(event) {} + +event_wrapper_t::~event_wrapper_t() { + if (event_) { + l0::zeEventHostSynchronize(event_, UINT64_MAX); + l0::zeEventDestroy(event_); + } +} + +event_wrapper_t::operator ze_event_handle_t() const { + return event_; +} + +event_pool_wrapper_t::event_pool_wrapper_t(ze_event_pool_handle_t event_pool) + : event_pool_(event_pool) {} + +event_pool_wrapper_t::~event_pool_wrapper_t() { + if (event_pool_) l0::zeEventPoolDestroy(event_pool_); +} + +event_pool_wrapper_t::operator ze_event_pool_handle_t() const { + return event_pool_; +} + +module_wrapper_t::module_wrapper_t(ze_module_handle_t module) + : module_(module) {} + +module_wrapper_t::~module_wrapper_t() { + if (module_) l0::zeModuleDestroy(module_); +} + +module_wrapper_t::operator ze_module_handle_t() const { + return module_; +} + +status_t get_device_ip(ze_device_handle_t device, uint32_t &ip_version) { + ze_device_ip_version_ext_t device_ip_version_ext = {}; + device_ip_version_ext.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT; + device_ip_version_ext.pNext = nullptr; + + ze_device_properties_t device_properties = {}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + device_properties.pNext = &device_ip_version_ext; + + CHECK(l0::zeDeviceGetProperties(device, &device_properties)); + + ip_version = device_ip_version_ext.ipVersion; + + return status::success; +} + +status_t get_l0_device_enabled_systolic_intel( + ze_device_handle_t device, bool &mayiuse_systolic) { + ze_intel_device_module_dp_exp_properties_t + intel_device_module_dp_exp_properties + = {}; + intel_device_module_dp_exp_properties.stype + = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES; + intel_device_module_dp_exp_properties.pNext = nullptr; + + ze_device_module_properties_t device_module_properties = {}; + device_module_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES; + device_module_properties.pNext = &intel_device_module_dp_exp_properties; + + 
CHECK(l0::zeDeviceGetModuleProperties(device, &device_module_properties)); + + mayiuse_systolic = intel_device_module_dp_exp_properties.flags + & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS; + + return status::success; +} + +status_t get_l0_device_enabled_native_float_atomics( + ze_device_handle_t device, uint64_t &native_extensions) { + using namespace gpu::intel::compute; + + ze_float_atomic_ext_properties_t float_atomic_ext_properties = {}; + float_atomic_ext_properties.stype + = ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES; + float_atomic_ext_properties.pNext = nullptr; + + ze_device_module_properties_t device_module_properties = {}; + device_module_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES; + device_module_properties.pNext = &float_atomic_ext_properties; + + CHECK(l0::zeDeviceGetModuleProperties(device, &device_module_properties)); + + ze_device_fp_atomic_ext_flags_t atomic_load_store + = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE + | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE; + ze_device_fp_atomic_ext_flags_t atomic_add + = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD + | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD; + ze_device_fp_atomic_ext_flags_t atomic_min_max + = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX + | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX; + + if ((float_atomic_ext_properties.fp16Flags & atomic_load_store) + == atomic_load_store) + native_extensions |= (uint64_t)native_ext_t::fp16_atomic_load_store; + if ((float_atomic_ext_properties.fp16Flags & atomic_add) == atomic_add) + native_extensions |= (uint64_t)native_ext_t::fp16_atomic_add; + if ((float_atomic_ext_properties.fp16Flags & atomic_min_max) + == atomic_min_max) + native_extensions |= (uint64_t)native_ext_t::fp16_atomic_min_max; + + if ((float_atomic_ext_properties.fp32Flags & atomic_load_store) + == atomic_load_store) + native_extensions |= (uint64_t)native_ext_t::fp32_atomic_load_store; + if ((float_atomic_ext_properties.fp32Flags & atomic_add) == atomic_add) + native_extensions |= (uint64_t)native_ext_t::fp32_atomic_add; + if ((float_atomic_ext_properties.fp32Flags & atomic_min_max) + == atomic_min_max) + native_extensions |= (uint64_t)native_ext_t::fp32_atomic_min_max; + + if ((float_atomic_ext_properties.fp64Flags & atomic_load_store) + == atomic_load_store) + native_extensions |= (uint64_t)native_ext_t::fp64_atomic_load_store; + if ((float_atomic_ext_properties.fp64Flags & atomic_add) == atomic_add) + native_extensions |= (uint64_t)native_ext_t::fp64_atomic_add; + if ((float_atomic_ext_properties.fp64Flags & atomic_min_max) + == atomic_min_max) + native_extensions |= (uint64_t)native_ext_t::fp64_atomic_min_max; + + return status::success; +} + +status_t get_l0_device_eu_count(ze_device_handle_t device, int &eu_count) { + ze_eu_count_ext_t eu_count_ext = {}; + eu_count_ext.stype = ZE_STRUCTURE_TYPE_EU_COUNT_EXT; + eu_count_ext.pNext = nullptr; + + ze_device_properties_t device_properties = {}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + device_properties.pNext = &eu_count_ext; + + CHECK(l0::zeDeviceGetProperties(device, &device_properties)); + + eu_count = eu_count_ext.numTotalEUs; + + return status::success; +} + +status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device, + ze_context_handle_t context, uint32_t &ip_version, + compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product_, + uint64_t &native_extensions, bool &mayiuse_systolic, + bool &mayiuse_ngen_kernels) { + using namespace ngen; + ngen::Product product = 
LevelZeroCodeGenerator::detectHWInfo( + context, device); + + gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family)); + std::memcpy(&product_, &product, sizeof(ngen::Product)); + + mayiuse_systolic = false; + if (get_l0_device_enabled_systolic_intel(device, mayiuse_systolic) + != status::success) + mayiuse_systolic = false; + + /* Some old drivers do not report systolic availability. Manually override + systolic availability based on product family. */ + switch (product.family) { + case ProductFamily::DG2: + case ProductFamily::ARL: + case ProductFamily::PVC: mayiuse_systolic = true; + default: break; + } + + CHECK(get_l0_device_enabled_native_float_atomics( + device, native_extensions)); + + auto status + = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine); + if (status != status::success) mayiuse_ngen_kernels = false; + + ip_version = 0; + + return get_device_ip(device, ip_version); +} + +xpu::device_uuid_t get_device_uuid(const ze_device_handle_t device) { + static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16, + "ZE_MAX_DEVICE_UUID_SIZE is expected to be 16"); + + ze_device_properties_t device_properties = {}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + device_properties.pNext = nullptr; + + auto status = l0::zeDeviceGetProperties(device, &device_properties); + MAYBE_UNUSED(status); + assert(status == status::success); + + const auto &device_id = device_properties.uuid.id; + + uint64_t uuid[ZE_MAX_DEVICE_UUID_SIZE / sizeof(uint64_t)] = {}; + for (size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) { + size_t shift = i % sizeof(uint64_t) * CHAR_BIT; + uuid[i / sizeof(uint64_t)] |= (((uint64_t)device_id[i]) << shift); + } + + return xpu::device_uuid_t(uuid[0], uuid[1]); +} + +status_t get_device_index(const ze_device_handle_t device, size_t *index) { + uint32_t driver_count = 0; + CHECK(l0::zeDriverGet(&driver_count, nullptr)); + + std::vector drivers(driver_count); + CHECK(l0::zeDriverGet(&driver_count, drivers.data())); + + uint32_t device_count = 0; + CHECK(l0::zeDeviceGet(drivers[0], &device_count, nullptr)); + + std::vector devices(device_count); + CHECK(l0::zeDeviceGet(drivers[0], &device_count, devices.data())); + + for (size_t i = 0; i < device_count; i++) { + if (device == devices[i]) { + *index = i; + + return status::success; + } + } + + return status::invalid_arguments; +} + +std::string get_kernel_name(const ze_kernel_handle_t kernel) { + std::string kernel_name; + + size_t kernel_name_size = 0; + l0::zeKernelGetName(kernel, &kernel_name_size, nullptr); + + kernel_name.resize(kernel_name_size, 0); + l0::zeKernelGetName(kernel, &kernel_name_size, &kernel_name[0]); + + // Remove the null terminator as std::string already includes it + kernel_name.resize(kernel_name_size - 1); + + return kernel_name; +} + +status_t get_module_binary( + const ze_module_handle_t module, xpu::binary_t &binary) { + size_t module_binary_size; + CHECK(l0::zeModuleGetNativeBinary(module, &module_binary_size, nullptr)); + + binary.resize(module_binary_size); + CHECK(l0::zeModuleGetNativeBinary( + module, &module_binary_size, binary.data())); + + return status::success; +} + +status_t get_kernel_binary( + const ze_kernel_handle_t kernel, xpu::binary_t &binary) { + size_t binary_size = 0; + CHECK(l0::zeKernelGetBinaryExp(kernel, &binary_size, nullptr)); + + binary.resize(binary_size); + CHECK(l0::zeKernelGetBinaryExp(kernel, &binary_size, binary.data())); + + return status::success; +} + +static inline ze_result_t func_zeModuleCreate(ze_context_handle_t hContext, 
+ ze_device_handle_t hDevice, const ze_module_desc_t *desc, + ze_module_handle_t *phModule, + ze_module_build_log_handle_t *phBuildLog) { + const ze_init_flags_t default_ze_flags = 0; + static auto init_ = find_ze_symbol("zeInit"); + if (!init_) return ZE_RESULT_ERROR_NOT_AVAILABLE; + init_(default_ze_flags); + + static auto f_ + = find_ze_symbol("zeModuleCreate"); + if (!f_) return ZE_RESULT_ERROR_NOT_AVAILABLE; + return f_(hContext, hDevice, desc, phModule, phBuildLog); +} + +#define ZE_MODULE_FORMAT_OCLC (ze_module_format_t)3U +static inline ze_module_handle_t compile_ocl_module( + const ze_device_handle_t device, const ze_context_handle_t context, + const std::string &code, const std::string &options) { + ze_module_desc_t module_desc; + module_desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; + module_desc.pNext = nullptr; + module_desc.format = ZE_MODULE_FORMAT_OCLC; + module_desc.inputSize = code.size(); + module_desc.pInputModule = reinterpret_cast(code.c_str()); + module_desc.pBuildFlags = options.c_str(); + module_desc.pConstants = nullptr; + + ze_module_handle_t module_handle; + ze_module_build_log_handle_t module_build_log_handle; + ze_result_t ret = func_zeModuleCreate(context, device, &module_desc, + &module_handle, &module_build_log_handle); + if (ret != ZE_RESULT_SUCCESS) return nullptr; + return module_handle; +} + +bool mayiuse_microkernels(const ze_device_handle_t device, + const ze_context_handle_t context, const std::string &code) { + ze_module_handle_t module_handle + = compile_ocl_module(device, context, code, ""); + if (module_handle) { + l0::zeModuleDestroy(module_handle); + return true; + } + return false; +} + +status_t compile_ocl_module_to_binary(const ze_device_handle_t device, + const ze_context_handle_t context, const std::string &code, + const std::string &options, xpu::binary_t &binary) { + ze_module_handle_t module_handle + = compile_ocl_module(device, context, code, options); + if (!module_handle) { return status::runtime_error; } + CHECK(l0::get_module_binary(module_handle, binary)); + CHECK(l0::zeModuleDestroy(module_handle)); + + return status::success; +} + +status_t create_kernels(const ze_device_handle_t device, + const ze_context_handle_t context, + const std::vector &kernel_names, + const xpu::binary_t &binary, ze_module_handle_t *module, + std::vector &kernels) { + ze_module_desc_t module_desc; + module_desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; + module_desc.pNext = nullptr; + module_desc.format = ZE_MODULE_FORMAT_NATIVE; + module_desc.inputSize = binary.size(); + module_desc.pInputModule = binary.data(); + module_desc.pBuildFlags = ""; + module_desc.pConstants = nullptr; + + CHECK(l0::zeModuleCreate(context, device, &module_desc, module, nullptr)); + + kernels.resize(kernel_names.size(), nullptr); + for (size_t i = 0; i < kernel_names.size(); i++) { + if (kernel_names[i] == nullptr) { + kernels[i] = nullptr; + continue; + } + + ze_kernel_desc_t kernel_desc = {}; + kernel_desc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC; + kernel_desc.pNext = nullptr; + kernel_desc.flags = 0; + kernel_desc.pKernelName = kernel_names[i]; + + ze_kernel_handle_t kernel; + CHECK(l0::zeKernelCreate(*module, &kernel_desc, &kernel)); + + kernels[i] = kernel; + } + + return status::success; +} + +ze_memory_type_t get_pointer_type( + const ze_context_handle_t context, const void *ptr) { + ze_memory_allocation_properties_t memory_allocation_properties; + memory_allocation_properties.stype + = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES; + memory_allocation_properties.pNext = 
nullptr; + + l0::zeMemGetAllocProperties( + context, ptr, &memory_allocation_properties, nullptr); + + return memory_allocation_properties.type; +} + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/l0/utils/utils.hpp b/src/gpu/intel/l0/utils/utils.hpp new file mode 100644 index 00000000000..efd47be85dc --- /dev/null +++ b/src/gpu/intel/l0/utils/utils.hpp @@ -0,0 +1,234 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_L0_UTILS_UTILS_HPP +#define GPU_INTEL_L0_UTILS_UTILS_HPP + +#if defined(__linux__) +#include +#elif defined(_WIN32) +#include "windows.h" +#else +#error "Level Zero is supported on Linux and Windows only" +#endif + +#include "gpu/intel/compute/kernel.hpp" + +#include "level_zero/ze_api.h" +#include "level_zero/ze_intel_gpu.h" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace l0 { + +static inline std::string to_string(ze_result_t r) { +#define ZE_STATUS_CASE(status) \ + case status: return #status + switch (r) { + ZE_STATUS_CASE(ZE_RESULT_SUCCESS); + ZE_STATUS_CASE(ZE_RESULT_NOT_READY); + ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_LOST); + ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY); + ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY); + ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET); + ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); + ZE_STATUS_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNINITIALIZED); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SIZE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION); + 
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED); + ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE); + ZE_STATUS_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS); + ZE_STATUS_CASE(ZE_RESULT_ERROR_UNKNOWN); + ZE_STATUS_CASE(ZE_RESULT_FORCE_UINT32); + default: return std::to_string((int)r); + } +#undef ZE_STATUS_CASE +} + +#define ZE_CHECK(f) \ + do { \ + ze_result_t res_ = (f); \ + if (res_ != ZE_RESULT_SUCCESS) { \ + std::string err_str_ = to_string(res_); \ + VERROR(common, level_zero, "errcode %s", err_str_.c_str()); \ + return status::runtime_error; \ + } \ + } while (false) + +#if defined(_WIN32) +#define L0_LIB_NAME "ze_loader.dll" +#elif defined(__linux__) +#define L0_LIB_NAME "libze_loader.so.1" +#endif + +template +F find_ze_symbol(const char *symbol) { + return (F)xpu::find_symbol(L0_LIB_NAME, symbol); +} +#undef L0_LIB_NAME + +#define INDIRECT_L0_CALL(f) \ + template \ + status_t f(Args &&...args) { \ + const ze_init_flags_t default_ze_flags = 0; \ + static auto init_ = find_ze_symbol("zeInit"); \ + if (!init_) return status::runtime_error; \ + ZE_CHECK(init_(default_ze_flags)); \ + static auto f_ = find_ze_symbol(#f); \ + if (!f_) return status::runtime_error; \ + ZE_CHECK(f_(std::forward(args)...)); \ + return status::success; \ + } +INDIRECT_L0_CALL(zeDriverGet) +INDIRECT_L0_CALL(zeDriverGetProperties) +INDIRECT_L0_CALL(zeDeviceGet) +INDIRECT_L0_CALL(zeDeviceGetProperties) +INDIRECT_L0_CALL(zeDeviceGetComputeProperties) +INDIRECT_L0_CALL(zeDeviceGetModuleProperties) +INDIRECT_L0_CALL(zeDeviceGetMemoryAccessProperties) +INDIRECT_L0_CALL(zeDeviceGetCacheProperties) +INDIRECT_L0_CALL(zeContextCreate) +INDIRECT_L0_CALL(zeContextDestroy) +INDIRECT_L0_CALL(zeCommandListCreateImmediate) +INDIRECT_L0_CALL(zeCommandListDestroy) +INDIRECT_L0_CALL(zeCommandListHostSynchronize) +INDIRECT_L0_CALL(zeCommandListGetContextHandle) +INDIRECT_L0_CALL(zeCommandListAppendBarrier) +INDIRECT_L0_CALL(zeCommandListAppendMemoryCopy) +INDIRECT_L0_CALL(zeCommandListAppendMemoryFill) +INDIRECT_L0_CALL(zeEventPoolCreate) +INDIRECT_L0_CALL(zeEventPoolDestroy) +INDIRECT_L0_CALL(zeEventCreate) +INDIRECT_L0_CALL(zeEventDestroy) +INDIRECT_L0_CALL(zeEventHostSynchronize) +INDIRECT_L0_CALL(zeEventQueryKernelTimestamp) +INDIRECT_L0_CALL(zeMemAllocShared) +INDIRECT_L0_CALL(zeMemAllocDevice) +INDIRECT_L0_CALL(zeMemAllocHost) +INDIRECT_L0_CALL(zeMemFree) +INDIRECT_L0_CALL(zeMemGetAllocProperties) +INDIRECT_L0_CALL(zeModuleCreate) +INDIRECT_L0_CALL(zeModuleDestroy) +INDIRECT_L0_CALL(zeModuleBuildLogGetString) +INDIRECT_L0_CALL(zeModuleBuildLogDestroy) +INDIRECT_L0_CALL(zeModuleGetNativeBinary) +INDIRECT_L0_CALL(zeKernelCreate) +INDIRECT_L0_CALL(zeKernelDestroy) +INDIRECT_L0_CALL(zeKernelSetArgumentValue) +INDIRECT_L0_CALL(zeKernelGetName) +INDIRECT_L0_CALL(zeKernelGetBinaryExp) +INDIRECT_L0_CALL(zeKernelSetGroupSize) +INDIRECT_L0_CALL(zeKernelSuggestGroupSize) +INDIRECT_L0_CALL(zeCommandListAppendLaunchKernel) +#undef INDIRECT_L0_CALL + +class event_wrapper_t { +public: + event_wrapper_t(ze_event_handle_t event); + ~event_wrapper_t(); + operator ze_event_handle_t() const; + +private: + ze_event_handle_t event_; + + event_wrapper_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(event_wrapper_t); +}; + +class event_pool_wrapper_t { +public: + event_pool_wrapper_t(ze_event_pool_handle_t event_pool); + 
~event_pool_wrapper_t(); + operator ze_event_pool_handle_t() const; + +private: + ze_event_pool_handle_t event_pool_; + + event_pool_wrapper_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(event_pool_wrapper_t); +}; + +class module_wrapper_t { +public: + module_wrapper_t(ze_module_handle_t module); + ~module_wrapper_t(); + operator ze_module_handle_t() const; + +private: + ze_module_handle_t module_; + + module_wrapper_t() = delete; + DNNL_DISALLOW_COPY_AND_ASSIGN(module_wrapper_t); +}; + +status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device, + ze_context_handle_t context, uint32_t &ip_version, + compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product, + uint64_t &native_extensions, bool &mayiuse_systolic, + bool &mayiuse_ngen_kernels); +xpu::device_uuid_t get_device_uuid(const ze_device_handle_t device); +status_t get_device_index(const ze_device_handle_t device, size_t *index); +std::string get_kernel_name(const ze_kernel_handle_t kernel); +status_t get_module_binary( + const ze_module_handle_t module, xpu::binary_t &binary); +status_t get_kernel_binary( + const ze_kernel_handle_t kernel, xpu::binary_t &binary); +bool mayiuse_microkernels(const ze_device_handle_t device, + const ze_context_handle_t context, const std::string &code); +status_t compile_ocl_module_to_binary(const ze_device_handle_t device, + const ze_context_handle_t context, const std::string &code, + const std::string &options, xpu::binary_t &binary); +status_t create_kernels(const ze_device_handle_t device, + const ze_context_handle_t context, + const std::vector &kernel_names, + const xpu::binary_t &binary, ze_module_handle_t *module, + std::vector &kernels); +ze_memory_type_t get_pointer_type(const ze_context_handle_t, const void *ptr); + +} // namespace l0 +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif // GPU_INTEL_L0_UTILS_UTILS_HPP diff --git a/src/gpu/intel/ocl/CMakeLists.txt b/src/gpu/intel/ocl/CMakeLists.txt index b5e057a405e..2423aee56c0 100644 --- a/src/gpu/intel/ocl/CMakeLists.txt +++ b/src/gpu/intel/ocl/CMakeLists.txt @@ -15,9 +15,7 @@ #=============================================================================== file(GLOB_RECURSE SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/*.h ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.c ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ) diff --git a/src/gpu/intel/ocl/device_info.cpp b/src/gpu/intel/ocl/device_info.cpp index 7273a058c81..e55be6d92f5 100644 --- a/src/gpu/intel/ocl/device_info.cpp +++ b/src/gpu/intel/ocl/device_info.cpp @@ -92,19 +92,11 @@ status_t device_info_t::init_runtime_version(impl::engine_t *engine) { } status_t device_info_t::init_extensions(impl::engine_t *engine) { - cl_int err = CL_SUCCESS; auto device = utils::downcast(engine)->device(); // query device for extensions - size_t param_size = 0; - err = xpu::ocl::clGetDeviceInfo( - device, CL_DEVICE_EXTENSIONS, 0, nullptr, ¶m_size); - OCL_CHECK(err); - - std::string extension_string(param_size, '\0'); - err = xpu::ocl::clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, param_size, - &extension_string[0], ¶m_size); - OCL_CHECK(err); + std::string extension_string; + CHECK(xpu::ocl::get_extensions(device, extension_string)); // convert to ours using namespace compute; diff --git a/src/gpu/intel/ocl/device_info.hpp b/src/gpu/intel/ocl/device_info.hpp index eb43584810d..eeb55b70951 100644 --- a/src/gpu/intel/ocl/device_info.hpp +++ b/src/gpu/intel/ocl/device_info.hpp @@ -22,7 +22,7 @@ #include #include 
"gpu/intel/compute/device_info.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/intel/ocl/engine.cpp b/src/gpu/intel/ocl/engine.cpp index c3bf7b991b3..a3cbbdfbd7a 100644 --- a/src/gpu/intel/ocl/engine.cpp +++ b/src/gpu/intel/ocl/engine.cpp @@ -31,7 +31,7 @@ #include "gpu/intel/ocl/device_info.hpp" #include "gpu/intel/ocl/kernel.hpp" #include "gpu/intel/ocl/stream.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { @@ -39,33 +39,6 @@ namespace gpu { namespace intel { namespace ocl { -status_t preprocess_headers(stringstream_t &pp_code, const char *code, - const compute::kernel_ctx_t &kernel_ctx) { - stringstream_t code_stream(code); - - for (std::string line; std::getline(code_stream, line);) { - const size_t include_pos = line.find("#include"); - if (include_pos != std::string::npos) { - static constexpr size_t include_len = 8; - const size_t first_quote_pos - = line.find("\"", include_pos + include_len); - const size_t second_quote_pos - = line.find("\"", first_quote_pos + 1); - const size_t kernel_name_len - = second_quote_pos - first_quote_pos - 1; - const auto header_name - = line.substr(first_quote_pos + 1, kernel_name_len); - const char *header_source - = kernel_ctx.get_custom_header(header_name); - if (!header_source) header_source = get_kernel_header(header_name); - CHECK(preprocess_headers(pp_code, header_source, kernel_ctx)); - } else { - pp_code << line << std::endl; - } - } - return status::success; -} - status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind, cl_device_id dev, cl_context ctx, size_t index, const std::vector &cache_blob) { @@ -331,14 +304,14 @@ status_t engine_t::build_program_from_source( // `clCompileProgram` `clBuildProgram` doesn't take headers. Because of // that, a manual preprocessing of `include` header directives in the // OpenCL kernels is required. 
- CHECK(preprocess_headers(pp_code, code_string, kernel_ctx)); + CHECK(compute::preprocess_headers(pp_code, code_string, kernel_ctx)); std::string pp_code_str = pp_code.str(); const char *pp_code_str_ptr = pp_code_str.c_str(); src = {pp_code_str}; if (src) { options += " -g -s " + std::string(src.name()); } - debugdump_processed_source( + compute::debugdump_processed_source( pp_code_str, options, dev_info->get_cl_ext_options()); auto ctx = context(); diff --git a/src/gpu/intel/ocl/engine.hpp b/src/gpu/intel/ocl/engine.hpp index 7d4273dc7c9..00543c476f7 100644 --- a/src/gpu/intel/ocl/engine.hpp +++ b/src/gpu/intel/ocl/engine.hpp @@ -21,6 +21,7 @@ #include "common/utils.hpp" #include "gpu/gpu_impl_list.hpp" #include "gpu/intel/engine.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "xpu/ocl/engine_impl.hpp" #include "xpu/utils.hpp" @@ -30,9 +31,6 @@ namespace gpu { namespace intel { namespace ocl { -status_t preprocess_headers(stringstream_t &pp_code, const char *code, - const compute::kernel_ctx_t &kernel_ctx); - status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind, cl_device_id dev, cl_context ctx, size_t index, const std::vector &cache_blob); diff --git a/src/gpu/intel/ocl/hw_info.cpp b/src/gpu/intel/ocl/hw_info.cpp index 7872e1b2abb..187687276dd 100644 --- a/src/gpu/intel/ocl/hw_info.cpp +++ b/src/gpu/intel/ocl/hw_info.cpp @@ -15,7 +15,7 @@ *******************************************************************************/ #include "gpu/intel/ocl/hw_info.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "gpu/intel/jit/binary_format.hpp" #include "gpu/intel/jit/generator.hpp" diff --git a/src/gpu/intel/ocl/kernel.cpp b/src/gpu/intel/ocl/kernel.cpp index 7304338d01f..17bac4daaa3 100644 --- a/src/gpu/intel/ocl/kernel.cpp +++ b/src/gpu/intel/ocl/kernel.cpp @@ -31,7 +31,7 @@ #include "gpu/intel/ocl/engine.hpp" #include "gpu/intel/ocl/stream.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/intel/ocl/mdapi_utils.cpp b/src/gpu/intel/ocl/mdapi_utils.cpp index 888f1475acd..732a1897bee 100644 --- a/src/gpu/intel/ocl/mdapi_utils.cpp +++ b/src/gpu/intel/ocl/mdapi_utils.cpp @@ -28,7 +28,7 @@ #include #include -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "mdapi/metrics_discovery_api.h" #ifndef CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL diff --git a/src/gpu/intel/ocl/stream.cpp b/src/gpu/intel/ocl/stream.cpp index a086bd20084..213fdd76732 100644 --- a/src/gpu/intel/ocl/stream.cpp +++ b/src/gpu/intel/ocl/stream.cpp @@ -26,7 +26,7 @@ #include "gpu/intel/ocl/engine.hpp" #include "gpu/intel/ocl/stream.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/intel/ocl/stream.hpp b/src/gpu/intel/ocl/stream.hpp index 56530fbf072..1da17c0b553 100644 --- a/src/gpu/intel/ocl/stream.hpp +++ b/src/gpu/intel/ocl/stream.hpp @@ -26,9 +26,11 @@ #include "xpu/ocl/context.hpp" #include "xpu/ocl/stream_impl.hpp" -#include "gpu/intel/ocl/mdapi_utils.hpp" #include "gpu/intel/stream.hpp" +#include "gpu/intel/ocl/mdapi_utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" + namespace dnnl { namespace impl { namespace gpu { diff --git a/src/gpu/intel/ocl/utils/CMakeLists.txt b/src/gpu/intel/ocl/utils/CMakeLists.txt new file mode 100644 index 00000000000..9cca3dd28ac --- /dev/null +++ b/src/gpu/intel/ocl/utils/CMakeLists.txt @@ -0,0 +1,24 @@ 
+#=============================================================================== +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +file(GLOB_RECURSE SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp + ) + +set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_ocl_utils) +add_library(${OBJ_LIB} OBJECT ${SOURCES}) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/gpu/intel/ocl/utils.cpp b/src/gpu/intel/ocl/utils/utils.cpp similarity index 82% rename from src/gpu/intel/ocl/utils.cpp rename to src/gpu/intel/ocl/utils/utils.cpp index 5460334e45e..57e456fddc3 100644 --- a/src/gpu/intel/ocl/utils.cpp +++ b/src/gpu/intel/ocl/utils/utils.cpp @@ -23,7 +23,7 @@ #include "gpu/intel/ocl/engine.hpp" #include "gpu/intel/ocl/hw_info.hpp" #include "gpu/intel/ocl/kernel.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "xpu/ocl/utils.hpp" #ifndef CL_KERNEL_BINARY_PROGRAM_INTEL @@ -219,70 +219,6 @@ status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary) { return status::success; } -void debugdump_processed_source(const std::string &source, - const std::string &options, const std::string &cl_options) { -#if defined(__linux__) && defined(DNNL_DEV_MODE) - if (get_verbose(verbose_t::debuginfo) >= 10) { - auto get_defines = [](const std::string &from) { - std::string ret; - size_t pos = 0; - while (pos < from.length()) { - // Find next define argument - pos = from.find("-D", pos); - - // Generate argument, quotes are interpreted literally, but - // other special shell characters need escaped. Does not - // currently handle quotes with the ' character or nested quotes - char quote_parity = true; - while (pos < from.length()) { - if (quote_parity - && utils::one_of(from[pos], '~', '#', '$', '&', '*', - '(', ')', '\\', '|', '[', ']', '{', '}', - ';', '\'', '<', '>', '/', '?', '!')) { - ret += '\\'; - } - ret += from[pos]; - if (from[pos] == '"') quote_parity ^= true; - if (from[pos] == ' ' && quote_parity) break; - - pos++; - } - } - return ret; - }; - auto execute_command - = [](const std::string &cmd, const std::string &stdin) { - std::string result; - std::array buffer; - FILE *pipe = popen(cmd.c_str(), "w"); - fputs(stdin.c_str(), pipe); - if (pipe) { - while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { - result += buffer.data(); - } - } - pclose(pipe); - return result; - }; - - // Run utilities to evaluate preprocessor defines and format the file - // Theoretically, we can accomplish this task with libclang, but it - // seems more work than it is worth. Instead, wrapping this in OCL_DEBUG - // so that calls to the system are not included in the default build. 
- - // Due to the use of a different C preprocessor, warnings should not be - // ignored, as they may correspond to a different behavior in the OpenCL - // C preprocessor - auto o = get_defines(options) + get_defines(cl_options); - std::string preprocess_cmd - = std::string() + "cpp -P " + o + " | clang-format"; - execute_command(preprocess_cmd, source); - std::cout << "OCL_ARCH_OPTIONS: " << cl_options << std::endl; - std::cout << "OCL_OPTIONS: " << options << std::endl; - } -#endif -} - status_t get_kernel_arg_types(cl_kernel ocl_kernel, std::vector *arg_types) { cl_uint nargs; diff --git a/src/gpu/intel/ocl/utils.hpp b/src/gpu/intel/ocl/utils/utils.hpp similarity index 92% rename from src/gpu/intel/ocl/utils.hpp rename to src/gpu/intel/ocl/utils/utils.hpp index 09e4354536b..1746f2bb889 100644 --- a/src/gpu/intel/ocl/utils.hpp +++ b/src/gpu/intel/ocl/utils/utils.hpp @@ -14,8 +14,8 @@ * limitations under the License. *******************************************************************************/ -#ifndef GPU_INTEL_OCL_UTILS_HPP -#define GPU_INTEL_OCL_UTILS_HPP +#ifndef GPU_INTEL_OCL_UTILS_UTILS_HPP +#define GPU_INTEL_OCL_UTILS_UTILS_HPP #include #include @@ -55,9 +55,6 @@ status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary); status_t get_ocl_program_binary_size( cl_kernel kernel, cl_device_id device, size_t *size); -void debugdump_processed_source(const std::string &source, - const std::string &options, const std::string &ocl_options); - status_t get_kernel_arg_types(cl_kernel ocl_kernel, std::vector *arg_types); @@ -76,4 +73,4 @@ status_t get_ocl_device_enabled_native_float_atomics( } // namespace impl } // namespace dnnl -#endif +#endif // GPU_INTEL_OCL_UTILS_UTILS_HPP diff --git a/src/gpu/intel/sycl/compat.cpp b/src/gpu/intel/sycl/compat.cpp index 4227c23058c..d3c6c0613e8 100644 --- a/src/gpu/intel/sycl/compat.cpp +++ b/src/gpu/intel/sycl/compat.cpp @@ -35,7 +35,7 @@ #include "gpu/intel/compute/device_info.hpp" #include "gpu/intel/sycl/compat.hpp" #include "gpu/intel/sycl/engine.hpp" -#include "gpu/intel/sycl/l0/utils.hpp" +#include "gpu/intel/sycl/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/intel/sycl/device_info.cpp b/src/gpu/intel/sycl/device_info.cpp index 333db8b6372..99ac9da8706 100644 --- a/src/gpu/intel/sycl/device_info.cpp +++ b/src/gpu/intel/sycl/device_info.cpp @@ -19,11 +19,11 @@ #include "gpu/intel/sycl/compat.hpp" #include "gpu/intel/sycl/device_info.hpp" #include "gpu/intel/sycl/engine.hpp" -#include "gpu/intel/sycl/l0/utils.hpp" -#include "gpu/intel/sycl/utils.hpp" #include "gpu/intel/ocl/hw_info.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" + +#include "gpu/intel/l0/utils/utils.hpp" namespace dnnl { namespace impl { @@ -56,7 +56,7 @@ status_t device_info_t::init_arch(impl::engine_t *engine) { auto ze_dev = xpu::sycl::compat::get_native(device); auto ze_ctx = xpu::sycl::compat::get_native(ctx); - status = gpu::intel::sycl::init_gpu_hw_info(engine, ze_dev, ze_ctx, + status = gpu::intel::l0::init_gpu_hw_info(engine, ze_dev, ze_ctx, ip_version_, gpu_arch_, gpu_product_, native_extensions_, mayiuse_systolic_, mayiuse_ngen_kernels_); } else { diff --git a/src/gpu/intel/sycl/engine.cpp b/src/gpu/intel/sycl/engine.cpp index 19a6e92e4b2..ee0fa18798a 100644 --- a/src/gpu/intel/sycl/engine.cpp +++ b/src/gpu/intel/sycl/engine.cpp @@ -119,13 +119,20 @@ status_t engine_t::create_kernels( "No OpenCL source was found for kernel"); stringstream_t pp_code; - 
CHECK(gpu::intel::ocl::preprocess_headers(pp_code, source, kernel_ctx)); + CHECK(compute::preprocess_headers(pp_code, source, kernel_ctx)); + std::string code_str = pp_code.str(); std::string build_options = kernel_ctx.options(); build_options += " " + device_info()->get_cl_ext_options(); + gpu::intel::compute::program_src_t src(code_str); + if (src) { build_options += " -g -s " + std::string(src.name()); } + + compute::debugdump_processed_source( + code_str, build_options, device_info()->get_cl_ext_options()); + auto kb_src = syclex::create_kernel_bundle_from_source( - context(), syclex::source_language::opencl, pp_code.str()); + context(), syclex::source_language::opencl, code_str); auto kb_exe = syclex::build( kb_src, syclex::properties {syclex::build_options(build_options)}); *kernels = std::vector(kernel_names.size()); @@ -133,8 +140,7 @@ status_t engine_t::create_kernels( if (!kernel_names[i]) continue; CHECK(interop_kernel_t::make((*kernels)[i], - kb_exe.ext_oneapi_get_kernel(kernel_names[i]), - gpu::intel::compute::program_src_t(pp_code.str()))); + kb_exe.ext_oneapi_get_kernel(kernel_names[i]), src)); } return status::success; diff --git a/src/gpu/intel/sycl/engine.hpp b/src/gpu/intel/sycl/engine.hpp index 5a9ddd78874..58b42f6453c 100644 --- a/src/gpu/intel/sycl/engine.hpp +++ b/src/gpu/intel/sycl/engine.hpp @@ -31,7 +31,7 @@ #include "gpu/intel/ocl/engine.hpp" #include "gpu/intel/ocl/kernel.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "gpu/intel/sycl/compat.hpp" #include "gpu/intel/sycl/utils.hpp" diff --git a/src/gpu/intel/sycl/interop_kernel.cpp b/src/gpu/intel/sycl/interop_kernel.cpp index 960b2412265..eedef84aacc 100644 --- a/src/gpu/intel/sycl/interop_kernel.cpp +++ b/src/gpu/intel/sycl/interop_kernel.cpp @@ -19,8 +19,7 @@ #include "common/verbose.hpp" #include "gpu/intel/compute/types_interop.hpp" #include "gpu/intel/compute/utils.hpp" -#include "gpu/intel/ocl/utils.hpp" -#include "gpu/intel/sycl/l0/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "gpu/intel/sycl/stream.hpp" #include "gpu/intel/sycl/utils.hpp" #include "xpu/sycl/c_types_map.hpp" diff --git a/src/gpu/intel/sycl/l0/utils.cpp b/src/gpu/intel/sycl/l0/utils.cpp deleted file mode 100644 index 28a7a44f09b..00000000000 --- a/src/gpu/intel/sycl/l0/utils.cpp +++ /dev/null @@ -1,370 +0,0 @@ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "gpu/intel/sycl/l0/utils.hpp" -#include "oneapi/dnnl/dnnl_config.h" - -#include "gpu/intel/jit/binary_format.hpp" -#include "gpu/intel/jit/utils/type_bridge.hpp" -#include "ngen_level_zero.hpp" - -#include "level_zero/ze_api.h" -#include "level_zero/ze_intel_gpu.h" - -#if !defined(__SYCL_COMPILER_VERSION) -#error "Unsupported compiler" -#endif - -#if (__SYCL_COMPILER_VERSION < 20200818) -#error "Level Zero is not supported with this compiler version" -#endif - -#include "common/c_types_map.hpp" -#include "common/verbose.hpp" - -#include "gpu/intel/sycl/utils.hpp" -#include - -#include "gpu/intel/sycl/engine.hpp" - -namespace dnnl { -namespace impl { -namespace gpu { -namespace intel { -namespace sycl { -namespace l0 { - -std::string to_string(ze_result_t r) { -#define ZE_STATUS_CASE(status) \ - case status: return #status - switch (r) { - ZE_STATUS_CASE(ZE_RESULT_SUCCESS); - ZE_STATUS_CASE(ZE_RESULT_NOT_READY); - ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_LOST); - ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY); - ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY); - ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET); - ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); - ZE_STATUS_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNINITIALIZED); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SIZE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED); - ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE); - ZE_STATUS_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS); - ZE_STATUS_CASE(ZE_RESULT_ERROR_UNKNOWN); - ZE_STATUS_CASE(ZE_RESULT_FORCE_UINT32); - default: return std::to_string((int)r); - } -#undef ZE_STATUS_CASE -} - -#define ZE_CHECK(f) \ - do { \ - ze_result_t res_ = (f); \ - if (res_ != ZE_RESULT_SUCCESS) { \ - std::string err_str_ = to_string(res_); \ - VERROR(common, level_zero, "errcode %s", err_str_.c_str()); \ - return status::runtime_error; \ - } \ - } while (false) - -#if defined(_WIN32) -#define L0_LIB_NAME "ze_loader.dll" 
-#elif defined(__linux__) -#define L0_LIB_NAME "libze_loader.so.1" -#endif - -template -F find_ze_symbol(const char *symbol) { - return (F)xpu::find_symbol(L0_LIB_NAME, symbol); -} -#undef L0_LIB_NAME - -#define INDIRECT_L0_CALL(f) \ - template \ - status_t f(Args &&...args) { \ - const ze_init_flags_t default_ze_flags = 0; \ - static auto init_ = find_ze_symbol("zeInit"); \ - if (!init_) return status::runtime_error; \ - ZE_CHECK(init_(default_ze_flags)); \ - static auto f_ = find_ze_symbol(#f); \ - if (!f_) return status::runtime_error; \ - ZE_CHECK(f_(std::forward(args)...)); \ - return status::success; \ - } -INDIRECT_L0_CALL(zeModuleCreate) -INDIRECT_L0_CALL(zeDeviceGetProperties) -INDIRECT_L0_CALL(zeDeviceGetModuleProperties) -INDIRECT_L0_CALL(zeKernelCreate) -INDIRECT_L0_CALL(zeKernelGetBinaryExp) -INDIRECT_L0_CALL(zeModuleGetNativeBinary) -#undef INDIRECT_L0_CALL - -} // namespace l0 - -// FIXME: Currently SYCL doesn't provide any API to get device UUID so -// we query it directly from Level0 with the zeDeviceGetProperties function. -// The `get_device_uuid` function packs 128 bits of the device UUID, which are -// represented as an uint8_t array of size 16, to 2 uint64_t values. -xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) { - static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16, - "ZE_MAX_DEVICE_UUID_SIZE is expected to be 16"); - - auto ze_device_properties = ze_device_properties_t(); - ze_device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - - auto ze_device = xpu::sycl::compat::get_native(dev); - auto status = l0::zeDeviceGetProperties(ze_device, &ze_device_properties); - MAYBE_UNUSED(status); - assert(status == status::success); - - const auto &ze_device_id = ze_device_properties.uuid.id; - - uint64_t uuid[ZE_MAX_DEVICE_UUID_SIZE / sizeof(uint64_t)] = {}; - for (size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) { - size_t shift = i % sizeof(uint64_t) * CHAR_BIT; - uuid[i / sizeof(uint64_t)] |= (((uint64_t)ze_device_id[i]) << shift); - } - return xpu::device_uuid_t(uuid[0], uuid[1]); -} - -status_t sycl_create_kernels_with_level_zero( - std::vector> &sycl_kernels, - const std::vector &kernel_names, - const gpu::intel::sycl::engine_t *sycl_engine, - const xpu::binary_t &binary) { - auto desc = ze_module_desc_t(); - desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; - desc.format = ZE_MODULE_FORMAT_NATIVE; - desc.inputSize = binary.size(); - desc.pInputModule = binary.data(); - desc.pBuildFlags = ""; - desc.pConstants = nullptr; - - ze_module_handle_t ze_module; - - auto ze_device = xpu::sycl::compat::get_native( - sycl_engine->device()); - auto ze_ctx = xpu::sycl::compat::get_native( - sycl_engine->context()); - - CHECK(l0::zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr)); - ::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle - = ::sycl::make_kernel_bundle<::sycl::backend::ext_oneapi_level_zero, - ::sycl::bundle_state::executable>( - {ze_module}, sycl_engine->context()); - - sycl_kernels.resize(kernel_names.size()); - for (size_t i = 0; i < kernel_names.size(); i++) { - if (kernel_names[i] == nullptr) continue; - ze_kernel_handle_t ze_kernel; - ze_kernel_desc_t ze_kernel_desc { - ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernel_names[i]}; - CHECK(l0::zeKernelCreate(ze_module, &ze_kernel_desc, &ze_kernel)); - auto k = ::sycl::make_kernel<::sycl::backend::ext_oneapi_level_zero>( - {kernel_bundle, ze_kernel}, sycl_engine->context()); - sycl_kernels[i] = utils::make_unique<::sycl::kernel>(k); - } - - return status::success; -} - -status_t 
get_l0_kernel_binary( - const ::sycl::kernel &kernel, xpu::binary_t &binary) { -#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER - auto l0_kernel = ::sycl::get_native<::sycl::backend::ext_oneapi_level_zero>( - kernel); - size_t binary_size = 0; - CHECK(l0::zeKernelGetBinaryExp(l0_kernel, &binary_size, nullptr)); - binary.resize(binary_size); - CHECK(l0::zeKernelGetBinaryExp(l0_kernel, &binary_size, binary.data())); -#else - auto bundle = kernel.get_kernel_bundle(); - auto module_vec - = ::sycl::get_native<::sycl::backend::ext_oneapi_level_zero>( - bundle); - auto module = module_vec[0]; - size_t module_binary_size; - CHECK(l0::zeModuleGetNativeBinary(module, &module_binary_size, nullptr)); - binary.resize(module_binary_size); - CHECK(l0::zeModuleGetNativeBinary( - module, &module_binary_size, binary.data())); -#endif - return status::success; -} - -bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) { - auto lhs_ze_handle = xpu::sycl::compat::get_native(lhs); - auto rhs_ze_handle = xpu::sycl::compat::get_native(rhs); - - return lhs_ze_handle == rhs_ze_handle; -} - -status_t get_device_ip(ze_device_handle_t device, uint32_t &ip_version) { - auto devicePropsIP = ze_device_ip_version_ext_t(); - devicePropsIP.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT; - - auto deviceProps = ze_device_properties_t(); - deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - deviceProps.pNext = &devicePropsIP; - - CHECK(l0::zeDeviceGetProperties(device, &deviceProps)); - ip_version = devicePropsIP.ipVersion; - return status::success; -} - -status_t get_l0_device_enabled_systolic_intel( - ze_device_handle_t device, bool &mayiuse_systolic) { - // Note: supported by Intel Driver 24.05 and onwards - auto deviceModPropsExt = ze_intel_device_module_dp_exp_properties_t(); - deviceModPropsExt.stype - = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES; - - auto deviceModProps = ze_device_module_properties_t(); - deviceModProps.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES; - deviceModProps.pNext = &deviceModPropsExt; - - CHECK(l0::zeDeviceGetModuleProperties(device, &deviceModProps)); - mayiuse_systolic - = deviceModPropsExt.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS; - return status::success; -} - -status_t get_l0_device_enabled_native_float_atomics( - ze_device_handle_t device, uint64_t &native_extensions) { - using namespace gpu::intel::compute; - - auto fltAtom = ze_float_atomic_ext_properties_t(); - fltAtom.stype = ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES; - - auto deviceProps = ze_device_module_properties_t(); - deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES; - deviceProps.pNext = &fltAtom; - - CHECK(l0::zeDeviceGetModuleProperties(device, &deviceProps)); - - ze_device_fp_atomic_ext_flags_t atomic_load_store - = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE - | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE; - ze_device_fp_atomic_ext_flags_t atomic_add - = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD - | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD; - ze_device_fp_atomic_ext_flags_t atomic_min_max - = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX - | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX; - - if ((fltAtom.fp16Flags & atomic_load_store) == atomic_load_store) - native_extensions |= (uint64_t)native_ext_t::fp16_atomic_load_store; - if ((fltAtom.fp16Flags & atomic_add) == atomic_add) - native_extensions |= (uint64_t)native_ext_t::fp16_atomic_add; - if ((fltAtom.fp16Flags & atomic_min_max) == atomic_min_max) - native_extensions |= 
(uint64_t)native_ext_t::fp16_atomic_min_max; - - if ((fltAtom.fp32Flags & atomic_load_store) == atomic_load_store) - native_extensions |= (uint64_t)native_ext_t::fp32_atomic_load_store; - if ((fltAtom.fp32Flags & atomic_add) == atomic_add) - native_extensions |= (uint64_t)native_ext_t::fp32_atomic_add; - if ((fltAtom.fp32Flags & atomic_min_max) == atomic_min_max) - native_extensions |= (uint64_t)native_ext_t::fp32_atomic_min_max; - - if ((fltAtom.fp64Flags & atomic_load_store) == atomic_load_store) - native_extensions |= (uint64_t)native_ext_t::fp64_atomic_load_store; - if ((fltAtom.fp64Flags & atomic_add) == atomic_add) - native_extensions |= (uint64_t)native_ext_t::fp64_atomic_add; - if ((fltAtom.fp64Flags & atomic_min_max) == atomic_min_max) - native_extensions |= (uint64_t)native_ext_t::fp64_atomic_min_max; - - return status::success; -} - -status_t get_l0_device_eu_count(ze_device_handle_t device, int &eu_count) { - auto eucnt = ze_eu_count_ext_t(); - auto deviceProps = ze_device_properties_t(); - deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - deviceProps.pNext = &eucnt; - - CHECK(l0::zeDeviceGetProperties(device, &deviceProps)); - eu_count = eucnt.numTotalEUs; - return status::success; -} - -status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device, - ze_context_handle_t context, uint32_t &ip_version, - compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product_, - uint64_t &native_extensions, bool &mayiuse_systolic, - bool &mayiuse_ngen_kernels) { - using namespace ngen; - ngen::Product product = LevelZeroCodeGenerator::detectHWInfo( - context, device); - - gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family)); - std::memcpy(&product_, &product, sizeof(ngen::Product)); - - mayiuse_systolic = false; - if (get_l0_device_enabled_systolic_intel(device, mayiuse_systolic) - != status::success) - mayiuse_systolic = false; - - /* Some old drivers do not report systolic availability. Manually override - systolic availability based on product family. */ - switch (product.family) { - case ProductFamily::DG2: - case ProductFamily::ARL: - case ProductFamily::PVC: mayiuse_systolic = true; - default: break; - } - - CHECK(get_l0_device_enabled_native_float_atomics( - device, native_extensions)); - - auto status - = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine); - if (status != status::success) mayiuse_ngen_kernels = false; - - ip_version = 0; - return get_device_ip(device, ip_version); -} - -} // namespace sycl -} // namespace intel -} // namespace gpu -} // namespace impl -} // namespace dnnl diff --git a/src/gpu/intel/sycl/l0/utils.hpp b/src/gpu/intel/sycl/l0/utils.hpp deleted file mode 100644 index e65932fdb1e..00000000000 --- a/src/gpu/intel/sycl/l0/utils.hpp +++ /dev/null @@ -1,60 +0,0 @@ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef GPU_INTEL_SYCL_L0_UTILS_HPP -#define GPU_INTEL_SYCL_L0_UTILS_HPP - -#include -#include -#include - -#include "gpu/intel/compute/kernel.hpp" -#include "gpu/intel/sycl/compat.hpp" - -namespace dnnl { -namespace impl { -namespace gpu { -namespace intel { -namespace sycl { - -class engine_t; - -xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev); - -status_t sycl_create_kernels_with_level_zero( - std::vector> &sycl_kernels, - const std::vector &kernel_names, - const gpu::intel::sycl::engine_t *sycl_engine, - const xpu::binary_t &binary); - -status_t get_l0_kernel_binary( - const ::sycl::kernel &kernel, xpu::binary_t &binary); - -bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); - -status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device, - ze_context_handle_t context, uint32_t &ip_version, - compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product, - uint64_t &native_extensions, bool &mayiuse_systolic, - bool &mayiuse_ngen_kernels); - -} // namespace sycl -} // namespace intel -} // namespace gpu -} // namespace impl -} // namespace dnnl - -#endif // GPU_INTEL_SYCL_L0_UTILS_HPP diff --git a/src/gpu/intel/sycl/stream.cpp b/src/gpu/intel/sycl/stream.cpp index 9312e35c7d6..6b1ce0c5a61 100644 --- a/src/gpu/intel/sycl/stream.cpp +++ b/src/gpu/intel/sycl/stream.cpp @@ -24,7 +24,7 @@ #include "gpu/intel/sycl/stream.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/intel/sycl/stream.hpp b/src/gpu/intel/sycl/stream.hpp index 6331884c1fc..c9c1a8b196a 100644 --- a/src/gpu/intel/sycl/stream.hpp +++ b/src/gpu/intel/sycl/stream.hpp @@ -35,6 +35,8 @@ #include "gpu/intel/engine.hpp" #include "gpu/intel/stream.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" + namespace dnnl { namespace impl { namespace gpu { diff --git a/src/gpu/intel/sycl/utils.cpp b/src/gpu/intel/sycl/utils.cpp index 2bbf1e424ef..c16b0056098 100644 --- a/src/gpu/intel/sycl/utils.cpp +++ b/src/gpu/intel/sycl/utils.cpp @@ -17,9 +17,9 @@ #include "gpu/intel/sycl/utils.hpp" #include "gpu/intel/compute/ukernels.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/l0/utils/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #include "gpu/intel/sycl/engine.hpp" -#include "gpu/intel/sycl/l0/utils.hpp" #include "xpu/ocl/engine_factory.hpp" #include "xpu/ocl/utils.hpp" #include "xpu/sycl/compat.hpp" @@ -32,6 +32,53 @@ namespace gpu { namespace intel { namespace sycl { +// FIXME: Currently SYCL doesn't provide any API to get device UUID so +// we query it directly from Level0 with the zeDeviceGetProperties function. +// The `get_device_uuid` function packs 128 bits of the device UUID, which are +// represented as an uint8_t array of size 16, to 2 uint64_t values. 
+xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) {
+    return gpu::intel::l0::get_device_uuid(
+            xpu::sycl::compat::get_native<ze_device_handle_t>(dev));
+}
+
+bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) {
+    auto lhs_ze_handle = xpu::sycl::compat::get_native<ze_device_handle_t>(lhs);
+    auto rhs_ze_handle = xpu::sycl::compat::get_native<ze_device_handle_t>(rhs);
+
+    return lhs_ze_handle == rhs_ze_handle;
+}
+
+status_t sycl_create_kernels_with_level_zero(
+        std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
+        const std::vector<const char *> &kernel_names,
+        const gpu::intel::sycl::engine_t *sycl_engine,
+        const xpu::binary_t &binary) {
+    auto ze_device = xpu::sycl::compat::get_native<ze_device_handle_t>(
+            sycl_engine->device());
+    auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(
+            sycl_engine->context());
+    ze_module_handle_t ze_module = nullptr;
+    std::vector<ze_kernel_handle_t> ze_kernels;
+
+    gpu::intel::l0::create_kernels(
+            ze_device, ze_ctx, kernel_names, binary, &ze_module, ze_kernels);
+
+    ::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle
+            = ::sycl::make_kernel_bundle<::sycl::backend::ext_oneapi_level_zero,
+                    ::sycl::bundle_state::executable>(
+                    {ze_module}, sycl_engine->context());
+
+    sycl_kernels.resize(kernel_names.size());
+    for (size_t i = 0; i < kernel_names.size(); i++) {
+        if (kernel_names[i] == nullptr) continue;
+        auto k = ::sycl::make_kernel<::sycl::backend::ext_oneapi_level_zero>(
+                {kernel_bundle, ze_kernels[i]}, sycl_engine->context());
+        sycl_kernels[i] = utils::make_unique<::sycl::kernel>(k);
+    }
+
+    return status::success;
+}
+
 ::sycl::nd_range<3> to_sycl_nd_range(
         const gpu::intel::compute::nd_range_t &range) {
     const auto &local_range = range.local_range();
@@ -150,7 +197,6 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) {
     }
 
     *ocl_dev = d;
-
     return status::success;
 }
@@ -204,6 +250,31 @@ status_t create_ocl_engine(
     const auto &sycl_ctx = engine->context();
     return create_ocl_engine(ocl_engine, engine->device(), &sycl_ctx);
 }
+
+static status_t get_l0_kernel_binary(
+        const ::sycl::kernel &kernel, xpu::binary_t &binary) {
+    auto bundle = kernel.get_kernel_bundle();
+    auto module_vec
+            = ::sycl::get_native<::sycl::backend::ext_oneapi_level_zero>(
+                    bundle);
+    auto l0_module = module_vec[0];
+    CHECK(l0::get_module_binary(l0_module, binary));
+
+    std::unique_ptr<gpu::intel::ocl::engine_t, engine_deleter_t> ocl_engine;
+    const auto &devs = kernel.get_context().get_devices();
+    CHECK(create_ocl_engine(&ocl_engine, devs[0]));
+    xpu::ocl::wrapper_t<cl_program> ocl_program;
+    CHECK(xpu::ocl::create_program(
+            ocl_program, ocl_engine->device(), ocl_engine->context(), binary));
+
+    cl_int err;
+    auto name = kernel.get_info<::sycl::info::kernel::function_name>();
+    auto ocl_kernel = xpu::ocl::make_wrapper(
+            xpu::ocl::clCreateKernel(ocl_program, name.c_str(), &err));
+    OCL_CHECK(err);
+    CHECK(gpu::intel::ocl::get_ocl_kernel_binary(ocl_kernel, binary));
+    return status::success;
+}
 #endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
 
 status_t get_kernel_binary(
@@ -212,37 +283,25 @@
     assert(!devs.empty());
     switch (xpu::sycl::get_backend(devs[0])) {
         case xpu::sycl::backend_t::level0: {
+#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
+            auto l0_kernel = ::sycl::get_native<
+                    ::sycl::backend::ext_oneapi_level_zero>(kernel);
+            CHECK(l0::get_kernel_binary(l0_kernel, binary));
+#else
             CHECK(get_l0_kernel_binary(kernel, binary));
-#ifndef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
-            {
-                std::unique_ptr<gpu::intel::ocl::engine_t, engine_deleter_t>
-                        ocl_engine;
-                CHECK(create_ocl_engine(&ocl_engine, devs[0]));
-                xpu::ocl::wrapper_t<cl_program> ocl_program;
-                CHECK(xpu::ocl::create_program(ocl_program,
-
ocl_engine->device(), ocl_engine->context(), binary)); - - cl_int err; - auto name = kernel.get_info< - ::sycl::info::kernel::function_name>(); - auto ocl_kernel - = xpu::ocl::make_wrapper(xpu::ocl::clCreateKernel( - ocl_program, name.c_str(), &err)); - OCL_CHECK(err); - CHECK(gpu::intel::ocl::get_ocl_kernel_binary( - ocl_kernel, binary)); - } -#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER - return status::success; +#endif + break; } case xpu::sycl::backend_t::opencl: { auto ocl_kernel = ::sycl::get_native<::sycl::backend::opencl>(kernel); CHECK(gpu::intel::ocl::get_ocl_kernel_binary(ocl_kernel, binary)); - return status::success; + break; } default: return status::runtime_error; } + + return status::success; } gpu_utils::device_id_t device_id(const ::sycl::device &dev) { diff --git a/src/gpu/intel/sycl/utils.hpp b/src/gpu/intel/sycl/utils.hpp index 84c6d00bbe2..919fd13677c 100644 --- a/src/gpu/intel/sycl/utils.hpp +++ b/src/gpu/intel/sycl/utils.hpp @@ -29,6 +29,16 @@ namespace sycl { class engine_t; +xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev); + +status_t sycl_create_kernels_with_level_zero( + std::vector> &sycl_kernels, + const std::vector &kernel_names, + const gpu::intel::sycl::engine_t *sycl_engine, + const xpu::binary_t &binary); + +bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); + ::sycl::nd_range<3> to_sycl_nd_range( const gpu::intel::compute::nd_range_t &range); diff --git a/src/graph/backend/dnnl/scratchpad.hpp b/src/graph/backend/dnnl/scratchpad.hpp index b797db0b0a9..0b3cbe25c0c 100644 --- a/src/graph/backend/dnnl/scratchpad.hpp +++ b/src/graph/backend/dnnl/scratchpad.hpp @@ -93,7 +93,6 @@ class temporary_scratchpad_t : public scratchpad_t { #ifdef DNNL_WITH_SYCL void set_deps(::sycl::event event) { e_ = std::move(event); } #endif - #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL void set_deps(cl_event event) { ocl_e_ = event; } #endif @@ -106,7 +105,6 @@ class temporary_scratchpad_t : public scratchpad_t { #ifdef DNNL_WITH_SYCL ::sycl::event e_; #endif - #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL cl_event ocl_e_; #endif diff --git a/src/graph/utils/ocl_check.hpp b/src/graph/utils/ocl_check.hpp index 7c146c34df0..92d41793412 100644 --- a/src/graph/utils/ocl_check.hpp +++ b/src/graph/utils/ocl_check.hpp @@ -31,6 +31,6 @@ #endif #endif -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" #endif diff --git a/src/xpu/ocl/buffer_memory_storage.hpp b/src/xpu/ocl/buffer_memory_storage.hpp index 42b4f1732d2..e4e6cd0c528 100644 --- a/src/xpu/ocl/buffer_memory_storage.hpp +++ b/src/xpu/ocl/buffer_memory_storage.hpp @@ -24,7 +24,7 @@ #include "xpu/ocl/memory_storage_base.hpp" -#include "gpu/intel/ocl/utils.hpp" +#include "gpu/intel/ocl/utils/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/xpu/ocl/utils.cpp b/src/xpu/ocl/utils.cpp index f23e112e910..312c60cf9f9 100644 --- a/src/xpu/ocl/utils.cpp +++ b/src/xpu/ocl/utils.cpp @@ -14,14 +14,8 @@ * limitations under the License. *******************************************************************************/ -// Include for: -// - CL_PLATFORM_NOT_FOUND_KHR -// - CL_UUID_SIZE_KHR -// - CL_DEVICE_UUID_KHR -#include - -#include "xpu/ocl/engine_impl.hpp" #include "xpu/ocl/utils.hpp" +#include "xpu/ocl/engine_impl.hpp" // XXX: Include this header for VERROR_ENGINE. // TODO: Move VERROR_ENGINE and other similar macros to a separate file. 
@@ -32,133 +26,6 @@ namespace impl { namespace xpu { namespace ocl { -status_t convert_to_dnnl(cl_int cl_status) { - switch (cl_status) { - case CL_SUCCESS: return status::success; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - case CL_OUT_OF_RESOURCES: - case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory; - case CL_DEVICE_NOT_FOUND: - case CL_DEVICE_NOT_AVAILABLE: - case CL_COMPILER_NOT_AVAILABLE: - case CL_PROFILING_INFO_NOT_AVAILABLE: - case CL_MEM_COPY_OVERLAP: - case CL_IMAGE_FORMAT_MISMATCH: - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - case CL_BUILD_PROGRAM_FAILURE: - case CL_MAP_FAILURE: - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - case CL_COMPILE_PROGRAM_FAILURE: - case CL_LINKER_NOT_AVAILABLE: - case CL_LINK_PROGRAM_FAILURE: - case CL_DEVICE_PARTITION_FAILED: - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - case CL_INVALID_PLATFORM: - case CL_INVALID_DEVICE: - case CL_SYMBOL_NOT_FOUND: return status::runtime_error; - case CL_INVALID_VALUE: - case CL_INVALID_DEVICE_TYPE: - case CL_INVALID_CONTEXT: - case CL_INVALID_QUEUE_PROPERTIES: - case CL_INVALID_COMMAND_QUEUE: - case CL_INVALID_HOST_PTR: - case CL_INVALID_MEM_OBJECT: - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - case CL_INVALID_IMAGE_SIZE: - case CL_INVALID_SAMPLER: - case CL_INVALID_BINARY: - case CL_INVALID_BUILD_OPTIONS: - case CL_INVALID_PROGRAM: - case CL_INVALID_PROGRAM_EXECUTABLE: - case CL_INVALID_KERNEL_NAME: - case CL_INVALID_KERNEL_DEFINITION: - case CL_INVALID_KERNEL: - case CL_INVALID_ARG_INDEX: - case CL_INVALID_ARG_VALUE: - case CL_INVALID_ARG_SIZE: - case CL_INVALID_KERNEL_ARGS: - case CL_INVALID_WORK_DIMENSION: - case CL_INVALID_WORK_GROUP_SIZE: - case CL_INVALID_WORK_ITEM_SIZE: - case CL_INVALID_GLOBAL_OFFSET: - case CL_INVALID_EVENT_WAIT_LIST: - case CL_INVALID_EVENT: - case CL_INVALID_OPERATION: - case CL_INVALID_GL_OBJECT: - case CL_INVALID_BUFFER_SIZE: - case CL_INVALID_MIP_LEVEL: - case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments; - - default: return status::runtime_error; - } -} - -// Ordered by value as defined by opencl -const char *convert_cl_int_to_str(cl_int cl_status) { -#define CL_STATUS_CASE(status) \ - case status: return #status - switch (cl_status) { - CL_STATUS_CASE(CL_SUCCESS); - CL_STATUS_CASE(CL_DEVICE_NOT_FOUND); - CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE); - CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE); - CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE); - CL_STATUS_CASE(CL_OUT_OF_RESOURCES); - CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY); - CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE); - CL_STATUS_CASE(CL_MEM_COPY_OVERLAP); - CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH); - CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED); - CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE); - CL_STATUS_CASE(CL_MAP_FAILURE); - CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET); - CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); - CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE); - CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE); - CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE); - CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED); - CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE); - CL_STATUS_CASE(CL_INVALID_VALUE); - CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE); - CL_STATUS_CASE(CL_INVALID_PLATFORM); - CL_STATUS_CASE(CL_INVALID_DEVICE); - CL_STATUS_CASE(CL_INVALID_CONTEXT); - CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES); - CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE); - CL_STATUS_CASE(CL_INVALID_HOST_PTR); - CL_STATUS_CASE(CL_INVALID_MEM_OBJECT); - 
CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); - CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE); - CL_STATUS_CASE(CL_INVALID_SAMPLER); - CL_STATUS_CASE(CL_INVALID_BINARY); - CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS); - CL_STATUS_CASE(CL_INVALID_PROGRAM); - CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE); - CL_STATUS_CASE(CL_INVALID_KERNEL_NAME); - CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION); - CL_STATUS_CASE(CL_INVALID_KERNEL); - CL_STATUS_CASE(CL_INVALID_ARG_INDEX); - CL_STATUS_CASE(CL_INVALID_ARG_VALUE); - CL_STATUS_CASE(CL_INVALID_ARG_SIZE); - CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS); - CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION); - CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE); - CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE); - CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET); - CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST); - CL_STATUS_CASE(CL_INVALID_EVENT); - CL_STATUS_CASE(CL_INVALID_OPERATION); - CL_STATUS_CASE(CL_INVALID_GL_OBJECT); - CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE); - CL_STATUS_CASE(CL_INVALID_MIP_LEVEL); - CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE); - CL_STATUS_CASE(CL_SYMBOL_NOT_FOUND); -#undef CL_STATUS_CASE - default: return "unknown macro name"; - } -} - std::string get_kernel_name(cl_kernel kernel) { size_t name_size; cl_int err = xpu::ocl::clGetKernelInfo( @@ -179,75 +46,6 @@ std::string get_kernel_name(cl_kernel kernel) { return name; } -static std::string get_platform_name(cl_platform_id platform) { - size_t name_size; - cl_int err = xpu::ocl::clGetPlatformInfo( - platform, CL_PLATFORM_NAME, 0, nullptr, &name_size); - // Ignore error. - UNUSED_OCL_RESULT(err); - - // Include null terminator explicitly - to safely overwrite it in - // clGetPlatformInfo - std::string name(name_size, 0); - err = xpu::ocl::clGetPlatformInfo( - platform, CL_PLATFORM_NAME, name_size, &name[0], nullptr); - // Ignore error. - UNUSED_OCL_RESULT(err); - - // Remove the null terminator as std::string already includes it - name.resize(name_size - 1); - return name; -} - -static bool is_intel_platform(cl_platform_id platform) { - auto name = get_platform_name(platform); - return name.find("Intel") != std::string::npos; -} - -status_t get_devices(std::vector *devices, - cl_device_type device_type, cl_uint vendor_id /* = 0x8086 */) { - cl_uint num_platforms = 0; - - cl_int err = xpu::ocl::clGetPlatformIDs(0, nullptr, &num_platforms); - // No platforms - a valid scenario - if (err == CL_PLATFORM_NOT_FOUND_KHR) return status::success; - - OCL_CHECK(err); - - std::vector platforms(num_platforms); - OCL_CHECK( - xpu::ocl::clGetPlatformIDs(num_platforms, &platforms[0], nullptr)); - - for (size_t i = 0; i < platforms.size(); ++i) { - if (!is_intel_platform(platforms[i])) continue; - - cl_uint num_devices = 0; - cl_int err = xpu::ocl::clGetDeviceIDs( - platforms[i], device_type, 0, nullptr, &num_devices); - - if (!utils::one_of(err, CL_SUCCESS, CL_DEVICE_NOT_FOUND)) { - return status::runtime_error; - } - - if (num_devices != 0) { - std::vector plat_devices; - plat_devices.resize(num_devices); - OCL_CHECK(xpu::ocl::clGetDeviceIDs(platforms[i], device_type, - num_devices, &plat_devices[0], nullptr)); - - // Use the devices for the requested vendor only. 
- for (size_t j = 0; j < plat_devices.size(); ++j) { - cl_uint v_id; - OCL_CHECK(xpu::ocl::clGetDeviceInfo(plat_devices[j], - CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &v_id, nullptr)); - if (v_id == vendor_id) { devices->push_back(plat_devices[j]); } - } - } - } - // No devices found but still return success - return status::success; -} - status_t get_devices(std::vector *devices, std::vector> *sub_devices, cl_device_type device_type) { @@ -335,31 +133,6 @@ status_t create_program(ocl::wrapper_t &ocl_program, return status::success; } -#ifndef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER -status_t get_device_uuid(xpu::device_uuid_t &uuid, cl_device_id ocl_dev) { - // This function is used only with SYCL that works with OpenCL 3.0 - // that supports `cl_khr_device_uuid` extension. -#if defined(cl_khr_device_uuid) - static_assert( - CL_UUID_SIZE_KHR == 16, "CL_UUID_SIZE_KHR is expected to be 16"); - - cl_uchar ocl_dev_uuid[CL_UUID_SIZE_KHR] = {}; - OCL_CHECK(xpu::ocl::clGetDeviceInfo(ocl_dev, CL_DEVICE_UUID_KHR, - CL_UUID_SIZE_KHR, ocl_dev_uuid, nullptr)); - - uint64_t uuid_packed[CL_UUID_SIZE_KHR / sizeof(uint64_t)] = {}; - for (size_t i = 0; i < CL_UUID_SIZE_KHR; ++i) { - size_t shift = i % sizeof(uint64_t) * CHAR_BIT; - uuid_packed[i / sizeof(uint64_t)] - |= (((uint64_t)ocl_dev_uuid[i]) << shift); - } - uuid = xpu::device_uuid_t(uuid_packed[0], uuid_packed[1]); - return status::success; -#endif - return status::runtime_error; -} -#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER - status_t check_device( engine_kind_t eng_kind, cl_device_id dev, cl_context ctx) { assert(dev && ctx); diff --git a/src/xpu/ocl/utils.hpp b/src/xpu/ocl/utils.hpp index 540d5224ad8..ececb6fcd3d 100644 --- a/src/xpu/ocl/utils.hpp +++ b/src/xpu/ocl/utils.hpp @@ -18,6 +18,11 @@ #define XPU_OCL_UTILS_HPP #include +// Include for: +// - CL_PLATFORM_NOT_FOUND_KHR +// - CL_UUID_SIZE_KHR +// - CL_DEVICE_UUID_KHR +#include #include "oneapi/dnnl/dnnl_config.h" @@ -38,8 +43,134 @@ namespace impl { namespace xpu { namespace ocl { -status_t convert_to_dnnl(cl_int cl_status); -const char *convert_cl_int_to_str(cl_int cl_status); +enum { CL_SYMBOL_NOT_FOUND = -128 }; + +inline status_t convert_to_dnnl(cl_int cl_status) { + switch (cl_status) { + case CL_SUCCESS: return status::success; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + case CL_OUT_OF_RESOURCES: + case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory; + case CL_DEVICE_NOT_FOUND: + case CL_DEVICE_NOT_AVAILABLE: + case CL_COMPILER_NOT_AVAILABLE: + case CL_PROFILING_INFO_NOT_AVAILABLE: + case CL_MEM_COPY_OVERLAP: + case CL_IMAGE_FORMAT_MISMATCH: + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + case CL_BUILD_PROGRAM_FAILURE: + case CL_MAP_FAILURE: + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + case CL_COMPILE_PROGRAM_FAILURE: + case CL_LINKER_NOT_AVAILABLE: + case CL_LINK_PROGRAM_FAILURE: + case CL_DEVICE_PARTITION_FAILED: + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + case CL_INVALID_PLATFORM: + case CL_INVALID_DEVICE: + case CL_SYMBOL_NOT_FOUND: return status::runtime_error; + case CL_INVALID_VALUE: + case CL_INVALID_DEVICE_TYPE: + case CL_INVALID_CONTEXT: + case CL_INVALID_QUEUE_PROPERTIES: + case CL_INVALID_COMMAND_QUEUE: + case CL_INVALID_HOST_PTR: + case CL_INVALID_MEM_OBJECT: + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + case CL_INVALID_IMAGE_SIZE: + case CL_INVALID_SAMPLER: + case CL_INVALID_BINARY: + case CL_INVALID_BUILD_OPTIONS: + case CL_INVALID_PROGRAM: + case CL_INVALID_PROGRAM_EXECUTABLE: + case CL_INVALID_KERNEL_NAME: + case 
CL_INVALID_KERNEL_DEFINITION: + case CL_INVALID_KERNEL: + case CL_INVALID_ARG_INDEX: + case CL_INVALID_ARG_VALUE: + case CL_INVALID_ARG_SIZE: + case CL_INVALID_KERNEL_ARGS: + case CL_INVALID_WORK_DIMENSION: + case CL_INVALID_WORK_GROUP_SIZE: + case CL_INVALID_WORK_ITEM_SIZE: + case CL_INVALID_GLOBAL_OFFSET: + case CL_INVALID_EVENT_WAIT_LIST: + case CL_INVALID_EVENT: + case CL_INVALID_OPERATION: + case CL_INVALID_GL_OBJECT: + case CL_INVALID_BUFFER_SIZE: + case CL_INVALID_MIP_LEVEL: + case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments; + + default: return status::runtime_error; + } +} + +// Ordered by value as defined by opencl +inline const char *convert_cl_int_to_str(cl_int cl_status) { +#define CL_STATUS_CASE(status) \ + case status: return #status + switch (cl_status) { + CL_STATUS_CASE(CL_SUCCESS); + CL_STATUS_CASE(CL_DEVICE_NOT_FOUND); + CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE); + CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE); + CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE); + CL_STATUS_CASE(CL_OUT_OF_RESOURCES); + CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY); + CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE); + CL_STATUS_CASE(CL_MEM_COPY_OVERLAP); + CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH); + CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED); + CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE); + CL_STATUS_CASE(CL_MAP_FAILURE); + CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET); + CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE); + CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE); + CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE); + CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED); + CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE); + CL_STATUS_CASE(CL_INVALID_VALUE); + CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE); + CL_STATUS_CASE(CL_INVALID_PLATFORM); + CL_STATUS_CASE(CL_INVALID_DEVICE); + CL_STATUS_CASE(CL_INVALID_CONTEXT); + CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES); + CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE); + CL_STATUS_CASE(CL_INVALID_HOST_PTR); + CL_STATUS_CASE(CL_INVALID_MEM_OBJECT); + CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); + CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE); + CL_STATUS_CASE(CL_INVALID_SAMPLER); + CL_STATUS_CASE(CL_INVALID_BINARY); + CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS); + CL_STATUS_CASE(CL_INVALID_PROGRAM); + CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE); + CL_STATUS_CASE(CL_INVALID_KERNEL_NAME); + CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION); + CL_STATUS_CASE(CL_INVALID_KERNEL); + CL_STATUS_CASE(CL_INVALID_ARG_INDEX); + CL_STATUS_CASE(CL_INVALID_ARG_VALUE); + CL_STATUS_CASE(CL_INVALID_ARG_SIZE); + CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS); + CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION); + CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE); + CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE); + CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET); + CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST); + CL_STATUS_CASE(CL_INVALID_EVENT); + CL_STATUS_CASE(CL_INVALID_OPERATION); + CL_STATUS_CASE(CL_INVALID_GL_OBJECT); + CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE); + CL_STATUS_CASE(CL_INVALID_MIP_LEVEL); + CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE); + CL_STATUS_CASE(CL_SYMBOL_NOT_FOUND); +#undef CL_STATUS_CASE + default: return "unknown macro name"; + } +} #define MAYBE_REPORT_ERROR(msg) \ do { \ @@ -91,8 +222,6 @@ F find_ocl_symbol(const char *symbol) { } #undef OCL_LIB_NAME -enum { CL_SYMBOL_NOT_FOUND = -128 }; - // In case the OCL symbol is not found: // - if the return value of OCL function is cl_int, return CL_SYMBOL_NOT_FOUND // - if the return value of OCL function is a 
pointer, return nullptr @@ -394,8 +523,74 @@ struct ext_func_t { std::string get_kernel_name(cl_kernel kernel); -status_t get_devices(std::vector *devices, - cl_device_type device_type, cl_uint vendor_id = 0x8086); +static inline std::string get_platform_name(cl_platform_id platform) { + size_t name_size; + cl_int err = xpu::ocl::clGetPlatformInfo( + platform, CL_PLATFORM_NAME, 0, nullptr, &name_size); + // Ignore error. + UNUSED_OCL_RESULT(err); + + // Include null terminator explicitly - to safely overwrite it in + // clGetPlatformInfo + std::string name(name_size, 0); + err = xpu::ocl::clGetPlatformInfo( + platform, CL_PLATFORM_NAME, name_size, &name[0], nullptr); + // Ignore error. + UNUSED_OCL_RESULT(err); + + // Remove the null terminator as std::string already includes it + name.resize(name_size - 1); + return name; +} + +static inline bool is_intel_platform(cl_platform_id platform) { + auto name = get_platform_name(platform); + return name.find("Intel") != std::string::npos; +} + +inline status_t get_devices(std::vector *devices, + cl_device_type device_type, cl_uint vendor_id = 0x8086) { + cl_uint num_platforms = 0; + + cl_int err = xpu::ocl::clGetPlatformIDs(0, nullptr, &num_platforms); + // No platforms - a valid scenario + if (err == CL_PLATFORM_NOT_FOUND_KHR) return status::success; + + OCL_CHECK(err); + + std::vector platforms(num_platforms); + OCL_CHECK( + xpu::ocl::clGetPlatformIDs(num_platforms, &platforms[0], nullptr)); + + for (size_t i = 0; i < platforms.size(); ++i) { + if (!is_intel_platform(platforms[i])) continue; + + cl_uint num_devices = 0; + cl_int err = xpu::ocl::clGetDeviceIDs( + platforms[i], device_type, 0, nullptr, &num_devices); + + if (!utils::one_of(err, CL_SUCCESS, CL_DEVICE_NOT_FOUND)) { + return status::runtime_error; + } + + if (num_devices != 0) { + std::vector plat_devices; + plat_devices.resize(num_devices); + OCL_CHECK(xpu::ocl::clGetDeviceIDs(platforms[i], device_type, + num_devices, &plat_devices[0], nullptr)); + + // Use the devices for the requested vendor only. + for (size_t j = 0; j < plat_devices.size(); ++j) { + cl_uint v_id; + OCL_CHECK(xpu::ocl::clGetDeviceInfo(plat_devices[j], + CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &v_id, nullptr)); + if (v_id == vendor_id) { devices->push_back(plat_devices[j]); } + } + } + } + // No devices found but still return success + return status::success; +} status_t get_devices(std::vector *devices, std::vector> *sub_devices, @@ -409,9 +604,29 @@ cl_platform_id get_platform(engine_t *engine); status_t create_program(ocl::wrapper_t &ocl_program, cl_device_id dev, cl_context ctx, const xpu::binary_t &binary); -#ifndef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER -status_t get_device_uuid(xpu::device_uuid_t &uuid, cl_device_id ocl_dev); -#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER +inline status_t get_device_uuid( + xpu::device_uuid_t &uuid, cl_device_id ocl_dev) { + // This function is used only with SYCL that works with OpenCL 3.0 + // that supports `cl_khr_device_uuid` extension. 
+#if defined(cl_khr_device_uuid)
+    static_assert(
+            CL_UUID_SIZE_KHR == 16, "CL_UUID_SIZE_KHR is expected to be 16");
+
+    cl_uchar ocl_dev_uuid[CL_UUID_SIZE_KHR] = {};
+    OCL_CHECK(xpu::ocl::clGetDeviceInfo(ocl_dev, CL_DEVICE_UUID_KHR,
+            CL_UUID_SIZE_KHR, ocl_dev_uuid, nullptr));
+
+    uint64_t uuid_packed[CL_UUID_SIZE_KHR / sizeof(uint64_t)] = {};
+    for (size_t i = 0; i < CL_UUID_SIZE_KHR; ++i) {
+        size_t shift = i % sizeof(uint64_t) * CHAR_BIT;
+        uuid_packed[i / sizeof(uint64_t)]
+                |= (((uint64_t)ocl_dev_uuid[i]) << shift);
+    }
+    uuid = xpu::device_uuid_t(uuid_packed[0], uuid_packed[1]);
+    return status::success;
+#endif
+    return status::runtime_error;
+}
 
 // Check for three conditions:
 // 1. Device and context are compatible, i.e. the device belongs to
@@ -430,9 +645,25 @@ cl_mem clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags,
         size_t size, void *host_ptr, cl_int *errcode_ret);
 #endif
 
+inline status_t get_extensions(cl_device_id dev, std::string &ext) {
+    cl_int err = CL_SUCCESS;
+
+    size_t param_size = 0;
+    err = xpu::ocl::clGetDeviceInfo(
+            dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &param_size);
+    OCL_CHECK(err);
+
+    ext.resize(param_size, '\0');
+    err = xpu::ocl::clGetDeviceInfo(
+            dev, CL_DEVICE_EXTENSIONS, param_size, &ext[0], &param_size);
+    OCL_CHECK(err);
+
+    return status::success;
+}
+
 } // namespace ocl
 } // namespace xpu
 } // namespace impl
 } // namespace dnnl
 
-#endif
+#endif // XPU_OCL_UTILS_HPP
diff --git a/src/xpu/sycl/utils.cpp b/src/xpu/sycl/utils.cpp
index 22d27e90691..fa4782d137f 100644
--- a/src/xpu/sycl/utils.cpp
+++ b/src/xpu/sycl/utils.cpp
@@ -24,7 +24,7 @@
 #include "common/engine.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
-#include "gpu/intel/sycl/l0/utils.hpp"
+#include "gpu/intel/sycl/utils.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
diff --git a/third_party/ngen/ngen_level_zero.hpp b/third_party/ngen/ngen_level_zero.hpp
index 8ebb4a751c7..7f5f2efb8e6 100644
--- a/third_party/ngen/ngen_level_zero.hpp
+++ b/third_party/ngen/ngen_level_zero.hpp
@@ -92,12 +92,11 @@ class LevelZeroCodeGenerator : public ELFCodeGenerator<hw>
     explicit LevelZeroCodeGenerator(DebugConfig debugConfig) : LevelZeroCodeGenerator({genericProductFamily(hw), 0}, debugConfig) {}
     LevelZeroCodeGenerator(LevelZeroCodeGenerator&&) = default;
 
-    inline ze_module_handle_t getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options = "");
+    inline std::pair<ze_module_handle_t, ze_kernel_handle_t> getKernel(ze_context_handle_t context, ze_device_handle_t device, const std::string &options = "");
+    bool binaryIsZebin() { return true; }
+
     static inline HW detectHW(ze_context_handle_t context, ze_device_handle_t device);
     static inline Product detectHWInfo(ze_context_handle_t context, ze_device_handle_t device);
-
-    static bool binaryIsZebin() { return true; }
-
 };
 
 #define NGEN_FORWARD_LEVEL_ZERO(hw) NGEN_FORWARD_ELF(hw)
@@ -140,7 +139,7 @@ static inline std::vector<uint8_t> getDummyModuleBinary(ze_context_handle_t cont
 }; /* namespace detail */
 
 template <HW hw>
-ze_module_handle_t LevelZeroCodeGenerator<hw>::getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options)
+std::pair<ze_module_handle_t, ze_kernel_handle_t> LevelZeroCodeGenerator<hw>::getKernel(ze_context_handle_t context, ze_device_handle_t device, const std::string &options)
 {
     using super = ELFCodeGenerator<hw>;
 
@@ -162,7 +161,12 @@ ze_module_handle_t LevelZeroCodeGenerator<hw>::getModule(ze_context_handle_t con
     if (module == nullptr)
         throw level_zero_error{};
 
-    return module;
+    auto kernelName = ELFCodeGenerator<hw>::interface_.getExternalName().c_str();
+    ze_kernel_handle_t
kernelL0; + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernelName}; + detail::handleL0(dynamic::zeKernelCreate(module, &kernelDesc, &kernelL0)); + + return std::make_pair(module, kernelL0); } template