From 38f889e77de6050818e10cd1681149ba87e44f6f Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Mon, 30 Jun 2025 12:55:01 +0000 Subject: [PATCH 01/74] Add L0 support for gpu --- .github/workflows/android_arm64.yml | 1 + .github/workflows/linux_riscv.yml | 1 + cmake/features.cmake | 9 + .../runtime/intel_gpu/remote_properties.hpp | 5 + src/plugins/intel_gpu/CMakeLists.txt | 3 + src/plugins/intel_gpu/cmake/utils.cmake | 15 + .../intel_gpu/plugin/remote_context.hpp | 25 ++ .../include/intel_gpu/runtime/device.hpp | 24 + .../include/intel_gpu/runtime/device_info.hpp | 47 +- .../runtime/engine_configuration.hpp | 8 +- .../include/intel_gpu/runtime/kernel.hpp | 7 +- .../intel_gpu/src/graph/CMakeLists.txt | 1 + .../graph_optimizer/add_required_reorders.cpp | 1 + .../graph/graph_optimizer/handle_reshape.cpp | 1 + .../mark_runtime_skippable_nodes.cpp | 2 +- .../graph_optimizer/post_optimize_weights.cpp | 1 + .../graph_optimizer/prepare_buffer_fusing.cpp | 1 + .../prepare_primitive_fusing.cpp | 1 + .../prepare_primitive_fusing_through.cpp | 1 + .../remove_redundant_reorders.cpp | 1 + .../src/graph/impls/ocl/custom_primitive.cpp | 4 +- .../src/graph/impls/ocl/kernels_cache.cpp | 15 +- .../src/graph/impls/ocl/kernels_cache.hpp | 2 +- .../graph/impls/ocl/multi_stage_primitive.hpp | 7 +- .../src/graph/impls/ocl/primitive_base.hpp | 18 +- .../intel_gpu/src/graph/impls/ocl/reorder.hpp | 2 + .../graph/impls/ocl_v2/primitive_ocl_base.hpp | 4 +- .../src/graph/include/primitive_inst.h | 2 +- src/plugins/intel_gpu/src/graph/program.cpp | 2 +- .../convolution/convolution_kernel_ref.cpp | 2 +- src/plugins/intel_gpu/src/plugin/plugin.cpp | 8 +- .../intel_gpu/src/plugin/remote_context.cpp | 17 +- .../intel_gpu/src/runtime/CMakeLists.txt | 19 +- .../intel_gpu/src/runtime/device_query.cpp | 21 +- src/plugins/intel_gpu/src/runtime/engine.cpp | 6 + src/plugins/intel_gpu/src/runtime/memory.cpp | 2 + .../intel_gpu/src/runtime/ocl/ocl_device.cpp | 38 +- .../src/runtime/ocl/ocl_device_detector.cpp | 23 - .../src/runtime/ocl/ocl_device_detector.hpp | 2 - .../intel_gpu/src/runtime/ocl/ocl_kernel.cpp | 42 ++ .../intel_gpu/src/runtime/ocl/ocl_kernel.hpp | 2 + .../src/runtime/ze/ze_base_event.hpp | 25 ++ .../intel_gpu/src/runtime/ze/ze_common.cpp | 39 ++ .../intel_gpu/src/runtime/ze/ze_common.hpp | 32 ++ .../intel_gpu/src/runtime/ze/ze_device.cpp | 273 ++++++++++++ .../intel_gpu/src/runtime/ze/ze_device.hpp | 43 ++ .../src/runtime/ze/ze_device_detector.cpp | 118 +++++ .../src/runtime/ze/ze_device_detector.hpp | 32 ++ .../intel_gpu/src/runtime/ze/ze_engine.cpp | 266 +++++++++++ .../intel_gpu/src/runtime/ze/ze_engine.hpp | 64 +++ .../src/runtime/ze/ze_engine_factory.hpp | 18 + .../intel_gpu/src/runtime/ze/ze_event.cpp | 216 +++++++++ .../intel_gpu/src/runtime/ze/ze_event.hpp | 94 ++++ .../src/runtime/ze/ze_event_pool.cpp | 78 ++++ .../src/runtime/ze/ze_event_pool.hpp | 46 ++ .../intel_gpu/src/runtime/ze/ze_kernel.hpp | 62 +++ .../intel_gpu/src/runtime/ze/ze_memory.cpp | 236 ++++++++++ .../intel_gpu/src/runtime/ze/ze_memory.hpp | 153 +++++++ .../intel_gpu/src/runtime/ze/ze_stream.cpp | 348 +++++++++++++++ .../intel_gpu/src/runtime/ze/ze_stream.hpp | 69 +++ .../intel_gpu/tests/functional/CMakeLists.txt | 2 + ..._tests.cpp => ocl_remote_tensor_tests.cpp} | 3 + .../ze_remote_tensor_tests.cpp | 20 + .../intel_gpu/tests/unit/CMakeLists.txt | 1 + .../tests/unit/module_tests/device_test.cpp | 6 +- .../tests/unit/test_utils/test_utils.cpp | 2 +- thirdparty/dependencies.cmake | 2 +- thirdparty/level_zero/CMakeLists.txt | 2 +- thirdparty/level_zero/ze_intel_gpu.h | 412 ++++++++++++++++++ thirdparty/level_zero/ze_stypes.h | 43 ++ 70 files changed, 2962 insertions(+), 136 deletions(-) create mode 100644 src/plugins/intel_gpu/cmake/utils.cmake create mode 100644 src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp rename src/plugins/intel_gpu/tests/functional/remote_tensor_tests/{gpu_remote_tensor_tests.cpp => ocl_remote_tensor_tests.cpp} (99%) create mode 100644 src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp create mode 100644 thirdparty/level_zero/ze_intel_gpu.h create mode 100644 thirdparty/level_zero/ze_stypes.h diff --git a/.github/workflows/android_arm64.yml b/.github/workflows/android_arm64.yml index 77711cacce1a09..e62ceca6f7a26b 100644 --- a/.github/workflows/android_arm64.yml +++ b/.github/workflows/android_arm64.yml @@ -128,6 +128,7 @@ jobs: git submodule update --init -- ${OPENVINO_REPO}/thirdparty/json git submodule update --init -- ${OPENVINO_REPO}/thirdparty/gtest git submodule update --init -- ${OPENVINO_REPO}/thirdparty/gflags + git submodule update --init -- ${OPENVINO_REPO}/thirdparty/level_zero popd - name: Clone vcpkg diff --git a/.github/workflows/linux_riscv.yml b/.github/workflows/linux_riscv.yml index 2f57ace4bd7ef2..6b34b834497295 100644 --- a/.github/workflows/linux_riscv.yml +++ b/.github/workflows/linux_riscv.yml @@ -150,6 +150,7 @@ jobs: git submodule update --init -- ${OPENVINO_REPO}/thirdparty/telemetry git submodule update --init -- ${OPENVINO_REPO}/src/plugins/intel_cpu git submodule update --init -- ${OPENVINO_REPO}/thirdparty/flatbuffers/flatbuffers + git submodule update --init -- ${OPENVINO_REPO}/thirdparty/level_zero popd # diff --git a/cmake/features.cmake b/cmake/features.cmake index dc8ebeeb9371ad..0d4cb16fd241dd 100644 --- a/cmake/features.cmake +++ b/cmake/features.cmake @@ -41,6 +41,15 @@ else() set(ENABLE_ONEDNN_FOR_GPU_DEFAULT ON) endif() +set(OV_GPU_DEFAULT_RT "L0") +if (ENABLE_INTEL_GPU) + ov_option_enum (GPU_RT_TYPE "Type of GPU runtime. Supported value: OCL and L0" ${OV_GPU_DEFAULT_RT} ALLOWED_VALUES L0 OCL) + if (GPU_RT_TYPE STREQUAL "L0") + # There's no interop with native L0 in onednn API. Temporary disable onednn when L0 runtime is selected + set(ENABLE_ONEDNN_FOR_GPU_DEFAULT OFF) + endif() +endif() + ov_dependent_option (ENABLE_ONEDNN_FOR_GPU "Enable oneDNN with GPU support" ${ENABLE_ONEDNN_FOR_GPU_DEFAULT} "ENABLE_INTEL_GPU" OFF) ov_dependent_option (ENABLE_INTEL_NPU "NPU plugin for OpenVINO runtime" ON "X86_64;WIN32 OR LINUX" OFF) diff --git a/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp b/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp index 53c8de921a747d..cad287b56059bb 100644 --- a/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp +++ b/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp @@ -24,6 +24,7 @@ using gpu_handle_param = void*; enum class ContextType { OCL = 0, //!< Pure OpenCL context VA_SHARED = 1, //!< Context shared with a video decoding device + ZE = 2, //!< Pure Level0 context }; /** @cond INTERNAL */ @@ -33,6 +34,8 @@ inline std::ostream& operator<<(std::ostream& os, const ContextType& context_typ return os << "OCL"; case ContextType::VA_SHARED: return os << "VA_SHARED"; + case ContextType::ZE: + return os << "ZE"; default: OPENVINO_THROW("Unsupported context type"); } @@ -43,6 +46,8 @@ inline std::istream& operator>>(std::istream& is, ContextType& context_type) { is >> str; if (str == "OCL") { context_type = ContextType::OCL; + } else if (str == "ZE") { + context_type = ContextType::ZE; } else if (str == "VA_SHARED") { context_type = ContextType::VA_SHARED; } else { diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt index 9556ab5873c616..58f5e74330a4d4 100644 --- a/src/plugins/intel_gpu/CMakeLists.txt +++ b/src/plugins/intel_gpu/CMakeLists.txt @@ -8,6 +8,8 @@ endif() set (TARGET_NAME "openvino_intel_gpu_plugin") +include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) + if(OV_COMPILER_IS_INTEL_LLVM) # For windows we need to disable warning as error option to make FindSYCL.cmake work if (WIN32) @@ -77,6 +79,7 @@ target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include/) ov_set_threading_interface_for(${TARGET_NAME}) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake new file mode 100644 index 00000000000000..1dc0edebb5fe2d --- /dev/null +++ b/src/plugins/intel_gpu/cmake/utils.cmake @@ -0,0 +1,15 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +function(ov_gpu_set_runtime_interface_for TARGET_NAME) + if(GPU_RT_TYPE STREQUAL "L0") + target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_ZE_RT=1) + target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero) + elseif(GPU_RT_TYPE STREQUAL "OCL") + target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1) + target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::NewHeaders OpenCL::OpenCL) + else() + message(FATAL_ERROR "Invalid GPU runtime type: `${GPU_RT_TYPE}` Only `L0` and `OCL` are supported") + endif() +endfunction() diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp index 7425f701f88710..3606e95e5d9521 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp @@ -8,6 +8,7 @@ # define NOMINMAX #endif +#include "intel_gpu/runtime/engine_configuration.hpp" #include "openvino/runtime/intel_gpu/remote_properties.hpp" #include "openvino/runtime/iremote_context.hpp" @@ -22,6 +23,26 @@ namespace ov::intel_gpu { +inline std::pair get_device_query_params() { + #ifdef OV_GPU_WITH_ZE_RT + auto runtime_type = cldnn::runtime_types::ze; + #ifdef OV_GPU_WITH_SYCL + auto engine_type = cldnn::engine_types::sycl; + #else + auto engine_type = cldnn::engine_types::ze; + #endif + #else + auto runtime_type = cldnn::runtime_types::ocl; + #ifdef OV_GPU_WITH_SYCL + auto engine_type = cldnn::engine_types::sycl; + #else + auto engine_type = cldnn::engine_types::ocl; + #endif + #endif + + return {engine_type, runtime_type}; +} + class RemoteContextImpl : public ov::IRemoteContext { public: using Ptr = std::shared_ptr; @@ -93,7 +114,11 @@ class RemoteContextImpl : public ov::IRemoteContext { ov::intel_gpu::gpu_handle_param m_va_display = nullptr; ov::intel_gpu::gpu_handle_param m_external_queue = nullptr; +#ifdef OV_GPU_WITH_ZE_RT + ContextType m_type = ContextType::ZE; +#else ContextType m_type = ContextType::OCL; +#endif std::string m_device_name = ""; static const size_t cache_capacity = 100; cldnn::LruCache m_memory_cache = cldnn::LruCache(cache_capacity); diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp index 7c567e877d7552..ef885414c6f1c0 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp @@ -33,4 +33,28 @@ struct device { virtual ~device() = default; }; +// The priority return by this function impacts the order of devices reported by GPU plugin and devices enumeration +// Lower priority value means lower device ID +// Current behavior is: Intel iGPU < Intel dGPU < any other GPU +// Order of Intel dGPUs is undefined and depends on the OCL impl +// Order of other vendor GPUs is undefined and depends on the OCL impl +inline size_t get_device_priority(const cldnn::device_info& info) { + if (info.vendor_id == cldnn::INTEL_VENDOR_ID && info.dev_type == cldnn::device_type::integrated_gpu) { + return 0; + } else if (info.vendor_id == cldnn::INTEL_VENDOR_ID) { + return 1; + } else { + return std::numeric_limits::max(); + } +} + +inline std::vector sort_devices(const std::vector& devices_list) { + std::vector sorted_list = devices_list; + std::stable_sort(sorted_list.begin(), sorted_list.end(), [](device::ptr d1, device::ptr d2) { + return get_device_priority(d1->get_info()) < get_device_priority(d2->get_info()); + }); + + return sorted_list; +} + } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index 319ca366a9dea2..738515a67b9a1b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -46,13 +46,13 @@ struct gfx_version { < std::tie(r.major, r.minor, r.revision); // same order } - bool operator==(const gfx_version& other) { + bool operator==(const gfx_version& other) const { return major == other.major && minor == other.minor && revision == other.revision; } - bool operator!=(const gfx_version& other) { + bool operator!=(const gfx_version& other) const { return !(*this == other); } }; @@ -63,14 +63,14 @@ struct pci_bus_info { uint32_t pci_device = 0; uint32_t pci_function = 0; - bool operator==(const pci_bus_info& other) { + bool operator==(const pci_bus_info& other) const { return pci_domain == other.pci_domain && pci_bus == other.pci_bus && pci_device == other.pci_device && pci_function == other.pci_function; } - bool operator!=(const pci_bus_info& other) { + bool operator!=(const pci_bus_info& other) const { return !(*this == other); } }; @@ -105,6 +105,8 @@ struct device_info { bool supports_imad; ///< Does engine support int8 mad. bool supports_immad; ///< Does engine support int8 multi mad. + bool supports_mutable_command_list; ///< Does the target runtime/device support mutable command list feature + bool supports_usm; ///< Does engine support unified shared memory. bool has_separate_cache; ///< Does the target hardware has separate cache for usm_device and usm_host @@ -129,8 +131,45 @@ struct device_info { pci_bus_info pci_info; ///< PCI bus information for the device + uint64_t timer_resolution; ///< Resolution of device timer used for profiling in cycles/sec + uint32_t kernel_timestamp_valid_bits; ///< Number of valid bits in the kernel timestamp values + uint32_t compute_queue_group_ordinal; ///< Ordinal of the command queue group with compute support + uint32_t device_memory_ordinal; ///< Ordinal of the selected global device memory + ov::device::UUID uuid; ///< UUID of the gpu device ov::device::LUID luid; ///< LUID of the gpu device + + inline bool is_same_device(const device_info &other) const { + // Relying solely on the UUID is not reliable in all the cases (particularly on legacy platforms), + // where the UUID may be missing or incorrectly generated + // Therefore, we also validate other attributes + if (uuid.uuid != other.uuid.uuid) + return false; + + if (pci_info != other.pci_info) + return false; + + if (sub_device_idx != other.sub_device_idx) + return false; + + if (vendor_id != other.vendor_id || + dev_name != other.dev_name || + driver_version != other.driver_version) + return false; + + if (dev_type != other.dev_type || + gfx_ver != other.gfx_ver || + arch != other.arch) + return false; + + if (ip_version != other.ip_version || device_id != other.device_id) + return false; + + if (execution_units_count != other.execution_units_count || max_global_mem_size != other.max_global_mem_size) + return false; + + return true; + } }; /// @} diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp index abe01b0cc5da22..8eea9df0169ab2 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp @@ -4,20 +4,22 @@ #pragma once -#include +#include namespace cldnn { /// @brief Defines available engine types enum class engine_types : int32_t { ocl, - sycl + sycl, + ze }; inline std::ostream& operator<<(std::ostream& os, const engine_types& type) { switch (type) { case engine_types::ocl: os << "ocl"; break; case engine_types::sycl: os << "sycl"; break; + case engine_types::ze: os << "ze"; break; default: os << "unknown"; break; } @@ -27,11 +29,13 @@ inline std::ostream& operator<<(std::ostream& os, const engine_types& type) { /// @brief Defines available runtime types enum class runtime_types : int32_t { ocl, + ze, }; inline std::ostream& operator<<(std::ostream& os, const runtime_types& type) { switch (type) { case runtime_types::ocl: os << "ocl"; break; + case runtime_types::ze: os << "ze"; break; default: os << "unknown"; break; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp index 8dcd9d07d0f246..6c55df8507c812 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp @@ -4,9 +4,6 @@ #pragma once -#include "kernel_args.hpp" -#include "event.hpp" - #include #include @@ -19,7 +16,9 @@ class kernel { using ptr = std::shared_ptr; virtual std::shared_ptr clone(bool reuse_kernel_handle = false) const = 0; virtual ~kernel() = default; - virtual std::string get_id() const { return ""; } + + virtual std::string get_id() const = 0; + virtual std::vector get_binary() const = 0; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/CMakeLists.txt b/src/plugins/intel_gpu/src/graph/CMakeLists.txt index b3ee51e242d81f..4611d731429ce4 100644 --- a/src/plugins/intel_gpu/src/graph/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/graph/CMakeLists.txt @@ -137,6 +137,7 @@ ov_build_target_faster(${TARGET_NAME} PCH) target_compile_options(${TARGET_NAME} PRIVATE ${COMMON_COMPILE_OPTIONS}) ov_set_threading_interface_for(${TARGET_NAME}) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp index e2523ab8c870f8..74e6165a685d60 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp @@ -9,6 +9,7 @@ #include "fully_connected_inst.h" #include "assign_inst.h" #include "mvn_inst.h" +#include "reorder_inst.h" #include #include diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp index 3f07b505388de4..b7abdd649f71bf 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp @@ -5,6 +5,7 @@ #include "pass_manager.h" #include "program_helpers.h" #include "reshape_inst.h" +#include "reorder_inst.h" #include #include diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp index 0eb670dd067767..9935c39f02e902 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp @@ -12,7 +12,7 @@ #include "shape_of_inst.h" #include "broadcast_inst.h" #include "non_zero_inst.h" -#include "non_max_suppression_inst.h" +#include "reorder_inst.h" #include "unique_inst.hpp" #include "scatter_elements_update_inst.h" #include "scatter_update_inst.h" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp index c7296ea35b8940..56239fb0263165 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp @@ -9,6 +9,7 @@ #include "convolution_inst.h" #include "deconvolution_inst.h" #include "fully_connected_inst.h" +#include "reorder_inst.h" #include "lstm_seq_inst.h" #include "gru_seq_inst.h" #include "intel_gpu/runtime/format.hpp" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 124d61db911415..1b53f49c673fa1 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -11,6 +11,7 @@ #include "crop_inst.h" #include "eltwise_inst.h" #include "gemm_inst.h" +#include "reorder_inst.h" #include "assign_inst.h" #include "read_value_inst.h" #include "reshape_inst.h" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 8f0addbd605652..a1dec0ca256e5f 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -12,6 +12,7 @@ #include "activation_inst.h" #include "batch_to_space_inst.h" #include "crop_inst.h" +#include "reorder_inst.h" #include "eltwise_inst.h" #include "gemm_inst.h" #include "lrn_inst.h" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp index 8021237c68179e..4282585885e9d3 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp @@ -10,6 +10,7 @@ #include "data_inst.h" #include "eltwise_inst.h" #include "mutable_data_inst.h" +#include "reorder_inst.h" #include #include diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 5a661a90989186..9dd5fbbc6a1374 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -22,6 +22,7 @@ #include "fully_connected_inst.h" #include "group_normalization_inst.h" #include "mvn_inst.h" +#include "reorder_inst.h" #include #include diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index 4aeadc7a297da3..816efd32e37293 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -62,8 +62,8 @@ struct custom_gpu_primitive_impl : typed_primitive_impl { this->can_share_kernels = kernels_cache.get_kernels_reuse(); } - void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids) override { - _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0])); + void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids, const engine& e) override { + _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0], e)); this->can_share_kernels = kernels_cache.get_kernels_reuse(); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp index 9bd724e03e80a9..2d448e99ce3264 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp @@ -493,11 +493,11 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co } } -kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id) const { +kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id, const engine& e) const { auto res = _cached_kernels.find(id); OPENVINO_ASSERT(_cached_kernels.end() != res, "[GPU] Kernel " + id + " not found in the cached kernel cache!"); - return res->second->clone(_reuse_kernels); + return e.prepare_kernel(res->second->clone(_reuse_kernels)); } std::vector kernels_cache::get_kernels(const kernel_impl_params& params) const { @@ -642,15 +642,12 @@ void kernels_cache::add_kernels_source(const kernel_impl_params& params, } std::string kernels_cache::get_cached_kernel_id(kernel::ptr kernel) const { - auto ocl_kernel = std::static_pointer_cast(kernel); - const auto& entry_point = ocl_kernel->get_handle().getInfo(); - auto program = ocl_kernel->get_handle().getInfo(); - cl::vector program_binaries = getProgramBinaries(std::move(program)); + auto program_binaries = kernel->get_binary(); auto iter = _cached_binaries.find(program_binaries); OPENVINO_ASSERT(iter != _cached_binaries.end(), "[GPU] Not found cached kernel binaries"); - return entry_point + "@" + std::to_string(iter->second); + return kernel->get_id() + "@" + std::to_string(iter->second); } std::vector kernels_cache::get_cached_kernel_ids(const std::vector& kernels) const { @@ -668,9 +665,7 @@ void kernels_cache::add_to_cached_kernels(const std::vector& kernel static std::atomic id_gen{0}; for (auto& kernel : kernels) { - auto ocl_kernel = std::static_pointer_cast(kernel); - auto program = ocl_kernel->get_handle().getInfo(); - cl::vector program_binaries = getProgramBinaries(std::move(program)); + auto program_binaries = kernel->get_binary(); std::lock_guard lock(_mutex); auto iter = _cached_binaries.find(program_binaries); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp index dc77442fedef47..e623c547bf691d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp @@ -121,7 +121,7 @@ class kernels_cache { uint32_t prog_id, std::shared_ptr task_executor = nullptr, const std::map& batch_headers = {}); - kernel::ptr get_kernel_from_cached_kernels(std::string id) const; + kernel::ptr get_kernel_from_cached_kernels(std::string id, const engine& e) const; std::vector get_kernels(const kernel_impl_params& params) const; void set_kernels_reuse(bool reuse_kernels) { _reuse_kernels = reuse_kernels; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp index 71e8571467d322..482184340171c5 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp @@ -126,12 +126,12 @@ struct multi_stage_primitive : public typed_primitive_impl { this->can_share_kernels = kernels_cache.get_kernels_reuse(); } - void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids) override { + void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids, const engine& e) override { _kernels.clear(); _kernels.reserve(cached_kernel_ids.size()); for (size_t k = 0; k < cached_kernel_ids.size(); ++k) { - _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k])); + _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k], e)); } this->can_share_kernels = kernels_cache.get_kernels_reuse(); } @@ -221,11 +221,12 @@ struct multi_stage_primitive : public typed_primitive_impl { void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override { OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed."); auto& kernel_vec = kernels.begin()->second; + auto& engine = kernels.begin()->first.get_program().get_engine(); _kernels.clear(); _kernels.resize(kernel_vec.size()); for (auto& k : kernel_vec) { auto sub_kernel_idx = k.second; - _kernels[sub_kernel_idx] = k.first; + _kernels[sub_kernel_idx] = engine.prepare_kernel(k.first); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index 62300417735674..e19baa6b923f66 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -136,10 +136,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { } void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override { - if (is_cpu()) { - return; - } - _kernels.clear(); if (!_kernel_data.kernels.empty()) { auto compiled_kernels = kernels_cache.get_kernels(params); @@ -153,15 +149,12 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { this->can_share_kernels = kernels_cache.get_kernels_reuse(); } - void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids) override { - if (is_cpu()) { - return; - } + void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids, const engine& e) override { _kernels.clear(); _kernels.reserve(cached_kernel_ids.size()); for (size_t k = 0; k < cached_kernel_ids.size(); ++k) { - _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k])); + _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k], e)); } this->can_share_kernels = kernels_cache.get_kernels_reuse(); } @@ -198,7 +191,7 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { } void set_arguments_impl(typed_primitive_inst& instance) override { - if (instance.can_be_optimized() || is_cpu()) { + if (instance.can_be_optimized()) { return; } @@ -301,15 +294,14 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { } void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override { - if (is_cpu()) - return; OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed."); auto& kernel_vec = kernels.begin()->second; + auto& engine = kernels.begin()->first.get_program().get_engine(); _kernels.clear(); _kernels.resize(kernel_vec.size()); for (auto& k : kernel_vec) { auto sub_kernel_idx = k.second; - _kernels[sub_kernel_idx] = k.first; + _kernels[sub_kernel_idx] = engine.prepare_kernel(k.first); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp index ce5701f2739cf2..08d254b65d7d62 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // + +#include "reorder_inst.h" #include "registry/implementation_manager.hpp" #include "intel_gpu/primitives/reorder.hpp" #include "program_node.h" diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp index 1f49d6ac4545da..3e789b54ab9e7a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp @@ -132,10 +132,10 @@ struct PrimitiveImplOCL : public cldnn::primitive_impl { } } - void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector& cached_kernel_ids) override { + void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector& cached_kernel_ids, const engine& e) override { OPENVINO_ASSERT(cached_kernel_ids.size() == _order.size()); for (size_t i = 0; i < cached_kernel_ids.size(); ++i) { - _stages[_order[i]]->kernel = kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[i]); + _stages[_order[i]]->kernel = kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[i], e); } this->can_share_kernels = kernels_cache.get_kernels_reuse(); } diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index 25e17dd000975c..3da50ef643911b 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -78,7 +78,7 @@ struct primitive_impl { virtual bool is_cpu() const { return true; } virtual bool is_onednn() const { return false; } virtual void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) = 0; - virtual void init_by_cached_kernels(const kernels_cache&, std::vector& cached_kernel_ids) {} + virtual void init_by_cached_kernels(const kernels_cache&, std::vector& cached_kernel_ids, const engine& e) {} virtual std::vector get_cached_kernel_ids(const kernels_cache&) { return {}; } virtual std::unique_ptr clone() const = 0; virtual std::vector> get_kernels_source() { return {}; } diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 34dd4b1b392d5a..6466cc14aaff9a 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -1975,7 +1975,7 @@ void program::load(cldnn::BinaryInputBuffer& ib, std::shared_ptr cached_kernel_ids; ib >> cached_kernel_ids; - p_node.selected_impl->init_by_cached_kernels(get_kernels_cache(), cached_kernel_ids); + p_node.selected_impl->init_by_cached_kernels(get_kernels_cache(), cached_kernel_ids, _engine); } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp index add6f7de92c313..373577e617f4c3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_ref.cpp @@ -109,7 +109,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_Ref::SetDefault(const conv } KernelsPriority ConvolutionKernel_Ref::GetKernelsPriority(const Params& /*params*/) const { - return DONT_USE_IF_HAVE_SOMETHING_ELSE; + return FORCE_PRIORITY_1; } bool ConvolutionKernel_Ref::Validate(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 92f1a0a0d23410..c14e18aaf90e35 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -160,12 +160,8 @@ Plugin::Plugin() { set_device_name("GPU"); register_primitives(); - // Set OCL runtime which should be always available -#ifdef OV_GPU_WITH_SYCL - cldnn::device_query device_query(cldnn::engine_types::sycl, cldnn::runtime_types::ocl); -#else - cldnn::device_query device_query(cldnn::engine_types::ocl, cldnn::runtime_types::ocl); -#endif + auto rt_params = get_device_query_params(); + cldnn::device_query device_query(rt_params.first, rt_params.second); m_device_map = device_query.get_available_devices(); // Set default configs for each device diff --git a/src/plugins/intel_gpu/src/plugin/remote_context.cpp b/src/plugins/intel_gpu/src/plugin/remote_context.cpp index 885ceb9bda4720..ac68496feb2d97 100644 --- a/src/plugins/intel_gpu/src/plugin/remote_context.cpp +++ b/src/plugins/intel_gpu/src/plugin/remote_context.cpp @@ -65,12 +65,11 @@ RemoteContextImpl::RemoteContextImpl(const std::mapget_info().dev_name << ")" << std::endl; -#ifdef OV_GPU_WITH_SYCL - const auto engine_type = cldnn::engine_types::sycl; -#else - const auto engine_type = cldnn::engine_types::ocl; -#endif - const auto runtime_type = cldnn::runtime_types::ocl; + auto rt_params = get_device_query_params(); m_device->initialize(); // Initialize associated device before use - m_engine = cldnn::engine::create(engine_type, runtime_type, m_device); + m_engine = cldnn::engine::create(rt_params.first, rt_params.second, m_device); init_properties(); diff --git a/src/plugins/intel_gpu/src/runtime/CMakeLists.txt b/src/plugins/intel_gpu/src/runtime/CMakeLists.txt index 85dfec05c41195..384ee9c0bef7a5 100644 --- a/src/plugins/intel_gpu/src/runtime/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/runtime/CMakeLists.txt @@ -16,15 +16,21 @@ file(GLOB LIBRARY_SOURCES_OCL "${CMAKE_CURRENT_SOURCE_DIR}/ocl/*.cpp" ) -set(LIBRARY_SOURCES_ALL - ${LIBRARY_SOURCES_MAIN} - ${LIBRARY_SOURCES_OCL} - ) - +file(GLOB LIBRARY_SOURCES_ZE + "${CMAKE_CURRENT_SOURCE_DIR}/ze/*.h" + "${CMAKE_CURRENT_SOURCE_DIR}/ze/*.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/ze/*.cpp" +) file(GLOB_RECURSE SYCL_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/ocl/sycl_*.cpp") if(NOT OV_COMPILER_IS_INTEL_LLVM) - list(REMOVE_ITEM LIBRARY_SOURCES_ALL ${SYCL_SOURCES}) + list(REMOVE_ITEM LIBRARY_SOURCES_OCL ${SYCL_SOURCES}) +endif() + +set(LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_MAIN} ${LIBRARY_SOURCES_OCL}) + +if(GPU_RT_TYPE STREQUAL "L0") + list(APPEND LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_ZE}) endif() add_library(${TARGET_NAME} STATIC ${LIBRARY_SOURCES_ALL}) @@ -60,6 +66,7 @@ if(ENABLE_ONEDNN_FOR_GPU) endif() ov_set_threading_interface_for(${TARGET_NAME}) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL diff --git a/src/plugins/intel_gpu/src/runtime/device_query.cpp b/src/plugins/intel_gpu/src/runtime/device_query.cpp index 6b6ce3787cf73b..8a0ee71944483c 100644 --- a/src/plugins/intel_gpu/src/runtime/device_query.cpp +++ b/src/plugins/intel_gpu/src/runtime/device_query.cpp @@ -4,9 +4,9 @@ #include "intel_gpu/runtime/device_query.hpp" #include "ocl/ocl_device_detector.hpp" +#include "ze/ze_device_detector.hpp" #include -#include namespace cldnn { int device_query::device_id = -1; @@ -17,17 +17,22 @@ device_query::device_query(engine_types engine_type, int ctx_device_id, int target_tile_id, bool initialize_devices) { - switch (engine_type) { - case engine_types::sycl: - case engine_types::ocl: { - if (runtime_type != runtime_types::ocl) - throw std::runtime_error("Unsupported runtime type for ocl engine"); - + switch (runtime_type) { + case runtime_types::ocl: { + OPENVINO_ASSERT(engine_type == engine_types::ocl || engine_type == engine_types::sycl); ocl::ocl_device_detector ocl_detector; _available_devices = ocl_detector.get_available_devices(user_context, user_device, ctx_device_id, target_tile_id, initialize_devices); break; } - default: throw std::runtime_error("Unsupported engine type in device_query"); +#ifdef OV_GPU_WITH_ZE_RT + case runtime_types::ze: { + OPENVINO_ASSERT(engine_type == engine_types::ze); + ze::ze_device_detector ze_detector; + _available_devices = ze_detector.get_available_devices(user_context, user_device, ctx_device_id, target_tile_id, initialize_devices); + break; + } +#endif + default: OPENVINO_THROW("[GPU] Unsupported engine/runtime types in device_query"); } } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp index 3af715d2cccf69..3311a265de1a89 100644 --- a/src/plugins/intel_gpu/src/runtime/engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/engine.cpp @@ -10,6 +10,7 @@ #include "intel_gpu/runtime/debug_configuration.hpp" #include "ocl/ocl_engine_factory.hpp" +#include "ze/ze_engine_factory.hpp" #include #include @@ -257,6 +258,11 @@ std::shared_ptr engine::create(engine_types engine_type, runtime_ case engine_types::ocl: ret = ocl::create_ocl_engine(device, runtime_type); break; +#ifdef OV_GPU_WITH_ZE_RT + case engine_types::ze: + ret = ze::create_ze_engine(device, runtime_type); + break; +#endif default: throw std::runtime_error("Invalid engine type"); } diff --git a/src/plugins/intel_gpu/src/runtime/memory.cpp b/src/plugins/intel_gpu/src/runtime/memory.cpp index f69a3124da7d6d..a720ee7ec4ea31 100644 --- a/src/plugins/intel_gpu/src/runtime/memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/memory.cpp @@ -50,6 +50,8 @@ std::unique_ptr surfaces_lock::create(engine_types engine_type, s case engine_types::sycl: case engine_types::ocl: return std::unique_ptr(new ocl::ocl_surfaces_lock(mem, stream)); + case engine_types::ze: + return nullptr; // TODO: implement once we have support for surface sharing default: throw std::runtime_error("Unsupported engine type in surfaces_lock::create"); } } diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index 11424c180d4122..e17db33c52e77c 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -341,6 +341,13 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex info.num_ccs = std::max(num_queues, info.num_ccs); } + info.supports_mutable_command_list = false; + + // Not supported + info.timer_resolution = 0; + info.kernel_timestamp_valid_bits = 0; + info.compute_queue_group_ordinal = 0; + info.device_memory_ordinal = 0; #ifdef ENABLE_ONEDNN_FOR_GPU using namespace dnnl::impl::gpu::intel::jit; @@ -423,36 +430,7 @@ bool ocl_device::is_same(const device::ptr other) { // Short path if cl_device is the same if (_platform == casted->_platform && _device.get() && casted->_device.get() && _device == casted->_device) return true; - - // Relying solely on the UUID is not reliable in all the cases (particularly on legacy platforms), - // where the UUID may be missing or incorrectly generated - // Therefore, we also validate other attributes - if (_info.uuid.uuid != casted->_info.uuid.uuid) - return false; - - if (_info.pci_info != casted->_info.pci_info) - return false; - - if (_info.sub_device_idx != casted->_info.sub_device_idx) - return false; - - if (_info.vendor_id != casted->_info.vendor_id || - _info.dev_name != casted->_info.dev_name || - _info.driver_version != casted->_info.driver_version) - return false; - - if (_info.dev_type != casted->_info.dev_type || - _info.gfx_ver != casted->_info.gfx_ver || - _info.arch != casted->_info.arch) - return false; - - if (_info.ip_version != casted->_info.ip_version || _info.device_id != casted->_info.device_id) - return false; - - if (_info.execution_units_count != casted->_info.execution_units_count || _info.max_global_mem_size != casted->_info.max_global_mem_size) - return false; - - return true; + return _info.is_same_device(casted->_info); } void ocl_device::set_mem_caps(const memory_capabilities& memory_capabilities) { diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp index 83826e0a5d5e6f..c102beb8092647 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp @@ -69,20 +69,6 @@ bool does_device_match_config(const cl::Device& device) { return true; } -// The priority return by this function impacts the order of devices reported by GPU plugin and devices enumeration -// Lower priority value means lower device ID -// Current behavior is: Intel iGPU < Intel dGPU < any other GPU -// Order of Intel dGPUs is undefined and depends on the OCL impl -// Order of other vendor GPUs is undefined and depends on the OCL impl -size_t get_device_priority(const cldnn::device_info& info) { - if (info.vendor_id == cldnn::INTEL_VENDOR_ID && info.dev_type == cldnn::device_type::integrated_gpu) { - return 0; - } else if (info.vendor_id == cldnn::INTEL_VENDOR_ID) { - return 1; - } else { - return std::numeric_limits::max(); - } -} } // namespace namespace cldnn { @@ -133,15 +119,6 @@ static std::vector getSubDevices(cl::Device& rootDevice) { return subDevices; } -std::vector ocl_device_detector::sort_devices(const std::vector& devices_list) { - std::vector sorted_list = devices_list; - std::stable_sort(sorted_list.begin(), sorted_list.end(), [](device::ptr d1, device::ptr d2) { - return get_device_priority(d1->get_info()) < get_device_priority(d2->get_info()); - }); - - return sorted_list; -} - std::map ocl_device_detector::get_available_devices(void* user_context, void* user_device, int ctx_device_id, diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp index 121fc025098b08..0992d734bf512b 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp @@ -25,8 +25,6 @@ class ocl_device_detector { int target_tile_id = -1, bool initialize_devices = false) const; - static std::vector sort_devices(const std::vector& devices_list); - private: std::vector create_device_list() const; std::vector create_device_list_from_user_context(void* user_context, int ctx_device_id = 0) const; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp new file mode 100644 index 00000000000000..417a0286df7252 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ocl_kernel.hpp" +#include + +namespace cldnn { +namespace ocl { + +std::vector ocl_kernel::get_binary() const { + // Get the corresponding program object for the kernel + cl_program program; + cl_int error = clGetKernelInfo(_compiled_kernel.get(), CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr); + if (error) { + throw std::runtime_error("Failed to retrieve CL_KERNEL_PROGRAM: " + std::to_string(error)); + } + + // Get the size of the program binary in bytes. + size_t binary_size = 0; + error = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, nullptr); + if (error) { + throw std::runtime_error("Failed to retrieve CL_PROGRAM_BINARY_SIZES: " + std::to_string(error)); + } + + // Binary is not available for the device. + if (binary_size == 0) + throw std::runtime_error("get_binary: Binary size is zero"); + + // Get program binary. + std::vector binary(binary_size); + uint8_t* binary_buffer = binary.data(); + error = clGetProgramInfo(program, CL_PROGRAM_BINARIES, binary_size, &binary_buffer, nullptr); + if (error) { + throw std::runtime_error("Failed to retrieve CL_PROGRAM_BINARIES: " + std::to_string(error)); + } + + return binary; +} + +} // namespace ocl +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp index 206db55057cf5e..035182f664df4d 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp @@ -33,6 +33,8 @@ class ocl_kernel : public kernel { return std::make_shared(get_handle().clone(), _kernel_id); } + + std::vector get_binary() const override; }; } // namespace ocl diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp new file mode 100644 index 00000000000000..51c69202678fba --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/event.hpp" +#include + +namespace cldnn { +namespace ze { + +struct ze_base_event : public event { +public: + explicit ze_base_event(uint64_t queue_stamp = 0) : event(), _queue_stamp(queue_stamp) { } + uint64_t get_queue_stamp() const { return _queue_stamp; } + void set_queue_stamp(uint64_t val) { _queue_stamp = val; } + virtual ze_event_handle_t get() = 0; + +protected: + uint64_t _queue_stamp = 0; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp new file mode 100644 index 00000000000000..fddcda042d7fa0 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp @@ -0,0 +1,39 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_common.hpp" +#include "openvino/core/except.hpp" + +#if defined(__linux__) +#include +#elif defined(_WIN32) +#include "windows.h" +#else +#error "Level Zero is supported on Linux and Windows only" +#endif + +namespace cldnn { +namespace ze { + +void *find_ze_symbol(const char *symbol) { +#if defined(__linux__) + void *handle = dlopen("libze_loader.so.1", RTLD_NOW | RTLD_LOCAL); +#elif defined(_WIN32) + HMODULE handle = LoadLibraryA("ze_loader.dll"); +#endif + if (!handle) { + return nullptr; + } + +#if defined(__linux__) + void *f = dlsym(handle, symbol); +#elif defined(_WIN32) + void *f = GetProcAddress(handle, symbol); +#endif + OPENVINO_ASSERT(f != nullptr); + return f; +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp new file mode 100644 index 00000000000000..562167458288f3 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include + +#include + +#define ZE_CHECK(f) \ + do { \ + ze_result_t res_ = (f); \ + if (res_ != ZE_RESULT_SUCCESS) { \ + throw std::runtime_error(#f " command failed with code " + std::to_string(res_)); \ + } \ + } while (false) + + +namespace cldnn { +namespace ze { + +static constexpr uint64_t default_timeout = std::numeric_limits::max(); + +void* find_ze_symbol(const char *symbol); + +template +F find_ze_symbol(const char *symbol) { + return (F)find_ze_symbol(symbol); +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp new file mode 100644 index 00000000000000..c4fa62668675fe --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -0,0 +1,273 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_device.hpp" +#include "ze_common.hpp" + +#include +#include +#include +#include + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace cldnn { +namespace ze { + +namespace { + +bool supports_extension(const std::vector& extensions, const std::string& ext_name, uint32_t ext_ver) { + return std::find_if(extensions.begin(), extensions.end(), [&ext_name, &ext_ver](const ze_driver_extension_properties_t& ep) { + return std::string(ep.name) == ext_name && ep.version == ext_ver; + }) != extensions.end(); +} + +device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t device) { + device_info info; + + uint32_t num_ext = 0; + ZE_CHECK(zeDriverGetExtensionProperties(driver, &num_ext, nullptr)); + + std::vector extensions(num_ext); + ZE_CHECK(zeDriverGetExtensionProperties(driver, &num_ext, &extensions[0])); + + ze_driver_properties_t driver_properties{ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES}; + ZE_CHECK(zeDriverGetProperties(driver, &driver_properties)); + + bool supports_luid = supports_extension(extensions, ZE_DEVICE_LUID_EXT_NAME, ZE_DEVICE_LUID_EXT_VERSION_1_0); + bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0); + bool supports_mutable_list = supports_extension(extensions, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, ZE_MUTABLE_COMMAND_LIST_EXP_VERSION_1_0); + bool supports_pci_properties = supports_extension(extensions, ZE_PCI_PROPERTIES_EXT_NAME, ZE_PCI_PROPERTIES_EXT_VERSION_1_0); + + ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, nullptr, 0}; + ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, supports_ip_version ? &ip_version_properties : nullptr}; + ZE_CHECK(zeDeviceGetProperties(device, &device_properties)); + + ze_device_compute_properties_t device_compute_properties{ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES}; + ZE_CHECK(zeDeviceGetComputeProperties(device, &device_compute_properties)); + + uint32_t queue_properties_count = 0; + ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, nullptr)); + + std::vector queue_properties(queue_properties_count); + for (auto& mp : queue_properties) { + mp.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES; + } + + ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, &queue_properties[0])); + + auto compute_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) { + return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == true; + }); + + OPENVINO_ASSERT(compute_queue_props != queue_properties.end()); + + uint32_t memory_properties_count = 0; + ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr)); + + std::vector device_memory_properties(memory_properties_count); + for (auto& mp : device_memory_properties) { + mp.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES; + } + ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, &device_memory_properties[0])); + + ze_device_memory_access_properties_t device_memory_access_properties{ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES}; + ZE_CHECK(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties)); + + auto mem_properties = std::find_if(device_memory_properties.begin(), device_memory_properties.end(), [](const ze_device_memory_properties_t& p) { + auto name = std::string(p.name); + return name == "DDR" || name == "HBM"; + }); + + ze_device_module_properties_t device_module_properties{ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES}; + ZE_CHECK(zeDeviceGetModuleProperties(device, &device_module_properties)); + + ze_device_image_properties_t device_image_properties{ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES}; + ZE_CHECK(zeDeviceGetImageProperties(device, &device_image_properties)); + + info.vendor_id = device_properties.vendorId; + info.dev_name = device_properties.name; + info.driver_version = std::to_string(driver_properties.driverVersion); + info.dev_type = (device_properties.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) ? device_type::integrated_gpu : device_type::discrete_gpu; + + info.execution_units_count = device_properties.numEUsPerSubslice * device_properties.numSubslicesPerSlice * device_properties.numSlices; + + info.gpu_frequency = device_properties.coreClockRate; + + info.supported_simd_sizes = {}; + info.has_separate_cache = true; + + info.max_work_group_size = device_compute_properties.maxTotalGroupSize; + info.max_local_mem_size = device_compute_properties.maxSharedLocalMemory; + + if (mem_properties != device_memory_properties.end()) { + info.max_global_mem_size = mem_properties->totalSize; + info.device_memory_ordinal = std::distance(device_memory_properties.begin(), mem_properties); + } else { + info.max_global_mem_size = 0; + info.device_memory_ordinal = 0; + } + + info.max_alloc_mem_size = device_properties.maxMemAllocSize; + + info.supports_image = device_image_properties.maxSamplers > 0; + info.supports_intel_planar_yuv = false; + info.max_image2d_width = device_image_properties.maxImageDims2D; + info.max_image2d_height = device_image_properties.maxImageDims2D; + + info.supports_fp16 = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_FP16) != 0; + info.supports_fp64 = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_FP64) != 0; + info.supports_fp16_denorms = info.supports_fp16 && (device_module_properties.fp16flags & ZE_DEVICE_FP_FLAG_DENORM) != 0; + + info.supports_khr_subgroups = true; + info.supports_intel_subgroups = true; + info.supports_intel_subgroups_short = true; + info.supports_intel_subgroups_char = true; + info.supports_intel_required_subgroup_size = true; + + info.supports_imad = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_DP4A) != 0; + info.supports_immad = false; // FIXME + + info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities; + + info.gfx_ver = {0, 0, 0}; // could find how to retrieve this from L0 so far + info.arch = gpu_arch::unknown; + info.ip_version = ip_version_properties.ipVersion; + info.sub_device_idx = (std::numeric_limits::max)(); + + info.device_id = device_properties.deviceId; + info.num_slices = device_properties.numSlices; + info.num_sub_slices_per_slice = device_properties.numSubslicesPerSlice; + info.num_eus_per_sub_slice = device_properties.numEUsPerSubslice; + info.num_threads_per_eu = device_properties.numThreadsPerEU; + + info.num_ccs = compute_queue_props->numQueues; + info.supports_queue_families = true; + + info.kernel_timestamp_valid_bits = device_properties.kernelTimestampValidBits; + info.timer_resolution = device_properties.timerResolution; + info.compute_queue_group_ordinal = std::distance(queue_properties.begin(), compute_queue_props); + + static_assert(ZE_MAX_DEVICE_UUID_SIZE == ov::device::UUID::MAX_UUID_SIZE, ""); + static_assert(ZE_MAX_DEVICE_LUID_SIZE_EXT == ov::device::LUID::MAX_LUID_SIZE, ""); + std::copy_n(&device_properties.uuid.id[0], ZE_MAX_DEVICE_UUID_SIZE, info.uuid.uuid.begin()); + + if (supports_luid) { + ze_device_luid_ext_properties_t luid_props{ZE_STRUCTURE_TYPE_DEVICE_LUID_EXT_PROPERTIES, nullptr}; + ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, &luid_props}; + if (zeDeviceGetProperties(device, &device_properties) == ZE_RESULT_SUCCESS) + std::copy_n(&luid_props.luid.id[0], ZE_MAX_DEVICE_LUID_SIZE_EXT, info.luid.luid.begin()); + } + + info.supports_mutable_command_list = false; + + if (supports_mutable_list) { + ze_mutable_command_list_exp_properties_t mutable_list_props = { ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_LIST_EXP_PROPERTIES, nullptr, 0, 0 }; + ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, &mutable_list_props}; + if (zeDeviceGetProperties(device, &device_properties) == ZE_RESULT_SUCCESS) { + ze_mutable_command_exp_flags_t required_features = ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION | + ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS | + ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT | + ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE | + ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET | + ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT | + ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS; + + info.supports_mutable_command_list = (mutable_list_props.mutableCommandFlags & required_features) == required_features; + } + } + if (supports_pci_properties) { + ze_pci_ext_properties_t pci_properties{ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES, nullptr}; + if (zeDevicePciGetPropertiesExt(device, &pci_properties) == ZE_RESULT_SUCCESS) { + info.pci_info.pci_bus = pci_properties.address.bus; + info.pci_info.pci_device = pci_properties.address.device; + info.pci_info.pci_domain = pci_properties.address.domain; + info.pci_info.pci_function = pci_properties.address.function; + } + } + + return info; +} + +memory_capabilities init_memory_caps(ze_device_handle_t device, const device_info& info) { + std::vector memory_caps; + + ze_device_memory_access_properties_t device_memory_access_properties{ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES}; + ZE_CHECK(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties)); + + if (info.supports_usm) { + if (device_memory_access_properties.hostAllocCapabilities) { + memory_caps.push_back(allocation_type::usm_host); + } + if (device_memory_access_properties.sharedSingleDeviceAllocCapabilities) { + memory_caps.push_back(allocation_type::usm_shared); + } + if (device_memory_access_properties.deviceAllocCapabilities) { + memory_caps.push_back(allocation_type::usm_device); + } + } + + return memory_capabilities(memory_caps); +} + +} // namespace + + +ze_device::ze_device(ze_driver_handle_t driver, ze_device_handle_t device, bool initialize) +: _driver(driver) +, _device(device) +, _info(init_device_info(driver, device)) +, _mem_caps(init_memory_caps(device, _info)) { + if (initialize) { + this->initialize(); + } +} + +void ze_device::initialize() { + if (_is_initialized) + return; + + ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 }; + ZE_CHECK(zeContextCreate(_driver, &context_desc, &_context)); + _is_initialized = true; +} + +bool ze_device::is_initialized() const { + return _is_initialized; +} + +bool ze_device::is_same(const device::ptr other) { + auto casted = downcast(other.get()); + if (!casted) + return false; + + if (is_initialized() && casted->is_initialized()) { + // Do not compare contexts as one driver can have many different contexts + return _device == casted->get_device() && _driver == casted->get_driver(); + } + return _info.is_same_device(casted->_info); +} + +void ze_device::set_mem_caps(const memory_capabilities& memory_capabilities) { + _mem_caps = memory_capabilities; +} + +ze_device::~ze_device() { + if (_is_initialized) + zeContextDestroy(_context); +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp new file mode 100644 index 00000000000000..1a20685ed2cc77 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/device.hpp" +#include + +namespace cldnn { +namespace ze { + +struct ze_device : public device { +public: + ze_device(ze_driver_handle_t driver, ze_device_handle_t device, bool initialize = true); + + const device_info& get_info() const override { return _info; } + memory_capabilities get_mem_caps() const override { return _mem_caps; } + + void initialize() override; + bool is_initialized() const override; + + const ze_driver_handle_t get_driver() const { return _driver; } + const ze_device_handle_t get_device() const { return _device; } + const ze_context_handle_t get_context() const { return _context; } + + bool is_same(const device::ptr other) override; + void set_mem_caps(const memory_capabilities& memory_capabilities) override; + + ~ze_device(); + +private: + ze_driver_handle_t _driver = nullptr; + ze_device_handle_t _device = nullptr; + ze_context_handle_t _context = nullptr; + bool _is_initialized = false; + + device_info _info; + memory_capabilities _mem_caps; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp new file mode 100644 index 00000000000000..4fede32a322a73 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp @@ -0,0 +1,118 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_device_detector.hpp" +#include "ze_device.hpp" +#include "ze_common.hpp" +#include +#include "intel_gpu/runtime/debug_configuration.hpp" +#include "openvino/core/except.hpp" + +#include + +namespace cldnn { +namespace ze { + +static std::vector get_sub_devices(ze_device_handle_t root_device) { + uint32_t n_subdevices = 0; + ZE_CHECK(zeDeviceGetSubDevices(root_device, &n_subdevices, nullptr)); + if (n_subdevices == 0) + return {}; + + std::vector subdevices(n_subdevices); + + ZE_CHECK(zeDeviceGetSubDevices(root_device, &n_subdevices, &subdevices[0])); + + return subdevices; +} + +std::map ze_device_detector::get_available_devices(void* user_context, + void* user_device, + int ctx_device_id, + int target_tile_id, + bool initialize_devices) const { + std::vector devices_list; + if (user_context != nullptr) { + devices_list = create_device_list_from_user_context(user_context, ctx_device_id); + } else if (user_device != nullptr) { + devices_list = create_device_list_from_user_device(user_device); + } else { + devices_list = create_device_list(initialize_devices); + } + + devices_list = sort_devices(devices_list); + + std::map ret; + uint32_t idx = 0; + for (auto& dptr : devices_list) { + auto map_id = std::to_string(idx++); + ret[map_id] = dptr; + + auto root_device = std::dynamic_pointer_cast(dptr); + OPENVINO_ASSERT(root_device != nullptr, "[GPU] Invalid device type created in ocl_device_detector"); + + auto sub_devices = get_sub_devices(root_device->get_device()); + if (!sub_devices.empty()) { + uint32_t sub_idx = 0; + for (auto& sub_device : sub_devices) { + if (target_tile_id != -1 && static_cast(sub_idx) != target_tile_id) { + sub_idx++; + continue; + } + auto sub_device_ptr = std::make_shared(root_device->get_driver(), sub_device, initialize_devices); + ret[map_id + "." + std::to_string(sub_idx++)] = sub_device_ptr; + } + } + } + + return ret; +} + +std::vector ze_device_detector::create_device_list(bool initialize_devices) const { + std::vector ret; + + ZE_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY)); + + uint32_t driver_count = 0; + ZE_CHECK(zeDriverGet(&driver_count, nullptr)); + + std::vector all_drivers(driver_count); + ZE_CHECK(zeDriverGet(&driver_count, &all_drivers[0])); + + for (uint32_t i = 0; i < driver_count; ++i) { + uint32_t device_count = 0; + ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, nullptr)); + + std::vector all_devices(device_count); + ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, &all_devices[0])); + + for (uint32_t d = 0; d < device_count; ++d) { + try { + ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; + ZE_CHECK(zeDeviceGetProperties(all_devices[d], &device_properties)); + + if (ZE_DEVICE_TYPE_GPU == device_properties.type) { + ret.emplace_back(std::make_shared(all_drivers[i], all_devices[d], initialize_devices)); + } + } catch (std::exception& ex) { + GPU_DEBUG_LOG << "Devices query/creation failed for driver " << i << ex.what() << std::endl; + GPU_DEBUG_LOG << "Platform is skipped" << std::endl; + continue; + } + } + } + + return ret; +} + +std::vector ze_device_detector::create_device_list_from_user_context(void* user_context, int ctx_device_id) const { + OPENVINO_NOT_IMPLEMENTED; +} + +std::vector ze_device_detector::create_device_list_from_user_device(void* user_device) const { + OPENVINO_NOT_IMPLEMENTED; +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp new file mode 100644 index 00000000000000..deeefb36234a59 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/device.hpp" + +#include +#include +#include + +namespace cldnn { +namespace ze { + +class ze_device_detector { +public: + ze_device_detector() = default; + + std::map get_available_devices(void* user_context, + void* user_device, + int ctx_device_id, + int target_tile_id, + bool initialize_devices = false) const; +private: + std::vector create_device_list(bool initialize_devices) const; + std::vector create_device_list_from_user_context(void* user_context, int ctx_device_id = 0) const; + std::vector create_device_list_from_user_device(void* user_device) const; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp new file mode 100644 index 00000000000000..a493ac23005ab0 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -0,0 +1,266 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_engine.hpp" +#include "intel_gpu/runtime/utils.hpp" +#include "openvino/core/except.hpp" +#include "ze/ze_kernel.hpp" +#include "ze_api.h" +#include "ze_engine_factory.hpp" +#include "ze_common.hpp" +#include "ze_memory.hpp" +#include "ze_stream.hpp" +#include "ze_device.hpp" +#include +#include +#include +#include + + +namespace cldnn { +namespace ze { + +namespace { + +void func_zeModuleCreate(ze_context_handle_t hContext, + ze_device_handle_t hDevice, const ze_module_desc_t *desc, + ze_module_handle_t *phModule, + ze_module_build_log_handle_t *phBuildLog) { + static auto f = find_ze_symbol("zeModuleCreate"); + + if (!f) + throw std::runtime_error("zeModuleCreate was not found"); + ZE_CHECK(f(hContext, hDevice, desc, phModule, phBuildLog)); +} + +ze_module_handle_t ze_create_module_with_level_zero(const cldnn::ze::ze_engine& engine, std::vector binary) { + auto desc = ze_module_desc_t(); + desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; + desc.format = ZE_MODULE_FORMAT_NATIVE; + desc.inputSize = binary.size(); + desc.pInputModule = binary.data(); + desc.pBuildFlags = ""; + desc.pConstants = nullptr; + + ze_module_handle_t ze_module; + + auto ze_device = engine.get_device(); + auto ze_ctx = engine.get_context(); + func_zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr); + return ze_module; +} + +} // namespace + +ze_engine::ze_engine(const device::ptr dev, runtime_types runtime_type) + : engine(dev) { + OPENVINO_ASSERT(runtime_type == runtime_types::ze, "[GPU] Invalid runtime type specified for ZE engine. Only ZE runtime is supported"); + + auto casted = dynamic_cast(dev.get()); + OPENVINO_ASSERT(casted, "[GPU] Invalid device type passed to ze engine"); + + _service_stream.reset(new ze_stream(*this, ExecutionConfig())); +} + +#ifdef ENABLE_ONEDNN_FOR_GPU +void ze_engine::create_onednn_engine(const ExecutionConfig& config) { + OPENVINO_NOT_IMPLEMENTED; +} + +dnnl::engine& ze_engine::get_onednn_engine() const { + OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. Please check that create_onednn_engine() was called"); + return *_onednn_engine; +} +#endif + +const ze_driver_handle_t ze_engine::get_driver() const { + auto casted = std::dynamic_pointer_cast(_device); + OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine"); + return casted->get_driver(); +} + +const ze_context_handle_t ze_engine::get_context() const { + auto casted = std::dynamic_pointer_cast(_device); + OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine"); + return casted->get_context(); +} + +const ze_device_handle_t ze_engine::get_device() const { + auto casted = std::dynamic_pointer_cast(_device); + OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine"); + return casted->get_device(); +} + +allocation_type ze_engine::detect_usm_allocation_type(const void* memory) const { + return ze::gpu_usm::detect_allocation_type(this, memory); +} + +bool ze_engine::check_allocatable(const layout& layout, allocation_type type) { + OPENVINO_ASSERT(supports_allocation(type), "[GPU] Unsupported allocation type: ", type); + + bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size); + + // When dynamic shape upper bound makes bigger buffer, then return false. + if (exceed_allocatable_mem_size && layout.is_dynamic()) { + OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); + return false; + } + + OPENVINO_ASSERT(!exceed_allocatable_mem_size, + "[GPU] Exceeded max size of memory object allocation: ", + "requested ", layout.bytes_count(), " bytes, " + "but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.", + "Please try to reduce batch size or use lower precision."); + + auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host); + auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size()); + + // When dynamic shape upper bound makes bigger buffer, then return false. + if (exceed_available_mem_size && layout.is_dynamic()) { + OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); + return false; + } + +#ifdef __unix__ + // Prevent from being killed by Ooo Killer of Linux + OPENVINO_ASSERT(!exceed_available_mem_size, + "[GPU] Exceeded max size of memory allocation: ", + "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ", + "but available memory size is ", get_max_memory_size(), " bytes"); +#else + if (exceed_available_mem_size) { + GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : " + << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl; + GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl; + return false; + } +#endif + + return true; +} + +memory::ptr ze_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) { + OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout"); + + check_allocatable(layout, type); + + try { + memory::ptr res = std::make_shared(this, layout, type); + + if (reset || res->is_memory_reset_needed(layout)) { + auto ev = res->fill(get_service_stream()); + if (ev) { + get_service_stream().wait_for_events({ev}); + } + } + + return res; + } catch (const std::exception& e) { + OPENVINO_THROW("[GPU] Failed to allocate memory: ", e.what()); + } +} + +memory::ptr ze_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) { + OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine"); + OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(), + "[GPU] trying to reinterpret between image and non-image layouts. Current: ", + memory.get_layout().format.to_string(), " Target: ", new_layout.format.to_string()); + + if (memory_capabilities::is_usm_type(memory.get_allocation_type())) { + return std::make_shared(this, + new_layout, + reinterpret_cast(memory).get_buffer(), + memory.get_allocation_type(), + memory.get_mem_tracker()); + } + + return nullptr; +} + +memory::ptr ze_engine::reinterpret_handle(const layout& new_layout, shared_mem_params params) { + if (params.mem_type == shared_mem_type::shared_mem_usm) { + ze::UsmMemory usm_buffer(get_context(), get_device(), params.mem); + size_t actual_mem_size = 0; + zeMemGetAddressRange(get_context(), params.mem, nullptr, &actual_mem_size); + auto requested_mem_size = new_layout.bytes_count(); + OPENVINO_ASSERT(actual_mem_size >= requested_mem_size, + "[GPU] shared USM buffer has smaller size (", actual_mem_size, + ") than specified layout (", requested_mem_size, ")"); + return std::make_shared(this, new_layout, usm_buffer, nullptr); + } else { + return nullptr; + } +} + +memory_ptr ze_engine::create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) { + OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] Trying to create a subbuffer from a buffer allocated by a different engine"); + if (new_layout.format.is_image_2d()) { + OPENVINO_NOT_IMPLEMENTED; + } + if (memory_capabilities::is_usm_type(memory.get_allocation_type())) { + auto& new_buf = reinterpret_cast(memory); + auto ptr = new_buf.get_buffer().get(); + auto sub_buffer = ze::UsmMemory(get_context(), get_device(), ptr, byte_offset); + return std::make_shared(this, + new_layout, + sub_buffer, + memory.get_allocation_type(), + memory.get_mem_tracker()); + } else { + OPENVINO_THROW("[GPU] Trying to create subbuffer for non usm memory"); + } +} + +bool ze_engine::is_the_same_buffer(const memory& mem1, const memory& mem2) { + if (mem1.get_engine() != this || mem2.get_engine() != this) + return false; + if (mem1.get_allocation_type() != mem2.get_allocation_type()) + return false; + if (&mem1 == &mem2) + return true; + + return (reinterpret_cast(mem1).get_buffer().get() == reinterpret_cast(mem2).get_buffer().get()); +} + +kernel::ptr ze_engine::prepare_kernel(const kernel::ptr kernel) const { + if (std::dynamic_pointer_cast(kernel)) { + return kernel; + } else { + auto binary = kernel->get_binary(); + ze_module_handle_t ze_module = ze_create_module_with_level_zero(*this, binary); + ze_kernel_handle_t ze_kernel; + auto entry_point = kernel->get_id(); + ze_kernel_desc_t desc = {ZE_STRUCTURE_TYPE_KERNEL_DESC , nullptr, 0, entry_point.c_str()}; + zeKernelCreate(ze_module, &desc, &ze_kernel); + return std::make_shared(ze_kernel, ze_module, entry_point); + } +} + +void* ze_engine::get_user_context() const { + auto& casted = downcast(*_device); + return static_cast(casted.get_driver()); +} + +stream::ptr ze_engine::create_stream(const ExecutionConfig& config) const { + return std::make_shared(*this, config); +} + +stream::ptr ze_engine::create_stream(const ExecutionConfig& config, void* handle) const { + OPENVINO_NOT_IMPLEMENTED; +} + +stream& ze_engine::get_service_stream() const { + return *_service_stream; +} + +std::shared_ptr ze_engine::create(const device::ptr device, runtime_types runtime_type) { + return std::make_shared(device, runtime_type); +} + +std::shared_ptr create_ze_engine(const device::ptr device, runtime_types runtime_type) { + return ze_engine::create(device, runtime_type); +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp new file mode 100644 index 00000000000000..b75d2ae0ca67eb --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp @@ -0,0 +1,64 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "intel_gpu/runtime/memory.hpp" +#include "intel_gpu/runtime/engine.hpp" +#include "intel_gpu/runtime/stream.hpp" +#include "intel_gpu/runtime/device.hpp" + +#include + +namespace cldnn { +namespace ze { + +class ze_engine : public engine { +public: + ze_engine(const device::ptr dev, runtime_types runtime_type); + engine_types type() const override { return engine_types::ze; }; + runtime_types runtime_type() const override { return runtime_types::ze; }; + + memory_ptr allocate_memory(const layout& layout, allocation_type type, bool reset = true) override; + memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override; + memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) override; + memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override; + bool is_the_same_buffer(const memory& mem1, const memory& mem2) override; + bool check_allocatable(const layout& layout, allocation_type type) override; + + void* get_user_context() const override; + + allocation_type get_default_allocation_type() const override { return allocation_type::usm_device; } + allocation_type detect_usm_allocation_type(const void* memory) const override; + + const ze_context_handle_t get_context() const; + const ze_driver_handle_t get_driver() const; + const ze_device_handle_t get_device() const; + + stream_ptr create_stream(const ExecutionConfig& config) const override; + stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override; + stream& get_service_stream() const override; + + kernel::ptr prepare_kernel(const kernel::ptr kernel) const override; + +#ifdef ENABLE_ONEDNN_FOR_GPU + void create_onednn_engine(const ExecutionConfig& config) override; + // Returns onednn engine object which shares device and context with current engine + dnnl::engine& get_onednn_engine() const override; +#endif + + static std::shared_ptr create(const device::ptr device, runtime_types runtime_type); + +private: + std::unique_ptr _service_stream; + +#ifdef ENABLE_ONEDNN_FOR_GPU + std::mutex onednn_mutex; + std::shared_ptr _onednn_engine; +#endif +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp new file mode 100644 index 00000000000000..40c944a1ca7512 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp @@ -0,0 +1,18 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/device.hpp" +#include "intel_gpu/runtime/engine.hpp" + +namespace cldnn { +namespace ze { + +// Factory for ze_engine creation. It's moved outside of ze_engine class to avoid possible CL includes conflict +// between different engines in engine.cpp file +std::shared_ptr create_ze_engine(const device::ptr device, runtime_types runtime_type); + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp new file mode 100644 index 00000000000000..c5e39be435798f --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp @@ -0,0 +1,216 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_event.hpp" +#include "ze/ze_common.hpp" + +#include +#include +#include + +using namespace cldnn; +using namespace ze; + +namespace { + +std::chrono::nanoseconds timestamp_to_duration(const device_info& device_info, const ze_kernel_timestamp_data_t& timestamp) { + constexpr double NS_IN_SEC = 1000000000.0; + const double timestamp_freq = NS_IN_SEC / device_info.timer_resolution; + const uint64_t timestamp_max_value = ~(-1L << device_info.kernel_timestamp_valid_bits); + + auto d = (timestamp.kernelEnd >= timestamp.kernelStart) ? (timestamp.kernelEnd - timestamp.kernelStart) * timestamp_freq + : ((timestamp_max_value - timestamp.kernelStart) + timestamp.kernelEnd + 1) * timestamp_freq; + + return std::chrono::nanoseconds(static_cast(d)); +} + +} // namespace + +void ze_event::wait_impl() { + if (m_event != nullptr) { + ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout)); + } +} + +void ze_event::set_impl() { + if (m_event != nullptr) { + ZE_CHECK(zeEventHostSignal(m_event)); + } +} + +bool ze_event::is_set_impl() { + if (m_event != nullptr) { + return zeEventQueryStatus(m_event) == ZE_RESULT_SUCCESS; + } + return true; +} + +bool ze_event::is_profiled() const { + if (m_event != nullptr) { + ze_event_pool_flags_t event_pool_flags; + auto ev_pool = m_event_pool.get()->m_handle; + ZE_CHECK(zeEventPoolGetFlags(ev_pool, &event_pool_flags)); + return (event_pool_flags & ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP) != 0; + } + return false; +} + +bool ze_event::get_profiling_info_impl(std::list& info) { + if (!is_profiled()) { + return true; + } + + const auto& engine = m_event_pool->m_engine; + auto device_info = engine.get_device_info(); + + ze_kernel_timestamp_result_t timestamp{}; + ZE_CHECK(zeEventQueryKernelTimestamp(m_event, ×tamp)); + + auto wallclock_time = timestamp_to_duration(device_info, timestamp.global); + auto exec_time = timestamp_to_duration(device_info, timestamp.context); + + auto period_exec = std::make_shared(timestamp_to_duration(device_info, timestamp.context)); + auto period_submit = std::make_shared(wallclock_time - exec_time); + + info.push_back({ instrumentation::profiling_stage::executing, period_exec }); + info.push_back({ instrumentation::profiling_stage::submission, period_submit }); + + return true; +} + +void ze_events::wait_impl() { + if (_last_ze_event != nullptr) { + ZE_CHECK(zeEventHostSynchronize(_last_ze_event, UINT32_MAX)); + } +} + +void ze_events::set_impl() { + wait_impl(); +} + +bool ze_events::is_set_impl() { + if (_last_ze_event != nullptr) { + return zeEventQueryStatus(_last_ze_event) == ZE_RESULT_SUCCESS; + } + return true; +} + +bool ze_events::get_profiling_info_impl(std::list& info) { + // The goal is to sum up all disjoint durations of its projection on the time axis + std::vector all_global_timestamps; + std::vector all_context_timestamps; + + auto add_or_merge = [](std::vector& all_timestamps, const ze_kernel_timestamp_data_t& ts) { + auto it = all_timestamps.begin(); + bool merged = false; + auto target_timestamp = ts; + while (it != all_timestamps.end()) { + auto& timestamp = *it; + bool disjoint = timestamp.kernelEnd < target_timestamp.kernelStart || timestamp.kernelStart > target_timestamp.kernelEnd; + bool equal = timestamp.kernelEnd == target_timestamp.kernelEnd && timestamp.kernelStart == target_timestamp.kernelStart; + if (!disjoint) { + if (equal) { + if (!merged) { + merged = true; + break; + } else { + it = all_timestamps.erase(it); + } + } else { + if (!merged) { + timestamp.kernelStart = std::min(timestamp.kernelStart, target_timestamp.kernelStart); + timestamp.kernelEnd = std::max(timestamp.kernelEnd, target_timestamp.kernelEnd); + target_timestamp = timestamp; + merged = true; + it++; + } else { + if (timestamp.kernelEnd > target_timestamp.kernelEnd) { + it--; + it->kernelEnd = target_timestamp.kernelEnd; + it++; + } + it = all_timestamps.erase(it); + } + } + } else { + it++; + } + } + + if (!merged) { + all_timestamps.push_back(target_timestamp); + } + }; + + if (_events.empty()) + return false; + + const auto& engine = downcast(_events.front().get())->m_event_pool->m_engine; + auto device_info = engine.get_device_info(); + + auto get_total_exec_time = [&device_info](std::vector& all_timestamps) { + std::chrono::nanoseconds total_time{0}; + for (const auto& ts : all_timestamps) { + total_time += timestamp_to_duration(device_info, ts); + } + + return total_time; + }; + + // Submission time is calculated as difference between merged context and wallclock intervals + // May probably be more accurate if we sum all sub-intervals of wallclock timestamps not covered by execution intervals + using intervals_t = std::vector; + auto get_submission_time = [&device_info](const intervals_t& s_timestamps, + const intervals_t& e_timestamps) { + auto get_minmax = [](const intervals_t& timestamps) { + uint64_t min_val = std::min(timestamps.begin(), timestamps.end(), + [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) { + return lhs->kernelStart < rhs->kernelStart; + })->kernelStart; + uint64_t max_val = std::max(timestamps.begin(), timestamps.end(), + [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) { + return lhs->kernelEnd < rhs->kernelEnd; + })->kernelEnd; + + return ze_kernel_timestamp_data_t{min_val, max_val}; + }; + + auto submission_interval = get_minmax(s_timestamps); + auto exec_interval = get_minmax(e_timestamps); + + auto wallclock_time = timestamp_to_duration(device_info, submission_interval); + auto exec_time = timestamp_to_duration(device_info, exec_interval); + + return wallclock_time - exec_time; + }; + + for (size_t i = 0; i < _events.size(); i++) { + auto be = downcast(_events[i].get()); + if (!be->is_profiled()) { + continue; + } + ze_kernel_timestamp_result_t timestamp{}; + ZE_CHECK(zeEventQueryKernelTimestamp(be->get(), ×tamp)); + + add_or_merge(all_global_timestamps, timestamp.global); + add_or_merge(all_context_timestamps, timestamp.context); + } + + auto submit_time = get_submission_time(all_global_timestamps, all_context_timestamps); + auto exec_time = get_total_exec_time(all_context_timestamps); + + auto period_exec = std::make_shared(exec_time); + auto period_submit = std::make_shared(submit_time); + + info.push_back({ instrumentation::profiling_stage::executing, period_exec }); + info.push_back({ instrumentation::profiling_stage::submission, period_submit }); + + return true; +} + +ze_event::~ze_event() { + if (m_event != nullptr) { + zeEventDestroy(m_event); + } +} diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp new file mode 100644 index 00000000000000..5120cf9120ec29 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp @@ -0,0 +1,94 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event.hpp" +#include "ze_event_pool.hpp" + +#include +#include + +namespace cldnn { +namespace ze { + +struct ze_event : public ze_base_event { +public: + ze_event(ze_event_pool::ptr ev_pool, ze_event_handle_t ev, uint64_t queue_stamp = 0) + : ze_base_event(queue_stamp) + , m_event_pool(ev_pool) + , m_event(ev) {} + + ze_event_handle_t get() override { return m_event; } + bool is_profiled() const; + + ~ze_event(); + +private: + void wait_impl() override; + void set_impl() override; + bool is_set_impl() override; + bool get_profiling_info_impl(std::list& info) override; + + friend struct ze_events; + +protected: + ze_event_pool::ptr m_event_pool; + ze_event_handle_t m_event; +}; + +struct ze_events : public ze_base_event { +public: + ze_events(std::vector const& ev) + : ze_base_event(0) { + process_events(ev); + } + + ze_event_handle_t get() override { return _last_ze_event; } + + void reset() override { + event::reset(); + _events.clear(); + } + +private: + void wait_impl() override; + void set_impl() override; + bool is_set_impl() override; + + void process_events(const std::vector& ev) { + for (size_t i = 0; i < ev.size(); i++) { + auto multiple_events = dynamic_cast(ev[i].get()); + if (multiple_events) { + for (size_t j = 0; j < multiple_events->_events.size(); j++) { + if (auto base_ev = dynamic_cast(multiple_events->_events[j].get())) { + auto current_ev_queue_stamp = base_ev->get_queue_stamp(); + if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) { + _queue_stamp = current_ev_queue_stamp; + _last_ze_event = base_ev->get(); + } + } + _events.push_back(multiple_events->_events[j]); + } + } else { + if (auto base_ev = dynamic_cast(ev[i].get())) { + auto current_ev_queue_stamp = base_ev->get_queue_stamp(); + if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) { + _queue_stamp = current_ev_queue_stamp; + _last_ze_event = base_ev->get(); + } + } + _events.push_back(ev[i]); + } + } + } + + bool get_profiling_info_impl(std::list& info) override; + + ze_event_handle_t _last_ze_event; + std::vector _events; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp new file mode 100644 index 00000000000000..1ae696e0d10824 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp @@ -0,0 +1,78 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_event_pool.hpp" +#include "ze_event.hpp" +#include "ze_common.hpp" + +namespace cldnn { +namespace ze { + +ze_event_pool::ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_event_pool_flags_t flags) + : m_engine(engine) { + ze_event_pool_desc_t event_pool_desc = { + ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + nullptr, + flags, + capacity + }; + auto device = engine.get_device(); + ZE_CHECK(zeEventPoolCreate(engine.get_context(), &event_pool_desc, 1, &device, &m_handle)); +} + +ze_event_pool::~ze_event_pool() { + zeEventPoolDestroy(m_handle); +} + +ze_events_pool::ze_events_pool(const ze_engine& engine, bool enable_profiling) + : m_engine(engine) + , m_enable_profiling(enable_profiling) { } + +std::shared_ptr ze_events_pool::create_event(uint64_t queue_stamp) { + if (m_num_used >= m_capacity || !m_current_pool) { + m_num_used = 0; + ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0; + flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + m_current_pool = std::make_shared(m_engine, m_capacity, flags); + } + + ze_event_handle_t event; + // set signal scope to host to allow wait on the host + // TODO: avoid setting scope when event is not used for wait on the host + ze_event_desc_t event_desc = { + ZE_STRUCTURE_TYPE_EVENT_DESC, + nullptr, + m_num_used++, + ZE_EVENT_SCOPE_FLAG_HOST, + 0 + }; + ZE_CHECK(zeEventCreate(m_current_pool->m_handle, &event_desc, &event)); + + return std::make_shared(m_current_pool, event, queue_stamp); +} + +std::shared_ptr ze_events_pool::create_user_event() { + if (m_num_used_user >= m_capacity || !m_current_user_pool) { + m_num_used_user = 0; + ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0; + flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + m_current_user_pool = std::make_shared(m_engine, m_capacity, flags); + } + // set signal scope to host to allow wait on the host + // TODO: avoid setting scope when event is not used for wait on the host + ze_event_handle_t event; + ze_event_desc_t event_desc = { + ZE_STRUCTURE_TYPE_EVENT_DESC, + nullptr, + m_num_used_user++, + ZE_EVENT_SCOPE_FLAG_HOST, + 0 + }; + ZE_CHECK(zeEventCreate(m_current_user_pool->m_handle, &event_desc, &event)); + + return std::make_shared(m_current_user_pool, event); +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp new file mode 100644 index 00000000000000..197e41b862a422 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_engine.hpp" + +namespace cldnn { +namespace ze { + +struct ze_event; + +// Wrapper for ze events pool which is needed to track lifetime of the pool. +// I.e. the object is destoyed if no ze_events alive which refer to this pool +// and ze_events_pool doesn't refer to it as well +struct ze_event_pool { + ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_event_pool_flags_t flags); + ~ze_event_pool(); + using ptr = std::shared_ptr; + + ze_event_pool_handle_t m_handle; + const ze_engine& m_engine; +}; + +// Helper for events pool management +// Can hold multiple ze_event_pool objects and track their capacity with realloc when it's needed +struct ze_events_pool { +public: + ze_events_pool(const ze_engine& engine, bool enable_profiling); + + std::shared_ptr create_event(uint64_t queue_stamp = 0); + std::shared_ptr create_user_event(); + +protected: + const ze_engine& m_engine; + std::shared_ptr m_current_user_pool = nullptr; + std::shared_ptr m_current_pool = nullptr; + const uint32_t m_capacity = 100; + uint32_t m_num_used = 0; + uint32_t m_num_used_user = 0; + const bool m_enable_profiling; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp new file mode 100644 index 00000000000000..1d9118c4d12d7d --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp @@ -0,0 +1,62 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/kernel.hpp" +#include "openvino/core/except.hpp" +#include "ze_common.hpp" + +#include + +namespace cldnn { +namespace ze { + +class ze_kernel : public kernel { + ze_kernel_handle_t _compiled_kernel; + ze_module_handle_t _module; + std::string _kernel_id; + +public: + ze_kernel(ze_kernel_handle_t compiled_kernel, ze_module_handle_t module, const std::string& kernel_id) + : _compiled_kernel(compiled_kernel) + , _module(module) + , _kernel_id(kernel_id) { } + + ~ze_kernel() { + zeKernelDestroy(_compiled_kernel); + } + + const ze_kernel_handle_t& get_handle() const { return _compiled_kernel; } + ze_kernel_handle_t& get_handle() { return _compiled_kernel; } + std::shared_ptr clone(bool reuse_kernel_handle = false) const override { + if (reuse_kernel_handle) { + return std::make_shared(_compiled_kernel, _module, _kernel_id); + } else { + ze_kernel_handle_t cloned_handle; + ze_kernel_desc_t descriptor; + descriptor.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC; + descriptor.pNext = nullptr; + descriptor.flags = 0; + descriptor.pKernelName = _kernel_id.c_str(); + ZE_CHECK(zeKernelCreate(_module, &descriptor, &cloned_handle)); + return std::make_shared(cloned_handle, _module, _kernel_id); + } + } + + std::string get_id() const override { return _kernel_id; } + + std::vector get_binary() const override { + size_t binary_size = 0; + ZE_CHECK(zeModuleGetNativeBinary(_module, &binary_size, nullptr)); + + std::vector binary(binary_size); + ZE_CHECK(zeModuleGetNativeBinary(_module, &binary_size, &binary[0])); + + return binary; + } +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp new file mode 100644 index 00000000000000..3b65bf73e3c2a7 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -0,0 +1,236 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/runtime/utils.hpp" +#include "ze_memory.hpp" +#include "ze/ze_common.hpp" +#include "ze_engine.hpp" +#include "ze_stream.hpp" +#include "ze_event.hpp" +#include +#include + +namespace cldnn { +namespace ze { +namespace { +static inline cldnn::event::ptr create_event(stream& stream, size_t bytes_count) { + if (bytes_count == 0) { + GPU_DEBUG_TRACE_DETAIL << "Skip memory operation for 0 size tensor" << std::endl; + return stream.create_user_event(true); + } + + return stream.create_base_event(); +} + +} // namespace + +allocation_type gpu_usm::detect_allocation_type(const ze_engine* engine, const void* mem_ptr) { + ze_memory_allocation_properties_t props{ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES}; + ze_device_handle_t device = nullptr; + ZE_CHECK(zeMemGetAllocProperties(engine->get_context(), mem_ptr, &props, &device)); + + switch (props.type) { + case ZE_MEMORY_TYPE_DEVICE: return allocation_type::usm_device; + case ZE_MEMORY_TYPE_HOST: return allocation_type::usm_host; + case ZE_MEMORY_TYPE_SHARED: return allocation_type::usm_shared; + default: return allocation_type::unknown; + } + + return allocation_type::unknown; +} + +allocation_type gpu_usm::detect_allocation_type(const ze_engine* engine, const ze::UsmMemory& buffer) { + auto alloc_type = detect_allocation_type(engine, buffer.get()); + OPENVINO_ASSERT(alloc_type == allocation_type::usm_device || + alloc_type == allocation_type::usm_host || + alloc_type == allocation_type::usm_shared, "[GPU] Unsupported USM alloc type: " + to_string(alloc_type)); + return alloc_type; +} + +gpu_usm::gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& buffer, allocation_type type, std::shared_ptr mem_tracker) + : lockable_gpu_mem() + , memory(engine, new_layout, type, mem_tracker) + , _buffer(buffer) + , _host_buffer(engine->get_context(), engine->get_device()) { +} + +gpu_usm::gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& buffer, std::shared_ptr mem_tracker) + : lockable_gpu_mem() + , memory(engine, new_layout, detect_allocation_type(engine, buffer), mem_tracker) + , _buffer(buffer) + , _host_buffer(engine->get_context(), engine->get_device()) { +} + +gpu_usm::gpu_usm(ze_engine* engine, const layout& layout, allocation_type type) + : lockable_gpu_mem() + , memory(engine, layout, type, nullptr) + , _buffer(engine->get_context(), engine->get_device()) + , _host_buffer(engine->get_context(), engine->get_device()) { + auto mem_ordinal = engine->get_device_info().device_memory_ordinal; + switch (get_allocation_type()) { + case allocation_type::usm_host: + _buffer.allocateHost(_bytes_count); + break; + case allocation_type::usm_shared: + _buffer.allocateShared(_bytes_count, mem_ordinal); + break; + case allocation_type::usm_device: + _buffer.allocateDevice(_bytes_count, mem_ordinal); + break; + default: + OPENVINO_THROW("[GPU] Unknown unified shared memory type!"); + } + + m_mem_tracker = std::make_shared(engine, _buffer.get(), layout.bytes_count(), type); +} + +void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) { + std::lock_guard locker(_mutex); + if (0 == _lock_count) { + auto& _ze_stream = downcast(stream); + if (get_allocation_type() == allocation_type::usm_device) { + if (type != mem_lock_type::read) { + throw std::runtime_error("Unable to lock allocation_type::usm_device with write lock_type."); + } + GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl; + _host_buffer.allocateHost(_bytes_count); + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(), + _host_buffer.get(), + _buffer.get(), + _bytes_count, + nullptr, + 0, + nullptr)); + ZE_CHECK(zeCommandListHostSynchronize(_ze_stream.get_queue(), default_timeout)); + _mapped_ptr = _host_buffer.get(); + } else { + _mapped_ptr = _buffer.get(); + } + } + _lock_count++; + return _mapped_ptr; +} + +void gpu_usm::unlock(const stream& /* stream */) { + std::lock_guard locker(_mutex); + _lock_count--; + if (0 == _lock_count) { + if (get_allocation_type() == allocation_type::usm_device) { + _host_buffer.freeMem(); + } + _mapped_ptr = nullptr; + } +} + +event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, bool blocking) { + auto& _ze_stream = downcast(stream); + auto ev = _ze_stream.create_base_event(); + auto ev_ze = downcast(ev.get())->get(); + std::vector temp_buffer(_bytes_count, pattern); + ZE_CHECK(zeCommandListAppendMemoryFill(_ze_stream.get_queue(), _buffer.get(), temp_buffer.data(), 1, _bytes_count, ev_ze, 0, nullptr)); + + if (blocking) { + ev->wait(); + } + return ev; +} + +event::ptr gpu_usm::fill(stream& stream, bool blocking) { + return fill(stream, 0, blocking); +} + +event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) { + auto result_event = create_event(stream, size); + if (size == 0) + return result_event; + + auto _ze_stream = downcast(&stream); + auto _ze_event = downcast(result_event.get())->get(); + auto src_ptr = reinterpret_cast(data_ptr) + src_offset; + auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; + + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + dst_ptr, + src_ptr, + _bytes_count, + _ze_event, + 0, + nullptr)); + + if (blocking) { + result_event->wait(); + } + + return result_event; +} + +event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_offset, size_t dst_offset, size_t size, bool blocking) { + auto result_event = create_event(stream, size); + if (size == 0) + return result_event; + + auto _ze_stream = downcast(&stream); + auto _ze_event = downcast(result_event.get())->get(); + OPENVINO_ASSERT(memory_capabilities::is_usm_type(src_mem.get_allocation_type())); + + auto usm_mem = downcast(&src_mem); + auto src_ptr = reinterpret_cast(usm_mem->buffer_ptr()) + src_offset; + auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; + + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + dst_ptr, + src_ptr, + _bytes_count, + _ze_event, + 0, + nullptr)); + if (blocking) { + result_event->wait(); + } + + return result_event; +} + +event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) const { + auto result_event = create_event(stream, size); + if (size == 0) + return result_event; + + auto _ze_stream = downcast(&stream); + auto _ze_event = downcast(result_event.get())->get(); + auto src_ptr = reinterpret_cast(buffer_ptr()) + src_offset; + auto dst_ptr = reinterpret_cast(data_ptr) + dst_offset; + + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + dst_ptr, + src_ptr, + _bytes_count, + _ze_event, + 0, + nullptr)); + if (blocking) { + result_event->wait(); + } + + return result_event; +} + +shared_mem_params gpu_usm::get_internal_params() const { + auto casted = downcast(_engine); + return { + shared_mem_type::shared_mem_usm, // shared_mem_type + static_cast(casted->get_context()), // context handle + static_cast(casted->get_device()), // user_device handle + static_cast(_buffer.get()), // mem handle +#ifdef _WIN32 + nullptr, // surface handle +#else + 0, // surface handle +#endif + 0 // plane + }; +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp new file mode 100644 index 00000000000000..b1914de00bec49 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp @@ -0,0 +1,153 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_common.hpp" +#include "ze_engine.hpp" +#include "intel_gpu/runtime/memory.hpp" + +#include +#include +#include + +namespace cldnn { +namespace ze { +struct lockable_gpu_mem { + lockable_gpu_mem() : + _lock_count(0), + _mapped_ptr(nullptr) {} + + std::mutex _mutex; + unsigned _lock_count; + void* _mapped_ptr; +}; + +class UsmHolder { +public: + UsmHolder(ze_context_handle_t context, void* ptr, bool shared_memory = false) : _context(context), _ptr(ptr), _shared_memory(shared_memory) { } + void* ptr() { return _ptr; } + void memFree() { + try { + if (!_shared_memory) + zeMemFree(_context, _ptr); + } catch (...) { + // Exception may happen only when clMemFreeINTEL function is unavailable, thus can't free memory properly + } + _ptr = nullptr; + } + + ~UsmHolder() { + memFree(); + } +private: + ze_context_handle_t _context; + void* _ptr; + bool _shared_memory = false; +}; + +class UsmMemory { +public: + explicit UsmMemory(ze_context_handle_t context, ze_device_handle_t device) + : _context(context) + , _device(device) {} + + UsmMemory(ze_context_handle_t context, ze_device_handle_t device, void* usm_ptr, size_t offset = 0) + : _context(context) + , _device(device) + , _usm_pointer(std::make_shared(_context, reinterpret_cast(usm_ptr) + offset, true)) {} + + // Get methods returns original pointer allocated by openCL. + void* get() const { return _usm_pointer->ptr(); } + + void allocateHost(size_t size) { + ze_host_mem_alloc_desc_t host_desc = {}; + host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; + host_desc.flags = 0; + host_desc.pNext = nullptr; + + void* memory = nullptr; + ZE_CHECK(zeMemAllocHost(_context, &host_desc, size, 1, &memory)); + _allocate(memory); + } + + void allocateShared(size_t size, uint32_t ordinal) { + ze_device_mem_alloc_desc_t device_desc = {}; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + device_desc.flags = 0; + device_desc.ordinal = ordinal; + device_desc.pNext = nullptr; + + ze_host_mem_alloc_desc_t host_desc = {}; + host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; + host_desc.flags = 0; + host_desc.pNext = nullptr; + + void* memory = nullptr; + ZE_CHECK(zeMemAllocShared(_context, &device_desc, &host_desc, size, 1, _device, &memory)); + _allocate(memory); + } + + void allocateDevice(size_t size, uint32_t ordinal) { + ze_device_mem_alloc_desc_t device_desc = {}; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + device_desc.flags = 0; + device_desc.ordinal = ordinal; + device_desc.pNext = nullptr; + + void* memory = nullptr; + ZE_CHECK(zeMemAllocDevice(_context, &device_desc, size, 4096, _device, &memory)); + _allocate(memory); + } + + void freeMem() { + if (!_usm_pointer) + throw std::runtime_error("[CL ext] Can not free memory of empty UsmHolder"); + _usm_pointer->memFree(); + } + + virtual ~UsmMemory() = default; + +protected: + ze_context_handle_t _context; + ze_device_handle_t _device; + std::shared_ptr _usm_pointer = nullptr; + +private: + void _allocate(void* ptr) { + if (!ptr) + throw std::runtime_error("[CL ext] Can not allocate nullptr for USM type."); + _usm_pointer = std::make_shared(_context, ptr); + } +}; + +struct gpu_usm : public lockable_gpu_mem, public memory { + gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& usm_buffer, allocation_type type, std::shared_ptr mem_tracker); + gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& usm_buffer, std::shared_ptr mem_tracker); + gpu_usm(ze_engine* engine, const layout& layout, allocation_type type); + + void* lock(const stream& stream, mem_lock_type type) override; + void unlock(const stream& stream) override; + const ze::UsmMemory& get_buffer() const { return _buffer; } + ze::UsmMemory& get_buffer() { return _buffer; } + + event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) override; + event::ptr fill(stream& stream, bool blocking = true) override; + shared_mem_params get_internal_params() const override; + void* buffer_ptr() const override { return _buffer.get(); } + + event::ptr copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override; + event::ptr copy_from(stream& stream, const memory& src_mem, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override; + event::ptr copy_to(stream& stream, void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) const override; + + static allocation_type detect_allocation_type(const ze_engine* engine, const void* mem_ptr); + static allocation_type detect_allocation_type(const ze_engine* engine, const ze::UsmMemory& buffer); + +protected: + ze::UsmMemory _buffer; + ze::UsmMemory _host_buffer; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp new file mode 100644 index 00000000000000..c7341c9d30b5a5 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -0,0 +1,348 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_stream.hpp" +#include "intel_gpu/runtime/memory_caps.hpp" +#include "intel_gpu/runtime/utils.hpp" +#include "openvino/core/except.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/runtime/properties.hpp" +#include "ze_event_pool.hpp" +#include "ze_event.hpp" +#include "ze_kernel.hpp" +#include "ze_memory.hpp" +#include "ze_common.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +namespace cldnn { +namespace ze { + +namespace { +inline ze_group_count_t to_group_count(const std::vector& v) { + switch (v.size()) { + case 1: + return {uint32_t(v[0]), uint32_t(1), uint32_t(1)}; + case 2: + return {uint32_t(v[0]), uint32_t(v[1]), uint32_t(1)}; + case 3: + return {uint32_t(v[0]), uint32_t(v[1]), uint32_t(v[2])}; + default: + return {uint32_t(1), uint32_t(1), uint32_t(1)}; + } +} + +template +ze_result_t set_kernel_arg_scalar(ze_kernel_handle_t& kernel, uint32_t idx, const T& val) { + GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel << " set scalar " << idx << " (" << ov::element::from().get_type_name() << ")" << val << "\n"; + return zeKernelSetArgumentValue(kernel, idx, sizeof(T), &val); +} + +ze_result_t set_kernel_arg(ze_kernel_handle_t& kernel, uint32_t idx, cldnn::memory::cptr mem) { + if (!mem) + return ZE_RESULT_ERROR_INVALID_ARGUMENT; + + OPENVINO_ASSERT(memory_capabilities::is_usm_type(mem->get_allocation_type()), "Unsupported alloc type"); + const auto& buf = std::dynamic_pointer_cast(mem)->get_buffer(); + auto mem_type = std::dynamic_pointer_cast(mem)->get_allocation_type(); + GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel << " set arg (" << mem_type << ") " << idx + << " mem: " << buf.get() << " size: " << mem->size() << std::endl; + + auto ptr = buf.get(); + return zeKernelSetArgumentValue(kernel, idx, sizeof(ptr), &ptr); +} + +void set_arguments_impl(ze_kernel_handle_t kernel, + const arguments_desc& args, + const kernel_arguments_data& data) { + using args_t = argument_desc::Types; + using scalar_t = scalar_desc::Types; + + for (uint32_t i = 0; i < static_cast(args.size()); i++) { + ze_result_t status = ZE_RESULT_NOT_READY; + switch (args[i].t) { + case args_t::INPUT: + if (args[i].index < data.inputs.size() && data.inputs[args[i].index]) { + status = set_kernel_arg(kernel, i, data.inputs[args[i].index]); + } + break; + case args_t::INPUT_OF_FUSED_PRIMITIVE: + if (args[i].index < data.fused_op_inputs.size() && data.fused_op_inputs[args[i].index]) { + status = set_kernel_arg(kernel, i, data.fused_op_inputs[args[i].index]); + } + break; + case args_t::INTERNAL_BUFFER: + if (args[i].index < data.intermediates.size() && data.intermediates[args[i].index]) { + status = set_kernel_arg(kernel, i, data.intermediates[args[i].index]); + } + break; + case args_t::OUTPUT: + if (args[i].index < data.outputs.size() && data.outputs[args[i].index]) { + status = set_kernel_arg(kernel, i, data.outputs[args[i].index]); + } + break; + case args_t::WEIGHTS: + status = set_kernel_arg(kernel, i, data.weights); + break; + case args_t::BIAS: + status = set_kernel_arg(kernel, i, data.bias); + break; + case args_t::WEIGHTS_ZERO_POINTS: + status = set_kernel_arg(kernel, i, data.weights_zero_points); + break; + case args_t::ACTIVATIONS_ZERO_POINTS: + status = set_kernel_arg(kernel, i, data.activations_zero_points); + break; + case args_t::COMPENSATION: + status = set_kernel_arg(kernel, i, data.compensation); + break; + case args_t::SCALE_TABLE: + status = set_kernel_arg(kernel, i, data.scale_table); + break; + case args_t::SLOPE: + status = set_kernel_arg(kernel, i, data.slope); + break; + case args_t::SCALAR: + if (data.scalars && args[i].index < data.scalars->size()) { + const auto& scalar = (*data.scalars)[args[i].index]; + switch (scalar.t) { + case scalar_t::UINT8: + status = set_kernel_arg_scalar(kernel, i, scalar.v.u8); + break; + case scalar_t::UINT16: + status = set_kernel_arg_scalar(kernel, i, scalar.v.u16); + break; + case scalar_t::UINT32: + status = set_kernel_arg_scalar(kernel, i, scalar.v.u32); + break; + case scalar_t::UINT64: + status = set_kernel_arg_scalar(kernel, i, scalar.v.u64); + break; + case scalar_t::INT8: + status = set_kernel_arg_scalar(kernel, i, scalar.v.s8); + break; + case scalar_t::INT16: + status = set_kernel_arg_scalar(kernel, i, scalar.v.s16); + break; + case scalar_t::INT32: + status = set_kernel_arg_scalar(kernel, i, scalar.v.s32); + break; + case scalar_t::INT64: + status = set_kernel_arg_scalar(kernel, i, scalar.v.s64); + break; + case scalar_t::FLOAT32: + status = set_kernel_arg_scalar(kernel, i, scalar.v.f32); + break; + case scalar_t::FLOAT64: + status = set_kernel_arg_scalar(kernel, i, scalar.v.f64); + break; + default: + break; + } + } + break; + case args_t::CELL: + status = set_kernel_arg(kernel, i, data.cell); + break; + case args_t::SHAPE_INFO: + status = set_kernel_arg(kernel, i, data.shape_info); + break; + default: + break; + } + if (status != ZE_RESULT_SUCCESS) { + throw std::runtime_error("Error set arg " + std::to_string(i) + ", error code: " + std::to_string(status) + "\n"); + } + } +} + +} // namespace + +ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) + : stream(config.get_queue_type(), stream::get_expected_sync_method(config)) + , _engine(engine) + , m_pool(engine, config.get_enable_profiling()) { + const auto &info = engine.get_device_info(); + + ze_command_queue_desc_t command_queue_desc = {}; + command_queue_desc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC; + command_queue_desc.pNext = nullptr; + command_queue_desc.index = 0; + command_queue_desc.ordinal = info.compute_queue_group_ordinal; + command_queue_desc.flags = m_queue_type == QueueTypes::out_of_order ? 0 : ZE_COMMAND_QUEUE_FLAG_IN_ORDER; + command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; + command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL; + + zex_intel_queue_copy_operations_offload_hint_exp_desc_t cp_offload_desc = {}; + cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES; + cp_offload_desc.copyOffloadEnabled = true; + cp_offload_desc.pNext = nullptr; + command_queue_desc.pNext = &cp_offload_desc; + + ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); +} + +ze_stream::~ze_stream() { + zeCommandListDestroy(m_command_list); +} + +void ze_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) { + static std::mutex m; + std::lock_guard guard(m); + + auto& ze_kernel = downcast(kernel); + auto& kern = ze_kernel.get_handle(); + set_arguments_impl(kern, args_desc.arguments, args); +} + +event::ptr ze_stream::enqueue_kernel(kernel& kernel, + const kernel_arguments_desc& args_desc, + const kernel_arguments_data& /* args */, + std::vector const& deps, + bool is_output) { + auto& ze_kernel = downcast(kernel); + + auto& kern = ze_kernel.get_handle(); + + std::vector dep_events; + std::vector* dep_events_ptr = nullptr; + if (m_sync_method == SyncMethods::events) { + for (auto& dep : deps) { + if (auto ze_base_ev = std::dynamic_pointer_cast(dep)) { + if (ze_base_ev->get() != nullptr) + dep_events.push_back(ze_base_ev->get()); + } + } + dep_events_ptr = &dep_events; + } else if (m_sync_method == SyncMethods::barriers) { + sync_events(deps, is_output); + } + bool set_output_event = m_sync_method == SyncMethods::events || is_output; + + auto ev = set_output_event ? create_base_event() : std::make_shared(nullptr, nullptr, ++m_queue_counter); + auto global = to_group_count(args_desc.workGroups.global); + auto local = to_group_count(args_desc.workGroups.local); + ze_group_count_t args = { global.groupCountX / local.groupCountX, global.groupCountY / local.groupCountY, global.groupCountZ / local.groupCountZ }; + ZE_CHECK(zeKernelSetGroupSize(kern, local.groupCountX, local.groupCountY, local.groupCountZ)); + ZE_CHECK(zeCommandListAppendLaunchKernel(m_command_list, + kern, + &args, + set_output_event ? std::dynamic_pointer_cast(ev)->get() : nullptr, + dep_events_ptr == nullptr ? 0 : static_cast(dep_events_ptr->size()), + dep_events_ptr == nullptr ? 0 : &dep_events_ptr->front())); + + return ev; +} + +void ze_stream::enqueue_barrier() { + ZE_CHECK(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr)); +} + +event::ptr ze_stream::enqueue_marker(std::vector const& deps, bool is_output) { + if (deps.empty()) { + auto ev = create_base_event(); + ZE_CHECK(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast(ev)->get(), 0, nullptr)); + return ev; + } + + if (m_sync_method == SyncMethods::events) { + std::vector dep_events; + for (auto& dep : deps) { + if (auto ze_base_ev = std::dynamic_pointer_cast(dep)) { + if (ze_base_ev->get() != nullptr) + dep_events.push_back(ze_base_ev->get()); + } + } + if (dep_events.empty()) + return create_user_event(true); + + auto ev = create_base_event(); + ZE_CHECK(zeCommandListAppendBarrier(m_command_list, + std::dynamic_pointer_cast(ev)->get(), + static_cast(dep_events.size()), + &dep_events.front())); + return ev; + } else if (m_sync_method == SyncMethods::barriers) { + sync_events(deps, is_output); + assert(m_last_barrier_ev != nullptr); + return m_last_barrier_ev; + } else { + return create_user_event(true); + } +} + +ze_event::ptr ze_stream::group_events(std::vector const& deps) { + return std::make_shared(deps); +} + +void ze_stream::wait() { + finish(); +} + +event::ptr ze_stream::create_user_event(bool set) { + auto ev = m_pool.create_user_event(); + if (set) + ev->set(); + + return ev; +} + +event::ptr ze_stream::create_base_event() { + return m_pool.create_event(++m_queue_counter); +} + +void ze_stream::flush() const { } + +void ze_stream::finish() const { + ZE_CHECK(zeCommandListHostSynchronize(m_command_list, default_timeout)); +} + +void ze_stream::wait_for_events(const std::vector& events) { + for (auto& ev : events) { + ev->wait(); + } + + // Enqueue additional event as `events` may contain user events only due to barrier based synchronization + // TODO: Detect that scenarion somehow and don't enqueue extra barrier if not needed + auto ev = std::dynamic_pointer_cast(create_base_event()); + ZE_CHECK(zeCommandListAppendBarrier(m_command_list, ev->get(), 0, nullptr)); + ev->wait(); +} + +void ze_stream::sync_events(std::vector const& deps, bool is_output) { + bool needs_barrier = false; + for (auto& dep : deps) { + auto* ze_base_ev = dynamic_cast(dep.get()); + assert(ze_base_ev != nullptr); + if (ze_base_ev->get_queue_stamp() > m_last_barrier) { + needs_barrier = true; + } + } + + if (needs_barrier) { + if (is_output) { + m_last_barrier_ev = std::dynamic_pointer_cast(create_base_event()); + m_last_barrier_ev->set_queue_stamp(m_queue_counter.load()); + ZE_CHECK(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get(), 0, nullptr)); + } else { + ZE_CHECK(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr)); + } + m_last_barrier = ++m_queue_counter; + } + + if (!m_last_barrier_ev) { + m_last_barrier_ev = std::dynamic_pointer_cast(create_user_event(true)); + m_last_barrier_ev->set_queue_stamp(m_queue_counter.load()); + } +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp new file mode 100644 index 00000000000000..8af269489fbe3b --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp @@ -0,0 +1,69 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/event.hpp" +#include "intel_gpu/runtime/stream.hpp" +#include "ze_common.hpp" +#include "ze_engine.hpp" +#include "ze_event.hpp" + +namespace cldnn { +namespace ze { + +class ze_stream : public stream { +public: + ze_command_list_handle_t get_queue() const { return m_command_list; } + + ze_stream(const ze_engine& engine, const ExecutionConfig& config); + ze_stream(ze_stream&& other) + : stream(other.m_queue_type, other.m_sync_method) + , _engine(other._engine) + , m_command_list(other.m_command_list) + , m_queue_counter(other.m_queue_counter.load()) + , m_last_barrier(other.m_last_barrier.load()) + , m_last_barrier_ev(other.m_last_barrier_ev) + , m_pool(other.m_pool) {} + + ~ze_stream(); + + void flush() const override; + void finish() const override; + void wait() override; + + void set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) override; + event::ptr enqueue_kernel(kernel& kernel, + const kernel_arguments_desc& args_desc, + const kernel_arguments_data& args, + std::vector const& deps, + bool is_output = false) override; + event::ptr enqueue_marker(std::vector const& deps, bool is_output) override; + event::ptr group_events(std::vector const& deps) override; + void wait_for_events(const std::vector& events) override; + void enqueue_barrier() override; + event::ptr create_user_event(bool set) override; + event::ptr create_base_event() override; + +#ifdef ENABLE_ONEDNN_FOR_GPU + dnnl::stream& get_onednn_stream() override { OPENVINO_NOT_IMPLEMENTED; } +#endif + +private: + void sync_events(std::vector const& deps, bool is_output = false); + + const ze_engine& _engine; + mutable ze_command_list_handle_t m_command_list = 0; + mutable std::atomic m_queue_counter{0}; + std::atomic m_last_barrier{0}; + std::shared_ptr m_last_barrier_ev = nullptr; + ze_events_pool m_pool; + +#ifdef ENABLE_ONEDNN_FOR_GPU + std::shared_ptr _onednn_stream = nullptr; +#endif +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt index c89083a026aed6..f08e5c8b8ac798 100644 --- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt @@ -39,6 +39,8 @@ ov_add_test_target( OV GPU ) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) + if(ENABLE_PROXY) target_compile_definitions(${TARGET_NAME} PUBLIC PROXY_PLUGIN_ENABLED) endif() diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ocl_remote_tensor_tests.cpp similarity index 99% rename from src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp rename to src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ocl_remote_tensor_tests.cpp index 062102e4580ee6..ad631e2ad76daf 100644 --- a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp +++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ocl_remote_tensor_tests.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // +#ifdef OV_GPU_WITH_OCL_RT + #include "openvino/core/preprocess/pre_post_process.hpp" #include "openvino/op/add.hpp" #include "openvino/op/constant.hpp" @@ -2962,3 +2964,4 @@ INSTANTIATE_TEST_SUITE_P(smoke_RemoteTensorDataType, OVRemoteTensorDataType_Test ov::element::Type_t::u16, ov::element::Type_t::u32)), OVRemoteTensorDataType_Test::getTestCaseName); +#endif // OV_GPU_WITH_OCL_RT diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp new file mode 100644 index 00000000000000..e9ccd21cc892be --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp @@ -0,0 +1,20 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef OV_GPU_WITH_ZE_RT + +#include "openvino/runtime/intel_gpu/remote_properties.hpp" +#include "openvino/runtime/remote_tensor.hpp" + +#include "remote_tensor_tests/helpers.hpp" +#include "base/ov_behavior_test_utils.hpp" + +TEST(ZeRemoteContext, smoke_CorrectContextType) { + auto core = ov::Core(); + auto remote_context = core.get_default_context(ov::test::utils::DEVICE_GPU); + ASSERT_FALSE(remote_context.is()); + ASSERT_EQ(remote_context.get_params().at(ov::intel_gpu::context_type.name()), ov::intel_gpu::ContextType::ZE); +} + +#endif // OV_GPU_WITH_ZE_RT diff --git a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt index aa40a800295f02..0ba7a6e8c33e88 100644 --- a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt @@ -59,6 +59,7 @@ add_executable(${TARGET_NAME} ${SOURCES_ALL}) target_compile_definitions(${TARGET_NAME} PRIVATE CI_BUILD_NUMBER="") ov_set_threading_interface_for(${TARGET_NAME}) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) # Workaround to avoid warnings during LTO build if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp index a12d542b076498..98184955308b2b 100644 --- a/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp @@ -56,7 +56,7 @@ TEST(devices_test, sort_order_single_vendor) { devices_list.push_back(std::make_shared(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++)); devices_list.push_back(std::make_shared(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++)); - auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list); + auto sorted_list = sort_devices(devices_list); std::vector expected_devices_order = {2, 0, 1, 3, 4}; @@ -77,7 +77,7 @@ TEST(devices_test, sort_order_two_vendors) { devices_list.push_back(std::make_shared(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++)); devices_list.push_back(std::make_shared(INTEL_VENDOR_ID, device_type::integrated_gpu, device_id++)); - auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list); + auto sorted_list = sort_devices(devices_list); std::vector expected_devices_order = {3, 2, 0, 1}; @@ -101,7 +101,7 @@ TEST(devices_test, sort_order_three_vendors) { devices_list.push_back(std::make_shared(OTHER_VENDOR_ID2, device_type::discrete_gpu, device_id++)); devices_list.push_back(std::make_shared(OTHER_VENDOR_ID2, device_type::discrete_gpu, device_id++)); - auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list); + auto sorted_list = sort_devices(devices_list); std::vector expected_devices_order = {2, 3, 0, 1, 4, 5}; diff --git a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp index 4e3942cf578758..239e1d1b34a49b 100644 --- a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp @@ -302,7 +302,7 @@ cldnn::ExecutionConfig get_test_default_config(const cldnn::engine& engine, } std::shared_ptr create_test_engine() { - auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl); + auto ret = cldnn::engine::create(engine_types::ze, runtime_types::ze); #ifdef ENABLE_ONEDNN_FOR_GPU if (ret->get_device_info().supports_immad) ret->create_onednn_engine({}); diff --git a/thirdparty/dependencies.cmake b/thirdparty/dependencies.cmake index 70318227538112..e3317d176e11c4 100644 --- a/thirdparty/dependencies.cmake +++ b/thirdparty/dependencies.cmake @@ -68,7 +68,7 @@ endif() # LevelZero # -if(ENABLE_INTEL_NPU) +if(ENABLE_INTEL_GPU OR ENABLE_INTEL_NPU) if(ENABLE_SYSTEM_LEVEL_ZERO) pkg_search_module(level_zero QUIET IMPORTED_TARGET diff --git a/thirdparty/level_zero/CMakeLists.txt b/thirdparty/level_zero/CMakeLists.txt index a266b0d575abd5..6675adc586caee 100644 --- a/thirdparty/level_zero/CMakeLists.txt +++ b/thirdparty/level_zero/CMakeLists.txt @@ -26,7 +26,7 @@ endif() set(CMAKE_COMPILE_WARNING_AS_ERROR OFF) add_subdirectory(level-zero EXCLUDE_FROM_ALL) -set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $) +set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $) # This VERSION file created by L0 may cause compilation issue of oneTBB headers, so remove it file(REMOVE "${CMAKE_BINARY_DIR}/VERSION") diff --git a/thirdparty/level_zero/ze_intel_gpu.h b/thirdparty/level_zero/ze_intel_gpu.h new file mode 100644 index 00000000000000..ea83b8e9f6cad9 --- /dev/null +++ b/thirdparty/level_zero/ze_intel_gpu.h @@ -0,0 +1,412 @@ +// intel/compute-runtime e96840a03ec41659772ca0bea3338bdd688ae4b5 +/* + * Copyright (C) 2020-2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#ifndef _ZE_INTEL_GPU_H +#define _ZE_INTEL_GPU_H + +#include + +#include "ze_stypes.h" + +#if defined(__cplusplus) +#pragma once +extern "C" { +#endif + +#include + +#define ZE_INTEL_GPU_VERSION_MAJOR 0 +#define ZE_INTEL_GPU_VERSION_MINOR 1 + +/////////////////////////////////////////////////////////////////////////////// +#ifndef ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME +/// @brief Module DP properties driver extension name +#define ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME "ZE_intel_experimental_device_module_dp_properties" +#endif // ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Module DP properties driver extension Version(s) +typedef enum _ze_intel_device_module_dp_properties_exp_version_t { + ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_FORCE_UINT32 = 0x7fffffff + +} ze_intel_device_module_dp_properties_exp_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported Dot Product flags +typedef uint32_t ze_intel_device_module_dp_exp_flags_t; +typedef enum _ze_intel_device_module_dp_exp_flag_t { + ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DP4A = ZE_BIT(0), ///< Supports DP4A operation + ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS = ZE_BIT(1), ///< Supports DPAS operation + ZE_INTEL_DEVICE_MODULE_EXP_FLAG_FORCE_UINT32 = 0x7fffffff + +} ze_intel_device_module_dp_exp_flag_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Device Module dot product properties queried using +/// ::zeDeviceGetModuleProperties +/// +/// @details +/// - This structure may be passed to ::zeDeviceGetModuleProperties, via +/// `pNext` member of ::ze_device_module_properties_t. +/// @brief Device module dot product properties +typedef struct _ze_intel_device_module_dp_exp_properties_t { + ze_structure_type_ext_t stype = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES; ///< [in] type of this structure + void *pNext; ///< [in,out][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains sType and pNext). + ze_intel_device_module_dp_exp_flags_t flags; ///< [out] 0 (none) or a valid combination of ::ze_intel_device_module_dp_flag_t +} ze_intel_device_module_dp_exp_properties_t; + +#ifndef ZE_INTEL_COMMAND_LIST_MEMORY_SYNC +/// @brief Cmd List memory sync extension name +#define ZE_INTEL_COMMAND_LIST_MEMORY_SYNC "ZE_intel_experimental_command_list_memory_sync" +#endif // ZE_INTEL_COMMAND_LIST_MEMORY_SYNC + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Cmd List memory sync extension Version(s) +typedef enum _ze_intel_command_list_memory_sync_exp_version_t { + ZE_INTEL_COMMAND_LIST_MEMORY_SYNC_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_COMMAND_LIST_MEMORY_SYNC_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_COMMAND_LIST_MEMORY_SYNC_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} ze_intel_command_list_memory_sync_exp_version_t; + +#ifndef ZE_INTEL_STRUCTURE_TYPE_DEVICE_COMMAND_LIST_WAIT_ON_MEMORY_DATA_SIZE_EXP_DESC +/// @brief stype for _ze_intel_device_command_list_wait_on_memory_data_size_exp_desc_t +#endif + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Extended descriptor for cmd list memory sync +/// +/// @details +/// - Implementation must support ::ZE_intel_experimental_command_list_memory_sync extension +/// - May be passed to ze_device_properties_t through pNext. +typedef struct _ze_intel_device_command_list_wait_on_memory_data_size_exp_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint32_t cmdListWaitOnMemoryDataSizeInBytes; /// Defines supported data size for zexCommandListAppendWaitOnMemory[64] API +} ze_intel_device_command_list_wait_on_memory_data_size_exp_desc_t; + +#ifndef ZEX_INTEL_EVENT_SYNC_MODE_EXP_NAME +/// @brief Event sync mode extension name +#define ZEX_INTEL_EVENT_SYNC_MODE_EXP_NAME "ZEX_intel_experimental_event_sync_mode" +#endif // ZE_INTEL_EVENT_SYNC_MODE_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Event sync mode extension Version(s) +typedef enum _zex_intel_event_sync_mode_exp_version_t { + ZEX_INTEL_EVENT_SYNC_MODE_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZEX_INTEL_EVENT_SYNC_MODE_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZEX_INTEL_EVENT_SYNC_MODE_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} zex_intel_event_sync_mode_exp_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported event sync mode flags +typedef uint32_t zex_intel_event_sync_mode_exp_flags_t; +typedef enum _zex_intel_event_sync_mode_exp_flag_t { + ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_LOW_POWER_WAIT = ZE_BIT(0), ///< Low power host synchronization mode, for better CPU utilization + ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_SIGNAL_INTERRUPT = ZE_BIT(1), ///< Generate interrupt when Event is signalled on Device + ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_EXTERNAL_INTERRUPT_WAIT = ZE_BIT(2), ///< Host synchronization APIs wait for external interrupt. Can be used only for Events created via zexCounterBasedEventCreate + ZEX_INTEL_EVENT_SYNC_MODE_EXP_EXP_FLAG_FORCE_UINT32 = 0x7fffffff + +} zex_intel_event_sync_mode_exp_flag_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Extended descriptor for event sync mode +/// +/// @details +/// - Implementation must support ::ZEX_intel_experimental_event_sync_mode extension +/// - May be passed to ze_event_desc_t through pNext. +typedef struct _zex_intel_event_sync_mode_exp_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + zex_intel_event_sync_mode_exp_flags_t syncModeFlags; /// valid combination of ::ze_intel_event_sync_mode_exp_flag_t + uint32_t externalInterruptId; /// External interrupt id. Used only when ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_EXTERNAL_INTERRUPT_WAIT flag is set +} zex_intel_event_sync_mode_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Forward-declare zex_intel_queue_allocate_msix_hint_exp_desc_t +typedef struct _zex_intel_queue_allocate_msix_hint_exp_desc_t zex_intel_queue_allocate_msix_hint_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Command queue descriptor for allocating unique msix. This structure may be +/// passed as pNext member of ::ze_command_queue_desc_t. + +typedef struct _zex_intel_queue_allocate_msix_hint_exp_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + ze_bool_t uniqueMsix; ///< [in] If set, try to allocate unique msix for command queue. + ///< If not set, driver will follow default behaviour. It may share msix for signaling completion with other queues. + ///< Number of unique msixes may be limited. On unsuccessful allocation, queue or immediate cmd list creation API fallbacks to default behaviour. + +} zex_intel_queue_allocate_msix_hint_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Command queue descriptor for enabling copy operations offload. This structure may be +/// passed as pNext member of ::ze_command_queue_desc_t. + +typedef struct _zex_intel_queue_copy_operations_offload_hint_exp_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + ze_bool_t copyOffloadEnabled; ///< [in] If set, try to offload copy operations to different engines. Applicable only for compute queues. + ///< This is only a hint. Driver may ignore it per append call, based on platform capabilities or internal heuristics. + ///< If not set, driver will follow default behaviour. Copy operations will be submitted to same engine as compute operations. + +} zex_intel_queue_copy_operations_offload_hint_exp_desc_t; + +#ifndef ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME +/// @brief Queue copy operations offload hint extension name +#define ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME "ZEX_intel_experimental_queue_copy_operations_offload_hint" +#endif // ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Queue copy operations offload hint extension version(s) +typedef enum _zex_intel_queue_copy_operations_offload_hint_exp_version_t { + ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} zex_intel_queue_copy_operations_offload_hint_exp_version_t; + +#ifndef ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME +/// @brief Extension name for query to read the Intel Level Zero Driver Version String +#define ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME "ZE_intel_get_driver_version_string" +#endif // ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Query to read the Intel Level Zero Driver Version String extension version(s) +typedef enum _ze_intel_get_driver_version_string_exp_version_t { + ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} ze_intel_get_driver_version_string_exp_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported 2D Block Array flags +typedef uint32_t ze_intel_device_block_array_exp_flags_t; +typedef enum _ze_intel_device_block_array_exp_flag_t { + ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_STORE = ZE_BIT(0), ///< Supports store operation + ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_LOAD = ZE_BIT(1), ///< Supports load operation + ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_FORCE_UINT32 = 0x7fffffff + +} ze_intel_device_block_array_exp_flag_t; + +/////////////////////////////////////////////////////////////////////////////// +#ifndef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME +/// @brief Device 2D block array properties driver extension name +#define ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME "ZE_intel_experimental_device_block_array_properties" +#endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME + +/// @brief Device 2D block array properties queried using +/// ::zeDeviceGetProperties +/// +/// @details +/// - This structure may be passed to ::zeDeviceGetProperties, via +/// `pNext` member of ::ze_device_properties_t. +/// @brief Device 2D block array properties + +typedef struct _ze_intel_device_block_array_exp_properties_t { + ze_structure_type_ext_t stype = ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES; ///< [in] type of this structure + void *pNext; ///< [in,out][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains sType and pNext). + ze_intel_device_block_array_exp_flags_t flags; ///< [out] 0 (none) or a valid combination of ::ze_intel_device_block_array_exp_flag_t +} ze_intel_device_block_array_exp_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Device 2D block array properties driver extension versions +typedef enum _ze_intel_device_block_array_exp_properties_version_t { + ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES_VERSION_FORCE_UINT32 = 0x7fffffff + +} ze_intel_device_block_array_exp_properties_version_t; + +/// @brief Query to read the Intel Level Zero Driver Version String +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - The Driver Version String will be in the format: +/// - Major.Minor.Patch+Optional per semver guidelines https://semver.org/#spec-item-10 +/// @returns +/// - ::ZE_RESULT_SUCCESS +ze_result_t ZE_APICALL +zeIntelGetDriverVersionString( + ze_driver_handle_t hDriver, ///< [in] Driver handle whose version is being read. + char *pDriverVersion, ///< [in,out] pointer to driver version string. + size_t *pVersionSize); ///< [in,out] pointer to the size of the driver version string. + ///< if size is zero, then the size of the version string is returned. + +/// @brief Get Kernel Program Binary +/// +/// @details +/// - A valid kernel handle must be created with zeKernelCreate. +/// - Returns Intel Graphics Assembly (GEN ISA) format binary program data for kernel handle. +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// @returns +/// - ::ZE_RESULT_SUCCESS + +#ifndef ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME +/// @brief Get Kernel Program Binary experimental name +#define ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME "ZE_intel_experimental_kernel_get_program_binary" +#endif // ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intel Kernel Get Binary Extension Version(s) +typedef enum _ze_intel_kernel_get_binary_exp_version_t { + ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_VERSION_FORCE_UINT32 = 0x7fffffff + +} ze_intel_kernel_get_binary_exp_version_t; + +ze_result_t ZE_APICALL +zeIntelKernelGetBinaryExp( + ze_kernel_handle_t hKernel, ///< [in] Kernel handle + size_t *pSize, ///< [in, out] pointer to variable with size of GEN ISA binary + char *pKernelBinary ///< [in,out] pointer to storage area for GEN ISA binary function +); + +/// @brief Get default context associated with driver +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Default context contains all devices within driver instance +/// @returns +/// - Context handle associated with driver +ze_context_handle_t ZE_APICALL zeDriverGetDefaultContext(ze_driver_handle_t hDriver); ///> [in] handle of the driver + +/// @brief Get default context associated with default driver +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Default context contains all devices within default driver instance +/// @returns +/// - Context handle associated with default driver +ze_context_handle_t ZE_APICALL zerDriverGetDefaultContext(); + +/// @brief Get Device Identifier +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Returned identifier is a 32-bit unsigned integer that is unique to the driver. +/// - The identifier can be used then in zerIdentifierTranslateToDeviceHandle to get the device handle. +/// @returns +/// - 32-bit unsigned integer identifier +uint32_t ZE_APICALL zerDeviceTranslateToIdentifier(ze_device_handle_t hDevice); ///< [in] handle of the device + +/// @brief Translate Device Identifier to Device Handle from default Driver +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Returned device is associated to default driver handle. +/// @returns +/// - device handle associated with the identifier +ze_device_handle_t ZE_APICALL zerIdentifierTranslateToDeviceHandle(uint32_t identifier); ///< [in] integer identifier of the device + +/// @brief Global device synchronization +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Ensures that everything that was submitted to the device is completed. +/// - Ensures that all submissions in all queues on device are completed. +/// - It is not allowed to call this function while some command list are in graph capture mode. +/// - Returns error if error is detected during execution on device. +/// - Hangs indefinitely if GPU execution is blocked on non signaled event. +/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +ze_result_t ZE_APICALL zeDeviceSynchronize(ze_device_handle_t hDevice); ///> [in] handle of the device + +/// @brief Get priority levels +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Returns priority levels supported by the device +/// - lowestPriority reports the numerical value that corresponds to lowest queue priority +/// - highesPriority reports the numerical value that corresponds to highest queue priority +/// - Lower numbers indicate greater priorities +/// - The range of meaningful queue properties is represented by [*highestPriority, *lowestPriority] +/// - Priority passed upon queue creation would automatically clamp down or up to the nearest supported value +/// - 0 means default priority +/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER +ze_result_t ZE_APICALL zeDeviceGetPriorityLevels( + ze_device_handle_t hDevice, + int *lowestPriority, + int *highestPriority); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Descriptor used for setting priority on command queues and immediate command lists. +/// This structure may be passed as pNext member of ::ze_command_queue_desc_t. +typedef struct _ze_queue_priority_desc_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific structure + int priority; ///< [in] priority of the queue +} ze_queue_priority_desc_t; + +/// @brief Append with arguments +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Appends kernel to command list with arguments. +/// - Kernel object state is updated with new arguments, as if separate zeKernelSetArgumentValue were called. +/// - If argument is SLM (size), then SLM size in bytes for this resource is provided under pointer on specific index and its type is size_t. +/// - If argument is an immediate type (i.e. structure, non pointer type), then values under pointer must contain full size of immediate type. +/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION +/// - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `nullptr == hCommandList` +/// + `nullptr == hKernel` +/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER +/// + `nullptr == pArguments` +/// - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT +/// - ::ZE_RESULT_ERROR_INVALID_SIZE +/// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` +typedef struct _ze_group_size_t { + uint32_t groupSizeX; ///< [in] local work-group size in X dimension + uint32_t groupSizeY; ///< [in] local work-group size in Y dimension + uint32_t groupSizeZ; ///< [in] local work-group size in Z dimension + +} ze_group_size_t; + +ze_result_t ZE_APICALL zeCommandListAppendLaunchKernelWithArguments( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object + const ze_group_count_t groupCounts, ///< [in] thread group counts + const ze_group_size_t groupSizes, ///< [in] thread group sizes + void **pArguments, ///< [in] kernel arguments; pointer to list where each argument represents a pointer to the argument value on specific index + void *pNext, ///< [in][optional] extensions + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching + ze_event_handle_t *phWaitEvents); ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait on before launching + +#if defined(__cplusplus) +} // extern "C" +#endif + +#endif diff --git a/thirdparty/level_zero/ze_stypes.h b/thirdparty/level_zero/ze_stypes.h new file mode 100644 index 00000000000000..19b1efac292854 --- /dev/null +++ b/thirdparty/level_zero/ze_stypes.h @@ -0,0 +1,43 @@ +// intel/compute-runtime e96840a03ec41659772ca0bea3338bdd688ae4b5 +/* + * Copyright (C) 2024-2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#ifndef _ZE_STYPES_H +#define _ZE_STYPES_H + +#include +#include + +#include +using ze_structure_type_ext_t = uint32_t; +using zet_structure_type_ext_t = uint32_t; + +#define ZE_STRUCTURE_TYPE_SYNCHRONIZED_DISPATCH_EXP_DESC static_cast(0x00020020) +#define ZE_STRUCTURE_TYPE_INTEL_MEDIA_COMMUNICATION_DESC static_cast(0x00020021) +#define ZE_STRUCTURE_TYPE_INTEL_MEDIA_DOORBELL_HANDLE_DESC static_cast(0x00020022) +#define ZE_STRUCTURE_TYPE_INTEL_DEVICE_MEDIA_EXP_PROPERTIES static_cast(0x00020023) +#define ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES static_cast(0x00030007) +#define ZEX_STRUCTURE_DEVICE_MODULE_REGISTER_FILE_EXP static_cast(0x00030010) +#define ZEX_STRUCTURE_KERNEL_REGISTER_FILE_SIZE_EXP static_cast(0x00030012) +#define ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES static_cast(0x00030013) +#define ZEX_INTEL_STRUCTURE_TYPE_EVENT_SYNC_MODE_EXP_DESC static_cast(0x00030016) +#define ZE_INTEL_STRUCTURE_TYPE_DEVICE_COMMAND_LIST_WAIT_ON_MEMORY_DATA_SIZE_EXP_DESC static_cast(0x00030017) +#define ZEX_INTEL_STRUCTURE_TYPE_QUEUE_ALLOCATE_MSIX_HINT_EXP_PROPERTIES static_cast(0x00030018) +#define ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES static_cast(0x0003001B) +#define ZE_STRUCTURE_INTEL_DEVICE_MEMORY_CXL_EXP_PROPERTIES static_cast(0x00030019) +#define ZEX_STRUCTURE_COUNTER_BASED_EVENT_DESC static_cast(0x0003001C) +#define ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_SYNC_ALLOC_PROPERTIES static_cast(0x0003001D) +#define ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_STORAGE_ALLOC_PROPERTIES static_cast(0x00030027) +#define ZE_STRUCTURE_TYPE_QUEUE_PRIORITY_DESC static_cast(0x00030028) + +// Metric structure types +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_GROUP_CALCULATE_EXP_PROPERTIES static_cast(0x00010008) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_CALCULATE_DESC_EXP static_cast(0x00010009) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_SOURCE_ID_EXP static_cast(0x0001000a) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_DECODED_BUFFER_PROPERTIES_EXP static_cast(0x0001000b) + +#endif From ce58599bef5470e4fac3e7ac6c2fe31ca4c2863e Mon Sep 17 00:00:00 2001 From: Jakub Kasprzak Date: Thu, 10 Jul 2025 16:05:29 +0000 Subject: [PATCH 02/74] Fix L0 DPAS check --- .../include/intel_gpu/runtime/device_info.hpp | 10 ++++++---- src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 1 + src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp | 12 +++++++++++- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index 738515a67b9a1b..d121ace418bd53 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -110,6 +110,8 @@ struct device_info { bool supports_usm; ///< Does engine support unified shared memory. bool has_separate_cache; ///< Does the target hardware has separate cache for usm_device and usm_host + bool supports_cp_offload; ///< [L0] Does the command queue support copy offload + std::vector supported_simd_sizes; ///< List of SIMD sizes supported by current device and compiler uint32_t vendor_id; ///< Vendor ID @@ -131,10 +133,10 @@ struct device_info { pci_bus_info pci_info; ///< PCI bus information for the device - uint64_t timer_resolution; ///< Resolution of device timer used for profiling in cycles/sec - uint32_t kernel_timestamp_valid_bits; ///< Number of valid bits in the kernel timestamp values - uint32_t compute_queue_group_ordinal; ///< Ordinal of the command queue group with compute support - uint32_t device_memory_ordinal; ///< Ordinal of the selected global device memory + uint64_t timer_resolution; ///< [L0] Resolution of device timer used for profiling in cycles/sec + uint32_t kernel_timestamp_valid_bits; ///< [L0] Number of valid bits in the kernel timestamp values + uint32_t compute_queue_group_ordinal; ///< [L0] Ordinal of the command queue group with compute support + uint32_t device_memory_ordinal; ///< [L0] Ordinal of the selected global device memory ov::device::UUID uuid; ///< UUID of the gpu device ov::device::LUID luid; ///< LUID of the gpu device diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index e17db33c52e77c..6d65086072afe3 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -348,6 +348,7 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex info.kernel_timestamp_valid_bits = 0; info.compute_queue_group_ordinal = 0; info.device_memory_ordinal = 0; + info.supports_cp_offload = false; #ifdef ENABLE_ONEDNN_FOR_GPU using namespace dnnl::impl::gpu::intel::jit; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index c4fa62668675fe..0974432a0112ec 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -6,6 +6,7 @@ #include "ze_common.hpp" #include +#include #include #include #include @@ -50,6 +51,10 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0); bool supports_mutable_list = supports_extension(extensions, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, ZE_MUTABLE_COMMAND_LIST_EXP_VERSION_1_0); bool supports_pci_properties = supports_extension(extensions, ZE_PCI_PROPERTIES_EXT_NAME, ZE_PCI_PROPERTIES_EXT_VERSION_1_0); + bool supports_cp_offload = + supports_extension(extensions, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_1_0); + bool supports_dp_properties = + supports_extension(extensions, ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME, ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_1_0); ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, nullptr, 0}; ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, supports_ip_version ? &ip_version_properties : nullptr}; @@ -92,6 +97,10 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic }); ze_device_module_properties_t device_module_properties{ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES}; + ze_intel_device_module_dp_exp_properties_t dp_properties{ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES, nullptr}; + if (supports_dp_properties) { + device_module_properties.pNext = &dp_properties; + } ZE_CHECK(zeDeviceGetModuleProperties(device, &device_module_properties)); ze_device_image_properties_t device_image_properties{ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES}; @@ -136,9 +145,10 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.supports_intel_subgroups_short = true; info.supports_intel_subgroups_char = true; info.supports_intel_required_subgroup_size = true; + info.supports_cp_offload = supports_cp_offload; info.supports_imad = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_DP4A) != 0; - info.supports_immad = false; // FIXME + info.supports_immad = supports_dp_properties && (dp_properties.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS) != 0; info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities; From f38ae587b277fdac3db8fe8e5e655bda2d39ece6 Mon Sep 17 00:00:00 2001 From: Jakub Kasprzak Date: Thu, 10 Jul 2025 16:13:45 +0000 Subject: [PATCH 03/74] Use copy offload only when supported --- src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index c7341c9d30b5a5..46c73e8e286fea 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -185,7 +185,9 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES; cp_offload_desc.copyOffloadEnabled = true; cp_offload_desc.pNext = nullptr; - command_queue_desc.pNext = &cp_offload_desc; + if (info.supports_cp_offload) { + command_queue_desc.pNext = &cp_offload_desc; + } ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); } From 8aaec53e819a17222962438e1ba0e31904af3550 Mon Sep 17 00:00:00 2001 From: Jakub Kasprzak Date: Fri, 11 Jul 2025 13:26:04 +0000 Subject: [PATCH 04/74] Remove dlopen for L0 --- .../intel_gpu/src/runtime/ze/ze_common.cpp | 39 ------------------- .../intel_gpu/src/runtime/ze/ze_common.hpp | 7 ---- .../intel_gpu/src/runtime/ze/ze_engine.cpp | 13 +------ 3 files changed, 1 insertion(+), 58 deletions(-) delete mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp deleted file mode 100644 index fddcda042d7fa0..00000000000000 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "ze_common.hpp" -#include "openvino/core/except.hpp" - -#if defined(__linux__) -#include -#elif defined(_WIN32) -#include "windows.h" -#else -#error "Level Zero is supported on Linux and Windows only" -#endif - -namespace cldnn { -namespace ze { - -void *find_ze_symbol(const char *symbol) { -#if defined(__linux__) - void *handle = dlopen("libze_loader.so.1", RTLD_NOW | RTLD_LOCAL); -#elif defined(_WIN32) - HMODULE handle = LoadLibraryA("ze_loader.dll"); -#endif - if (!handle) { - return nullptr; - } - -#if defined(__linux__) - void *f = dlsym(handle, symbol); -#elif defined(_WIN32) - void *f = GetProcAddress(handle, symbol); -#endif - OPENVINO_ASSERT(f != nullptr); - return f; -} - -} // namespace ze -} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp index 562167458288f3..fc7f98810611e7 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp @@ -21,12 +21,5 @@ namespace ze { static constexpr uint64_t default_timeout = std::numeric_limits::max(); -void* find_ze_symbol(const char *symbol); - -template -F find_ze_symbol(const char *symbol) { - return (F)find_ze_symbol(symbol); -} - } // namespace ze } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index a493ac23005ab0..444169d3dda779 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -23,17 +23,6 @@ namespace ze { namespace { -void func_zeModuleCreate(ze_context_handle_t hContext, - ze_device_handle_t hDevice, const ze_module_desc_t *desc, - ze_module_handle_t *phModule, - ze_module_build_log_handle_t *phBuildLog) { - static auto f = find_ze_symbol("zeModuleCreate"); - - if (!f) - throw std::runtime_error("zeModuleCreate was not found"); - ZE_CHECK(f(hContext, hDevice, desc, phModule, phBuildLog)); -} - ze_module_handle_t ze_create_module_with_level_zero(const cldnn::ze::ze_engine& engine, std::vector binary) { auto desc = ze_module_desc_t(); desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; @@ -47,7 +36,7 @@ ze_module_handle_t ze_create_module_with_level_zero(const cldnn::ze::ze_engine& auto ze_device = engine.get_device(); auto ze_ctx = engine.get_context(); - func_zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr); + zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr); return ze_module; } From f8eb99a5d2e4925cc257a36d8917435a704882a8 Mon Sep 17 00:00:00 2001 From: Jakub Kasprzak Date: Mon, 4 Aug 2025 13:56:24 +0000 Subject: [PATCH 05/74] Add dep_events for l0 mem fill --- .../intel_gpu/src/runtime/ze/ze_memory.cpp | 28 ++++++++++++++++--- .../intel_gpu/src/runtime/ze/ze_memory.hpp | 4 +-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp index 3b65bf73e3c2a7..49048af8572705 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -23,6 +23,18 @@ static inline cldnn::event::ptr create_event(stream& stream, size_t bytes_count) return stream.create_base_event(); } +std::vector get_ze_events(const std::vector& events) { + std::vector ze_events; + ze_events.reserve(events.size()); + for (const auto& ev : events) { + auto ze_event = downcast(ev.get())->get(); + if (ze_event != nullptr) { + ze_events.push_back(ze_event); + } + } + return ze_events; +} + } // namespace allocation_type gpu_usm::detect_allocation_type(const ze_engine* engine, const void* mem_ptr) { @@ -123,12 +135,20 @@ void gpu_usm::unlock(const stream& /* stream */) { } } -event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, bool blocking) { +event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vector& dep_events, bool blocking) { auto& _ze_stream = downcast(stream); auto ev = _ze_stream.create_base_event(); auto ev_ze = downcast(ev.get())->get(); std::vector temp_buffer(_bytes_count, pattern); - ZE_CHECK(zeCommandListAppendMemoryFill(_ze_stream.get_queue(), _buffer.get(), temp_buffer.data(), 1, _bytes_count, ev_ze, 0, nullptr)); + auto ze_dep_events = get_ze_events(dep_events); + ZE_CHECK(zeCommandListAppendMemoryFill(_ze_stream.get_queue(), + _buffer.get(), + temp_buffer.data(), + 1, + _bytes_count, + ev_ze, + ze_dep_events.size(), + ze_dep_events.data())); if (blocking) { ev->wait(); @@ -136,8 +156,8 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, bool blocking) { return ev; } -event::ptr gpu_usm::fill(stream& stream, bool blocking) { - return fill(stream, 0, blocking); +event::ptr gpu_usm::fill(stream& stream, const std::vector& dep_events, bool blocking) { + return fill(stream, 0, dep_events, blocking); } event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp index b1914de00bec49..acb694b2eb4b2f 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp @@ -132,8 +132,8 @@ struct gpu_usm : public lockable_gpu_mem, public memory { const ze::UsmMemory& get_buffer() const { return _buffer; } ze::UsmMemory& get_buffer() { return _buffer; } - event::ptr fill(stream& stream, unsigned char pattern, bool blocking = true) override; - event::ptr fill(stream& stream, bool blocking = true) override; + event::ptr fill(stream& stream, unsigned char pattern, const std::vector& dep_events = {}, bool blocking = true) override; + event::ptr fill(stream& stream, const std::vector& dep_events = {}, bool blocking = true) override; shared_mem_params get_internal_params() const override; void* buffer_ptr() const override { return _buffer.get(); } From 44d0a7962661036ad962bb371cdbe4b240bdabbc Mon Sep 17 00:00:00 2001 From: Jakub Kasprzak Date: Tue, 5 Aug 2025 15:33:14 +0000 Subject: [PATCH 06/74] Add OneDNN with L0 support --- cmake/features.cmake | 5 +---- src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp | 11 +++++++++-- src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 6 +++--- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/cmake/features.cmake b/cmake/features.cmake index 13aa0d20c77c14..346e0cce17fa43 100644 --- a/cmake/features.cmake +++ b/cmake/features.cmake @@ -41,13 +41,10 @@ else() set(ENABLE_ONEDNN_FOR_GPU_DEFAULT ON) endif() +# Set default GPU runtime to L0 for now set(OV_GPU_DEFAULT_RT "L0") if (ENABLE_INTEL_GPU) ov_option_enum (GPU_RT_TYPE "Type of GPU runtime. Supported value: OCL and L0" ${OV_GPU_DEFAULT_RT} ALLOWED_VALUES L0 OCL) - if (GPU_RT_TYPE STREQUAL "L0") - # There's no interop with native L0 in onednn API. Temporary disable onednn when L0 runtime is selected - set(ENABLE_ONEDNN_FOR_GPU_DEFAULT OFF) - endif() endif() ov_dependent_option (ENABLE_ONEDNN_FOR_GPU "Enable oneDNN with GPU support" ${ENABLE_ONEDNN_FOR_GPU_DEFAULT} "ENABLE_INTEL_GPU" OFF) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index 444169d3dda779..f9e1a4874324ab 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -17,7 +17,9 @@ #include #include - +#ifdef ENABLE_ONEDNN_FOR_GPU +#include +#endif namespace cldnn { namespace ze { @@ -54,7 +56,12 @@ ze_engine::ze_engine(const device::ptr dev, runtime_types runtime_type) #ifdef ENABLE_ONEDNN_FOR_GPU void ze_engine::create_onednn_engine(const ExecutionConfig& config) { - OPENVINO_NOT_IMPLEMENTED; + const std::lock_guard lock(onednn_mutex); + OPENVINO_ASSERT(_device->get_info().vendor_id == INTEL_VENDOR_ID, "[GPU] OneDNN engine can be used for Intel GPUs only"); + if (!_onednn_engine) { + auto casted = std::dynamic_pointer_cast(_device); + _onednn_engine = std::make_shared(dnnl::l0_interop::make_engine(casted->get_driver(), casted->get_device(), casted->get_context())); + } } dnnl::engine& ze_engine::get_onednn_engine() const { diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index e47b2b1ffdbb3a..4a41b9463bfb55 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -9,7 +9,7 @@ set(XETLA_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/xetla/include/" CACHE PATH "P # if(ENABLE_ONEDNN_FOR_GPU) - function(build_onednn_gpu) + function(build_onednn_gpu GPU_RUNTIME) include(ExternalProject) set(ONEDNN_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_build") set(ONEDNN_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_install" CACHE PATH "Installation path for oneDNN GPU library") @@ -113,7 +113,7 @@ if(ENABLE_ONEDNN_FOR_GPU) "-DCMAKE_POLICY_DEFAULT_CMP0069=NEW" "-DDNNL_TARGET_ARCH=${ONEDNN_TARGET_ARCH}" "-DDNNL_CPU_RUNTIME=NONE" - "-DDNNL_GPU_RUNTIME=OCL" + "-DDNNL_GPU_RUNTIME=${GPU_RUNTIME}" "-DDNNL_LIBRARY_NAME=${DNNL_GPU_LIBRARY_NAME}" "-DCMAKE_INSTALL_PREFIX=${ONEDNN_INSTALL_DIR}" "-DDNNL_ENABLE_CONCURRENT_EXEC=ON" @@ -188,5 +188,5 @@ if(ENABLE_ONEDNN_FOR_GPU) COMPONENT ${OV_CPACK_COMP_CORE}) endif() endfunction() - build_onednn_gpu() + build_onednn_gpu(${GPU_RT_TYPE}) endif() diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 2aec7fc8c825ec..16a24c084eb18d 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 2aec7fc8c825ec8e03441d1dfd66bffa027e362c +Subproject commit 16a24c084eb18d8a7479f1147ced17f45355ab9d From c3b1a6735dcc35eb34e615d8f21a64da3fb265cd Mon Sep 17 00:00:00 2001 From: Jakub Kasprzak Date: Mon, 11 Aug 2025 16:34:25 +0000 Subject: [PATCH 07/74] Adjust to L0 OneDNN --- .../intel_gpu/src/runtime/ocl/ocl_engine.cpp | 5 ++- .../intel_gpu/src/runtime/ocl/ocl_memory.cpp | 8 ++++ .../intel_gpu/src/runtime/ocl/ocl_stream.cpp | 4 ++ .../intel_gpu/src/runtime/ze/ze_device.cpp | 45 +++++++++++++++++-- .../intel_gpu/src/runtime/ze/ze_memory.cpp | 13 ++++++ .../intel_gpu/src/runtime/ze/ze_memory.hpp | 3 ++ .../intel_gpu/src/runtime/ze/ze_stream.cpp | 16 +++++++ .../intel_gpu/src/runtime/ze/ze_stream.hpp | 2 +- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 9 files changed, 92 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp index e44ce07c8c0a76..0eec437378a501 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp @@ -63,8 +63,11 @@ void ocl_engine::create_onednn_engine(const ExecutionConfig& config) { if (!_onednn_engine) { auto casted = std::dynamic_pointer_cast(_device); OPENVINO_ASSERT(casted, "[GPU] Invalid device type stored in ocl_engine"); - +#ifdef OV_GPU_WITH_ZE_RT + OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime"); +#else _onednn_engine = std::make_shared(dnnl::ocl_interop::make_engine(casted->get_device().get(), casted->get_context().get())); +#endif } } diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp index 1528f546181ff7..21bcc0ef20739e 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp @@ -229,7 +229,11 @@ event::ptr gpu_buffer::copy_to(stream& stream, void* data_ptr, size_t src_offset dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const { auto onednn_engine = _engine->get_onednn_engine(); dnnl::memory dnnl_mem(desc, onednn_engine, DNNL_MEMORY_NONE); +#ifdef OV_GPU_WITH_ZE_RT + OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime"); +#else dnnl::ocl_interop::set_mem_object(dnnl_mem, _buffer.get()); +#endif return dnnl_mem; } #endif @@ -653,9 +657,13 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s #ifdef ENABLE_ONEDNN_FOR_GPU dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const { auto onednn_engine = _engine->get_onednn_engine(); +#ifdef OV_GPU_WITH_ZE_RT + OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime"); +#else dnnl::memory dnnl_mem = dnnl::ocl_interop::make_memory(desc, onednn_engine, dnnl::ocl_interop::memory_kind::usm, reinterpret_cast(_buffer.get()) + offset); return dnnl_mem; +#endif } #endif diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp index f5e9b74a5e681b..a0cb17ba40f615 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp @@ -239,7 +239,11 @@ dnnl::stream& ocl_stream::get_onednn_stream() { OPENVINO_ASSERT(m_queue_type == QueueTypes::in_order, "[GPU] Can't create onednn stream handle as onednn doesn't support out-of-order queue"); OPENVINO_ASSERT(_engine.get_device_info().vendor_id == INTEL_VENDOR_ID, "[GPU] Can't create onednn stream handle as for non-Intel devices"); if (!_onednn_stream) { +#ifdef OV_GPU_WITH_ZE_RT + OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime"); +#else _onednn_stream = std::make_shared(dnnl::ocl_interop::make_stream(_engine.get_onednn_engine(), _command_queue.get())); +#endif } return *_onednn_stream; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index 0974432a0112ec..339b381c0e2e51 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -24,10 +24,32 @@ #include #endif +#ifdef ENABLE_ONEDNN_FOR_GPU +#include "gpu/intel/jit/generator.hpp" +#endif + namespace cldnn { namespace ze { namespace { +#ifdef ENABLE_ONEDNN_FOR_GPU +//TODO merge this with ocl_device +gpu_arch convert_ngen_arch(ngen::HW gpu_arch) { + switch (gpu_arch) { + case ngen::HW::Gen9: return gpu_arch::gen9; + case ngen::HW::Gen11: return gpu_arch::gen11; + case ngen::HW::XeLP: return gpu_arch::xe_lp; + case ngen::HW::XeHP: return gpu_arch::xe_hp; + case ngen::HW::XeHPG: return gpu_arch::xe_hpg; + case ngen::HW::XeHPC: return gpu_arch::xe_hpc; + case ngen::HW::Xe2: return gpu_arch::xe2; + case ngen::HW::Xe3: return gpu_arch::xe3; + case ngen::HW::Gen10: + case ngen::HW::Unknown: return gpu_arch::unknown; + } + return gpu_arch::unknown; +} +#endif bool supports_extension(const std::vector& extensions, const std::string& ext_name, uint32_t ext_ver) { return std::find_if(extensions.begin(), extensions.end(), [&ext_name, &ext_ver](const ze_driver_extension_properties_t& ep) { @@ -153,7 +175,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities; info.gfx_ver = {0, 0, 0}; // could find how to retrieve this from L0 so far - info.arch = gpu_arch::unknown; info.ip_version = ip_version_properties.ipVersion; info.sub_device_idx = (std::numeric_limits::max)(); @@ -208,6 +229,23 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic } } +#ifdef ENABLE_ONEDNN_FOR_GPU + using namespace dnnl::impl::gpu::intel::jit; + // Create temporary context just for OneDNN HW detection + ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 }; + ze_context_handle_t context; + ZE_CHECK(zeContextCreate(driver, &context_desc, &context)); + ngen::Product product = ngen::LevelZeroCodeGenerator::detectHWInfo(context, device); + zeContextDestroy(context); + info.arch = convert_ngen_arch(ngen::getCore(product.family)); + + if (product.family == ngen::ProductFamily::Unknown) { + info.supports_immad = false; + } +#else // ENABLE_ONEDNN_FOR_GPU + info.arch = gpu_arch::unknown; +#endif // ENABLE_ONEDNN_FOR_GPU + return info; } @@ -275,8 +313,9 @@ void ze_device::set_mem_caps(const memory_capabilities& memory_capabilities) { } ze_device::~ze_device() { - if (_is_initialized) - zeContextDestroy(_context); + //FIXME segfault + //if (_is_initialized) + // zeContextDestroy(_context); } } // namespace ze diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp index 49048af8572705..0c774e35b2ce7d 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -11,6 +11,10 @@ #include #include +#ifdef ENABLE_ONEDNN_FOR_GPU +#include +#endif + namespace cldnn { namespace ze { namespace { @@ -236,6 +240,15 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s return result_event; } +#ifdef ENABLE_ONEDNN_FOR_GPU +dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const { + auto onednn_engine = _engine->get_onednn_engine(); + dnnl::memory dnnl_mem = dnnl::l0_interop::make_memory(desc, onednn_engine, + reinterpret_cast(_buffer.get()) + offset); + return dnnl_mem; +} +#endif + shared_mem_params gpu_usm::get_internal_params() const { auto casted = downcast(_engine); return { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp index acb694b2eb4b2f..b4368dc43b34e0 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp @@ -140,6 +140,9 @@ struct gpu_usm : public lockable_gpu_mem, public memory { event::ptr copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override; event::ptr copy_from(stream& stream, const memory& src_mem, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override; event::ptr copy_to(stream& stream, void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) const override; +#ifdef ENABLE_ONEDNN_FOR_GPU + dnnl::memory get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const override; +#endif static allocation_type detect_allocation_type(const ze_engine* engine, const void* mem_ptr); static allocation_type detect_allocation_type(const ze_engine* engine, const ze::UsmMemory& buffer); diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index 46c73e8e286fea..c68c06ad0c3cf2 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -23,6 +23,10 @@ #include #include +#ifdef ENABLE_ONEDNN_FOR_GPU +#include +#endif + namespace cldnn { namespace ze { @@ -346,5 +350,17 @@ void ze_stream::sync_events(std::vector const& deps, bool is_output) } } +#ifdef ENABLE_ONEDNN_FOR_GPU +dnnl::stream& ze_stream::get_onednn_stream() { + OPENVINO_ASSERT(m_queue_type == QueueTypes::in_order, "[GPU] Can't create onednn stream handle as onednn doesn't support out-of-order queue"); + OPENVINO_ASSERT(_engine.get_device_info().vendor_id == INTEL_VENDOR_ID, "[GPU] Can't create onednn stream handle as for non-Intel devices"); + if (!_onednn_stream) { + _onednn_stream = std::make_shared(dnnl::l0_interop::make_stream(_engine.get_onednn_engine(), m_command_list)); + } + + return *_onednn_stream; +} +#endif + } // namespace ze } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp index 8af269489fbe3b..6dc270e399844d 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp @@ -47,7 +47,7 @@ class ze_stream : public stream { event::ptr create_base_event() override; #ifdef ENABLE_ONEDNN_FOR_GPU - dnnl::stream& get_onednn_stream() override { OPENVINO_NOT_IMPLEMENTED; } + dnnl::stream& get_onednn_stream() override; #endif private: diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 16a24c084eb18d..55d2ccb866428a 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 16a24c084eb18d8a7479f1147ced17f45355ab9d +Subproject commit 55d2ccb866428ace7c2eeff221b60a6c1501c6fb From 7f79b07cb507f982d543865a29b93f8ec8d029cd Mon Sep 17 00:00:00 2001 From: Jakub Kasprzak Date: Tue, 12 Aug 2025 15:53:55 +0000 Subject: [PATCH 08/74] Disable OneDNN pooling --- src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp b/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp index 8a308d505cf4b1..81c4c27189cb83 100644 --- a/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp +++ b/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp @@ -16,7 +16,8 @@ using namespace cldnn; const std::vector>& Registry::get_implementations() { static const std::vector> impls = { - OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::PoolingImplementationManager, shape_types::static_shape, [](const program_node& node) { + //FIXME: Disable for now as there is some issue when creating OneDNN descriptor - returns unimplemented + /*OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::PoolingImplementationManager, shape_types::static_shape, [](const program_node& node) { const auto& in_layout = node.get_input_layout(0); const auto& out_layout = node.get_output_layout(0); // Disable this case due to sporadic hang for the following case: @@ -27,7 +28,7 @@ const std::vector>& Registry Date: Wed, 13 Aug 2025 09:18:08 +0000 Subject: [PATCH 09/74] Update L0 OneDNN submodule --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 55d2ccb866428a..4292538ad8c45e 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 55d2ccb866428ace7c2eeff221b60a6c1501c6fb +Subproject commit 4292538ad8c45e1691c85951c54f4a689b8f6597 From 35e492d39d32a630f09f51f89aa0b534da30d9de Mon Sep 17 00:00:00 2001 From: Jakub Kasprzak Date: Wed, 13 Aug 2025 16:54:54 +0000 Subject: [PATCH 10/74] Adjust to new OneDNN --- .../intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp | 2 +- src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp | 2 +- src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp index 66027ff683ce2e..d922ebe02b0da6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp @@ -136,7 +136,7 @@ struct PrimitiveImplOCL : public cldnn::primitive_impl { } } - void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector& cached_kernel_ids, const engine& e) override { + void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector& cached_kernel_ids, const cldnn::engine& e) override { OPENVINO_ASSERT(cached_kernel_ids.size() == _order.size()); for (size_t i = 0; i < cached_kernel_ids.size(); ++i) { _stages[_order[i]]->kernel = kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[i], e); diff --git a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp index 15c60862da1c23..160325c1f9921b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp +++ b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp @@ -18,7 +18,7 @@ #endif #include "gpu/intel/microkernels/package.hpp" -#include "gpu/intel/jit/gemm/include/gemmstone/microkernel_provider.hpp" +#include "gpu/intel/gemm/jit/include/gemmstone/microkernel_provider.hpp" #include "gpu/intel/microkernels/shim.hpp" #include "common/utils.hpp" diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index 4a41b9463bfb55..a48569909ba3da 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -160,7 +160,7 @@ if(ENABLE_ONEDNN_FOR_GPU) "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src" "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/ngen" "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/config" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/gemm/include" + "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/gemm/jit/include" "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party/ngen") set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU DNNL_DLL From 6b586d3e3d45971a009b9fe534e2e9130961747c Mon Sep 17 00:00:00 2001 From: Jakub Kasprzak Date: Mon, 18 Aug 2025 12:38:02 +0000 Subject: [PATCH 11/74] Fix include --- .../functional/remote_tensor_tests/ze_remote_tensor_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp index e9ccd21cc892be..f1a0b17b85e565 100644 --- a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp +++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp @@ -8,7 +8,7 @@ #include "openvino/runtime/remote_tensor.hpp" #include "remote_tensor_tests/helpers.hpp" -#include "base/ov_behavior_test_utils.hpp" +#include "shared_test_classes/base/ov_behavior_test_utils.hpp" TEST(ZeRemoteContext, smoke_CorrectContextType) { auto core = ov::Core(); From afe606bf0872ac6aa0ee4d63e4c66b9d63191034 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 29 Aug 2025 16:38:59 +0000 Subject: [PATCH 12/74] Add new OneDNN for L0 --- src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 4 ++-- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index 6d65086072afe3..78163899bfda57 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -51,7 +51,7 @@ namespace ocl { namespace { -#ifdef ENABLE_ONEDNN_FOR_GPU +#if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT) gpu_arch convert_ngen_arch(ngen::HW gpu_arch) { switch (gpu_arch) { case ngen::HW::Gen9: return gpu_arch::gen9; @@ -350,7 +350,7 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex info.device_memory_ordinal = 0; info.supports_cp_offload = false; -#ifdef ENABLE_ONEDNN_FOR_GPU +#if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT) using namespace dnnl::impl::gpu::intel::jit; ngen::Product product = ngen::OpenCLCodeGenerator::detectHWInfo(context.get(), device.get()); info.arch = convert_ngen_arch(ngen::getCore(product.family)); diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 4292538ad8c45e..c720c108a081cf 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 4292538ad8c45e1691c85951c54f4a689b8f6597 +Subproject commit c720c108a081cf719e326ee942f078d0210cee40 From 66a9c6c0b335e563df8d0157ed9dc73e56382ad6 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 4 Sep 2025 07:40:00 +0000 Subject: [PATCH 13/74] Update L0 OneDNN submodule --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index c720c108a081cf..f9ad03918013da 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit c720c108a081cf719e326ee942f078d0210cee40 +Subproject commit f9ad03918013daa012de817ec3893ff4500f90a6 From 02d4a15448a7cf84879c9dffcb134aad76ce1334 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 5 Sep 2025 15:18:52 +0000 Subject: [PATCH 14/74] Fix ze_stream impl --- .../intel_gpu/src/runtime/ze/ze_stream.cpp | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index c68c06ad0c3cf2..d6436abc33fdc2 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -197,6 +197,8 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) } ze_stream::~ze_stream() { + // Destroy OneDNN stream before destroying command list + _onednn_stream.reset(); zeCommandListDestroy(m_command_list); } @@ -305,22 +307,30 @@ event::ptr ze_stream::create_base_event() { return m_pool.create_event(++m_queue_counter); } -void ze_stream::flush() const { } +void ze_stream::flush() const { + //Immediate Command List submits commands immediately - no flush impl +} void ze_stream::finish() const { ZE_CHECK(zeCommandListHostSynchronize(m_command_list, default_timeout)); } void ze_stream::wait_for_events(const std::vector& events) { + bool needs_sync = false; for (auto& ev : events) { + auto* ze_base_ev = dynamic_cast(ev.get()); + if (ze_base_ev->get() != nullptr) { + ze_base_ev->wait(); + } else { + needs_sync = true; + } + // Block thread and wait for event signal ev->wait(); } - // Enqueue additional event as `events` may contain user events only due to barrier based synchronization - // TODO: Detect that scenarion somehow and don't enqueue extra barrier if not needed - auto ev = std::dynamic_pointer_cast(create_base_event()); - ZE_CHECK(zeCommandListAppendBarrier(m_command_list, ev->get(), 0, nullptr)); - ev->wait(); + if (needs_sync) { + finish(); + } } void ze_stream::sync_events(std::vector const& deps, bool is_output) { From 40d7e4b58f585a396dd7f5a1b56b8e4c9b96713f Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 10 Sep 2025 15:40:32 +0000 Subject: [PATCH 15/74] Update onednn l0 submodule --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index f9ad03918013da..a546d2673382e1 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit f9ad03918013daa012de817ec3893ff4500f90a6 +Subproject commit a546d2673382e18adcc0d0c9cbf8337ffc2a5a8c From b7c56f5c2967f74fea9728f9bfb5a7cea0f05bfa Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 11 Sep 2025 12:58:46 +0000 Subject: [PATCH 16/74] Add dedicated copy queue --- .../include/intel_gpu/runtime/device_info.hpp | 3 ++- .../intel_gpu/src/runtime/ze/ze_device.cpp | 7 ++++++- .../intel_gpu/src/runtime/ze/ze_memory.cpp | 8 ++++---- .../intel_gpu/src/runtime/ze/ze_stream.cpp | 15 ++++++--------- .../intel_gpu/src/runtime/ze/ze_stream.hpp | 8 +++++++- 5 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index d121ace418bd53..63a6b3eb9e6014 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -135,7 +135,8 @@ struct device_info { uint64_t timer_resolution; ///< [L0] Resolution of device timer used for profiling in cycles/sec uint32_t kernel_timestamp_valid_bits; ///< [L0] Number of valid bits in the kernel timestamp values - uint32_t compute_queue_group_ordinal; ///< [L0] Ordinal of the command queue group with compute support + uint32_t compute_queue_group_ordinal; ///< [L0] Ordinal of the command queue group to use for compute + uint32_t copy_queue_group_ordinal; ///< [L0] Ordinal of the command queue group to use for copy uint32_t device_memory_ordinal; ///< [L0] Ordinal of the selected global device memory ov::device::UUID uuid; ///< UUID of the gpu device diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index 339b381c0e2e51..b462c9da1a8d78 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -96,10 +96,14 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, &queue_properties[0])); auto compute_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) { - return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == true; + return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0; + }); + auto copy_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) { + return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0 && (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0; }); OPENVINO_ASSERT(compute_queue_props != queue_properties.end()); + OPENVINO_ASSERT(copy_queue_props != queue_properties.end()); uint32_t memory_properties_count = 0; ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr)); @@ -190,6 +194,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.kernel_timestamp_valid_bits = device_properties.kernelTimestampValidBits; info.timer_resolution = device_properties.timerResolution; info.compute_queue_group_ordinal = std::distance(queue_properties.begin(), compute_queue_props); + info.copy_queue_group_ordinal = std::distance(queue_properties.begin(), copy_queue_props); static_assert(ZE_MAX_DEVICE_UUID_SIZE == ov::device::UUID::MAX_UUID_SIZE, ""); static_assert(ZE_MAX_DEVICE_LUID_SIZE_EXT == ov::device::LUID::MAX_LUID_SIZE, ""); diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp index 0c774e35b2ce7d..218c3390a5c566 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -111,7 +111,7 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::re } GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl; _host_buffer.allocateHost(_bytes_count); - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(), + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_copy_queue(), _host_buffer.get(), _buffer.get(), _bytes_count, @@ -174,7 +174,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_o auto src_ptr = reinterpret_cast(data_ptr) + src_offset; auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(), dst_ptr, src_ptr, _bytes_count, @@ -202,7 +202,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_ auto src_ptr = reinterpret_cast(usm_mem->buffer_ptr()) + src_offset; auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(), dst_ptr, src_ptr, _bytes_count, @@ -226,7 +226,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s auto src_ptr = reinterpret_cast(buffer_ptr()) + src_offset; auto dst_ptr = reinterpret_cast(data_ptr) + dst_offset; - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(), dst_ptr, src_ptr, _bytes_count, diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index d6436abc33fdc2..d4274e6d32e4ba 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -185,21 +185,18 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL; - zex_intel_queue_copy_operations_offload_hint_exp_desc_t cp_offload_desc = {}; - cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES; - cp_offload_desc.copyOffloadEnabled = true; - cp_offload_desc.pNext = nullptr; - if (info.supports_cp_offload) { - command_queue_desc.pNext = &cp_offload_desc; - } - ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); + command_queue_desc.ordinal = info.copy_queue_group_ordinal; + ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list)); } ze_stream::~ze_stream() { // Destroy OneDNN stream before destroying command list _onednn_stream.reset(); - zeCommandListDestroy(m_command_list); + if (m_command_list != nullptr) + zeCommandListDestroy(m_command_list); + if (m_copy_command_list != nullptr) + zeCommandListDestroy(m_copy_command_list); } void ze_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp index 6dc270e399844d..e490ee67e864ec 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp @@ -16,16 +16,21 @@ namespace ze { class ze_stream : public stream { public: ze_command_list_handle_t get_queue() const { return m_command_list; } + ze_command_list_handle_t get_copy_queue() const { return m_copy_command_list; } ze_stream(const ze_engine& engine, const ExecutionConfig& config); ze_stream(ze_stream&& other) : stream(other.m_queue_type, other.m_sync_method) , _engine(other._engine) , m_command_list(other.m_command_list) + , m_copy_command_list(other.m_copy_command_list) , m_queue_counter(other.m_queue_counter.load()) , m_last_barrier(other.m_last_barrier.load()) , m_last_barrier_ev(other.m_last_barrier_ev) - , m_pool(other.m_pool) {} + , m_pool(other.m_pool) { + other.m_command_list = nullptr; + other.m_copy_command_list = nullptr; + } ~ze_stream(); @@ -55,6 +60,7 @@ class ze_stream : public stream { const ze_engine& _engine; mutable ze_command_list_handle_t m_command_list = 0; + mutable ze_command_list_handle_t m_copy_command_list = 0; mutable std::atomic m_queue_counter{0}; std::atomic m_last_barrier{0}; std::shared_ptr m_last_barrier_ev = nullptr; From ab762bfec9e33468568c7300ea44439f3395942b Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 12 Sep 2025 14:03:54 +0000 Subject: [PATCH 17/74] Fix windows compilation --- src/plugins/intel_gpu/CMakeLists.txt | 4 ++++ src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 4 ++-- src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp | 3 +++ src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 8 +++++++- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt index 58f5e74330a4d4..38e3dafbc3305f 100644 --- a/src/plugins/intel_gpu/CMakeLists.txt +++ b/src/plugins/intel_gpu/CMakeLists.txt @@ -38,6 +38,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus") endif() +if(WIN32) + add_definitions(-DNOMINMAX) +endif() + if(ENABLE_GPU_DEBUG_CAPS) add_definitions(-DGPU_DEBUG_CONFIG=1) add_definitions(-DENABLE_DEBUG_CAPS=1) diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index 78163899bfda57..09a8d8794d1c68 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -2,12 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#ifdef ENABLE_ONEDNN_FOR_GPU +#if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT) #ifndef NOMINMAX # define NOMINMAX #endif #include "gpu/intel/jit/generator.hpp" -#endif // ENABLE_ONEDNN_FOR_GPU +#endif #include "ocl_device.hpp" #include "ocl_common.hpp" diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index b462c9da1a8d78..6cef4bf50c0e5c 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -13,6 +13,9 @@ #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif #include #include #include diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index a48569909ba3da..c774352db96325 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -155,14 +155,20 @@ if(ENABLE_ONEDNN_FOR_GPU) DEPENDEES install # Ensures this runs after install ) endif() - + if(GPU_RUNTIME STREQUAL "L0") + set(DNNL_GPU_RUNTIME_VALUE DNNL_RUNTIME_L0) + elseif(GPU_RUNTIME STREQUAL "OCL") + set(DNNL_GPU_RUNTIME_VALUE DNNL_RUNTIME_OCL) + endif() set(LIB_INCLUDE_DIRS "${ONEDNN_INSTALL_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src" "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/ngen" "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/config" "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/gemm/jit/include" + "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party" "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party/ngen") set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU + DNNL_GPU_RUNTIME={$DNNL_GPU_RUNTIME_VALUE} DNNL_DLL DNNL_DLL_EXPORTS DNNL_ENABLE_CPU_ISA_HINTS From 8f48750bd1dee0d0ed948c962f267c65c638068d Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Tue, 16 Sep 2025 15:03:50 +0000 Subject: [PATCH 18/74] Fix onednn_gpu submodule --- .gitmodules | 2 +- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index d9733bc0d844c1..3b95d663eabc6b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -51,7 +51,7 @@ url = https://github.com/nithinn/ncc.git [submodule "thirdparty/onednn_gpu"] path = src/plugins/intel_gpu/thirdparty/onednn_gpu - url = https://github.com/oneapi-src/oneDNN.git + url = https://github.com/jkasprza/oneDNN.git [submodule "thirdparty/json/nlohmann_json"] path = thirdparty/json/nlohmann_json url = https://github.com/nlohmann/json.git diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index a546d2673382e1..ce0e98bf72a6c7 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit a546d2673382e18adcc0d0c9cbf8337ffc2a5a8c +Subproject commit ce0e98bf72a6c79642424e4d097b09a096b3b37f From 862df7e999dcaf65535649cbd1179e130a96d1c3 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 19 Sep 2025 14:18:34 +0000 Subject: [PATCH 19/74] Handle local memory size argument for L0 --- src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index d4274e6d32e4ba..1e865cb421d252 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -50,6 +50,14 @@ ze_result_t set_kernel_arg_scalar(ze_kernel_handle_t& kernel, uint32_t idx, cons return zeKernelSetArgumentValue(kernel, idx, sizeof(T), &val); } +ze_result_t set_kernel_arg_local_memory(ze_kernel_handle_t& kernel, uint32_t idx, size_t size) { + if (size == 0) + return ZE_RESULT_ERROR_INVALID_ARGUMENT; + + GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel << " set arg " << idx << " local memory size: " << size << std::endl; + return zeKernelSetArgumentValue(kernel, idx, size, NULL); +} + ze_result_t set_kernel_arg(ze_kernel_handle_t& kernel, uint32_t idx, cldnn::memory::cptr mem) { if (!mem) return ZE_RESULT_ERROR_INVALID_ARGUMENT; @@ -159,6 +167,11 @@ void set_arguments_impl(ze_kernel_handle_t kernel, case args_t::SHAPE_INFO: status = set_kernel_arg(kernel, i, data.shape_info); break; + case args_t::LOCAL_MEMORY_SIZE: + OPENVINO_ASSERT(args[i].index < data.local_memory_args->size() && data.local_memory_args->at(args[i].index), + "The allocated local memory is necessary to set kernel arguments."); + status = set_kernel_arg_local_memory(kernel, i, data.local_memory_args->at(args[i].index)); + break; default: break; } From fedec683418367775b4832c105a487390c69bd3f Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 24 Sep 2025 14:57:48 +0000 Subject: [PATCH 20/74] Add separate onednn submodule for L0 and OCL * set ocl runtime as default --- .gitmodules | 5 ++++- cmake/features.cmake | 4 ++-- src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp | 4 ++++ src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 7 ++++++- src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 1 + src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 6 files changed, 18 insertions(+), 5 deletions(-) create mode 160000 src/plugins/intel_gpu/thirdparty/l0_onednn_gpu diff --git a/.gitmodules b/.gitmodules index 3b95d663eabc6b..fdd61a6915652a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -51,7 +51,7 @@ url = https://github.com/nithinn/ncc.git [submodule "thirdparty/onednn_gpu"] path = src/plugins/intel_gpu/thirdparty/onednn_gpu - url = https://github.com/jkasprza/oneDNN.git + url = https://github.com/oneapi-src/oneDNN.git [submodule "thirdparty/json/nlohmann_json"] path = thirdparty/json/nlohmann_json url = https://github.com/nlohmann/json.git @@ -93,3 +93,6 @@ [submodule "src/plugins/intel_cpu/thirdparty/xbyak_riscv"] path = src/plugins/intel_cpu/thirdparty/xbyak_riscv url = https://github.com/herumi/xbyak_riscv.git +[submodule "src/plugins/intel_gpu/thirdparty/l0_onednn_gpu"] + path = src/plugins/intel_gpu/thirdparty/l0_onednn_gpu + url = https://github.com/jkasprza/oneDNN.git diff --git a/cmake/features.cmake b/cmake/features.cmake index 04f8fb6b58395a..39fd5990840cef 100644 --- a/cmake/features.cmake +++ b/cmake/features.cmake @@ -41,8 +41,8 @@ else() set(ENABLE_ONEDNN_FOR_GPU_DEFAULT ON) endif() -# Set default GPU runtime to L0 for now -set(OV_GPU_DEFAULT_RT "L0") +# Set default GPU runtime to OCL +set(OV_GPU_DEFAULT_RT "OCL") if (ENABLE_INTEL_GPU) ov_option_enum (GPU_RT_TYPE "Type of GPU runtime. Supported value: OCL and L0" ${OV_GPU_DEFAULT_RT} ALLOWED_VALUES L0 OCL) endif() diff --git a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp index 239e1d1b34a49b..6462b386f76a45 100644 --- a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp @@ -302,7 +302,11 @@ cldnn::ExecutionConfig get_test_default_config(const cldnn::engine& engine, } std::shared_ptr create_test_engine() { +#ifdef OV_GPU_WITH_ZE_RT auto ret = cldnn::engine::create(engine_types::ze, runtime_types::ze); +#elif OV_GPU_WITH_OCL_RT + auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl); +#endif #ifdef ENABLE_ONEDNN_FOR_GPU if (ret->get_device_info().supports_immad) ret->create_onednn_engine({}); diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index c774352db96325..9ebcaaae3c53a8 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -99,11 +99,16 @@ if(ENABLE_ONEDNN_FOR_GPU) set(onednn_gpu_lib "${CMAKE_STATIC_LIBRARY_PREFIX}${DNNL_GPU_LIBRARY_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") set(ONEDNN_GPU_LIB_PATH ${ONEDNN_INSTALL_DIR}/lib/${onednn_gpu_lib} CACHE FILEPATH "Path to oneDNN GPU library") + if(GPU_RUNTIME STREQUAL "L0") + set(ONEDNN_GPU_DIR ${CMAKE_CURRENT_SOURCE_DIR}/l0_onednn_gpu CACHE FILEPATH "Path to oneDNN GPU repository") + elseif(GPU_RUNTIME STREQUAL "OCL") + set(ONEDNN_GPU_DIR ${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu CACHE FILEPATH "Path to oneDNN GPU repository") + endif() ExternalProject_Add(onednn_gpu_build # Directory Options: PREFIX "${ONEDNN_PREFIX_DIR}" - SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu" + SOURCE_DIR "${ONEDNN_GPU_DIR}" BINARY_DIR "${ONEDNN_BUILD_DIR}" INSTALL_DIR "${ONEDNN_INSTALL_DIR}" # Configure Step Options: diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu new file mode 160000 index 00000000000000..ce0e98bf72a6c7 --- /dev/null +++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu @@ -0,0 +1 @@ +Subproject commit ce0e98bf72a6c79642424e4d097b09a096b3b37f diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index ce0e98bf72a6c7..d8fb6faac0418b 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit ce0e98bf72a6c79642424e4d097b09a096b3b37f +Subproject commit d8fb6faac0418b03a598d8f6aaaa865610ff9f40 From 8725b28c821691307f77d9fd51bdf3dd64bffdee Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 25 Sep 2025 10:06:53 +0000 Subject: [PATCH 21/74] Fix onednn include paths --- src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index 9ebcaaae3c53a8..0f0a77d83b24e1 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -166,12 +166,12 @@ if(ENABLE_ONEDNN_FOR_GPU) set(DNNL_GPU_RUNTIME_VALUE DNNL_RUNTIME_OCL) endif() set(LIB_INCLUDE_DIRS "${ONEDNN_INSTALL_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/ngen" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/config" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/gemm/jit/include" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party/ngen") + "${ONEDNN_GPU_DIR}/src" + "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/ngen" + "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/config" + "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include" + "${ONEDNN_GPU_DIR}/third_party" + "${ONEDNN_GPU_DIR}/third_party/ngen") set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU DNNL_GPU_RUNTIME={$DNNL_GPU_RUNTIME_VALUE} DNNL_DLL From 4fa5c19f06fc041b2ef721daf531df8eeec78ac2 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 25 Sep 2025 11:12:12 +0000 Subject: [PATCH 22/74] Prevent redundant opencl linking --- src/plugins/intel_gpu/cmake/utils.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake index 1dc0edebb5fe2d..5b61368a5d4742 100644 --- a/src/plugins/intel_gpu/cmake/utils.cmake +++ b/src/plugins/intel_gpu/cmake/utils.cmake @@ -8,7 +8,8 @@ function(ov_gpu_set_runtime_interface_for TARGET_NAME) target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero) elseif(GPU_RT_TYPE STREQUAL "OCL") target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1) - target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::NewHeaders OpenCL::OpenCL) + # Do not link OpenCL as It is already linked to the targets that require it + # target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::NewHeaders OpenCL::OpenCL) else() message(FATAL_ERROR "Invalid GPU runtime type: `${GPU_RT_TYPE}` Only `L0` and `OCL` are supported") endif() From 933e26202f21199804d6b1e0e6b4a20f618f079e Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 26 Sep 2025 11:29:01 +0000 Subject: [PATCH 23/74] Add new compute runtime l0 headers --- thirdparty/level_zero/ze_intel_gpu.h | 501 ++++++++++++++++++++++++--- thirdparty/level_zero/ze_stypes.h | 32 +- thirdparty/level_zero/zex_common.h | 241 +++++++++++++ thirdparty/level_zero/zex_event.h | 58 ++++ 4 files changed, 780 insertions(+), 52 deletions(-) create mode 100644 thirdparty/level_zero/zex_common.h create mode 100644 thirdparty/level_zero/zex_event.h diff --git a/thirdparty/level_zero/ze_intel_gpu.h b/thirdparty/level_zero/ze_intel_gpu.h index ea83b8e9f6cad9..350be4de550d74 100644 --- a/thirdparty/level_zero/ze_intel_gpu.h +++ b/thirdparty/level_zero/ze_intel_gpu.h @@ -1,4 +1,4 @@ -// intel/compute-runtime e96840a03ec41659772ca0bea3338bdd688ae4b5 +// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac /* * Copyright (C) 2020-2025 Intel Corporation * @@ -9,7 +9,7 @@ #ifndef _ZE_INTEL_GPU_H #define _ZE_INTEL_GPU_H -#include +#include #include "ze_stypes.h" @@ -176,6 +176,17 @@ typedef enum _zex_intel_queue_copy_operations_offload_hint_exp_version_t { ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_FORCE_UINT32 = 0x7fffffff } zex_intel_queue_copy_operations_offload_hint_exp_version_t; +#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Command queue flag for enabling copy operations offload +/// +/// If set, try to offload copy operations to different engines. Applicable only for compute queues. +/// This is only a hint. Driver may ignore it per append call, based on platform capabilities or internal heuristics. +#define ZE_COMMAND_QUEUE_FLAG_COPY_OFFLOAD_HINT ZE_BIT(2) + +#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) + #ifndef ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME /// @brief Extension name for query to read the Intel Level Zero Driver Version String #define ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME "ZE_intel_get_driver_version_string" @@ -214,10 +225,10 @@ typedef enum _ze_intel_device_block_array_exp_flag_t { /// @brief Device 2D block array properties typedef struct _ze_intel_device_block_array_exp_properties_t { - ze_structure_type_ext_t stype = ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES; ///< [in] type of this structure - void *pNext; ///< [in,out][optional] must be null or a pointer to an extension-specific - ///< structure (i.e. contains sType and pNext). - ze_intel_device_block_array_exp_flags_t flags; ///< [out] 0 (none) or a valid combination of ::ze_intel_device_block_array_exp_flag_t + ze_structure_type_ext_t stype; ///< [in] type of this structure + void *pNext; ///< [in,out][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains sType and pNext). + ze_intel_device_block_array_exp_flags_t flags; ///< [out] 0 (none) or a valid combination of ::ze_intel_device_block_array_exp_flag_t } ze_intel_device_block_array_exp_properties_t; /////////////////////////////////////////////////////////////////////////////// @@ -238,7 +249,7 @@ typedef enum _ze_intel_device_block_array_exp_properties_version_t { /// - Major.Minor.Patch+Optional per semver guidelines https://semver.org/#spec-item-10 /// @returns /// - ::ZE_RESULT_SUCCESS -ze_result_t ZE_APICALL +ZE_APIEXPORT ze_result_t ZE_APICALL zeIntelGetDriverVersionString( ze_driver_handle_t hDriver, ///< [in] Driver handle whose version is being read. char *pDriverVersion, ///< [in,out] pointer to driver version string. @@ -255,6 +266,80 @@ zeIntelGetDriverVersionString( /// @returns /// - ::ZE_RESULT_SUCCESS +/////////////////////////////////////////////////////////////////////////////// +#ifndef ZEX_MEMORY_FREE_CALLBACK_EXT_NAME +/// @brief Memory Free Callback Extension Name +#define ZEX_MEMORY_FREE_CALLBACK_EXT_NAME "ZEX_extension_memory_free_callback" + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Memory Free Callback Extension Version(s) +typedef enum _zex_memory_free_callback_ext_version_t { + ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_FORCE_UINT32 = 0x7fffffff ///< Value marking end of ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_* ENUMs + +} zex_memory_free_callback_ext_version_t; + +#ifndef ZEX_STRUCTURE_TYPE_MEMORY_FREE_CALLBACK_EXT_DESC +/// @brief stype for _zex_memory_free_callback_ext_desc_t +#endif + +/** + * @brief Callback function type for memory free events. + * + * This function is called when a memory free operation occurs. + * + * @param pUserData Pointer to user-defined data passed to the callback. + */ +typedef void (*zex_mem_free_callback_fn_t)(void *pUserData); + +/** + * @brief Descriptor for a memory free callback extension. + * + * This structure is used to specify a callback function that will be invoked when memory is freed. + * + * Members: + * - stype: Specifies the type of this structure. + * - pNext: Optional pointer to an extension-specific structure; must be null or point to a structure containing stype and pNext. + * - pfnCallback: Callback function to be called when memory is freed. + * - pUserData: Optional user data to be passed to the callback function. + */ +typedef struct _zex_memory_free_callback_ext_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + zex_mem_free_callback_fn_t pfnCallback; // [in] callback function to be called on memory free + void *pUserData; // [in][optional] user data passed to callback +} zex_memory_free_callback_ext_desc_t; + +/** + * @brief Registers a callback to be invoked when memory is freed. + * + * This function allows the user to register a callback that will be called + * whenever the specified memory is freed within the given context. + * + * @param hContext + * [in] Handle to the context in which the memory was allocated. + * @param hFreeCallbackDesc + * [in] Pointer to a descriptor specifying the callback function and its parameters. + * @param ptr + * [in] Pointer to the memory for which the free callback is to be registered. + * +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_UNINITIALIZED +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_INVALID_ARGUMENT +/// + `nullptr == hFreeCallbackDesc` +/// + `nullptr == ptr` + * + * @note The callback will be invoked when the specified memory is freed. + */ +ZE_APIEXPORT ze_result_t ZE_APICALL zexMemFreeRegisterCallbackExt(ze_context_handle_t hContext, zex_memory_free_callback_ext_desc_t *hFreeCallbackDesc, void *ptr); +#endif // ZEX_MEMORY_FREE_CALLBACK_EXT_NAME + #ifndef ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME /// @brief Get Kernel Program Binary experimental name #define ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME "ZE_intel_experimental_kernel_get_program_binary" @@ -269,22 +354,233 @@ typedef enum _ze_intel_kernel_get_binary_exp_version_t { } ze_intel_kernel_get_binary_exp_version_t; -ze_result_t ZE_APICALL +ZE_APIEXPORT ze_result_t ZE_APICALL zeIntelKernelGetBinaryExp( ze_kernel_handle_t hKernel, ///< [in] Kernel handle size_t *pSize, ///< [in, out] pointer to variable with size of GEN ISA binary char *pKernelBinary ///< [in,out] pointer to storage area for GEN ISA binary function ); -/// @brief Get default context associated with driver +#ifndef ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_NAME +/// @brief DRM format modifier extension name +#define ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_NAME "ZE_intel_experimental_drm_format_modifier" +#endif // ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief DRM format modifier extension Version(s) +typedef enum _ze_intel_drm_format_modifier_exp_version_t { + ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} ze_intel_drm_format_modifier_exp_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Image DRM format modifier properties +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_image_desc_t, +/// when using a DRM format modifier. +/// - Properties struct for providing user with the selected drm format modifier for the image +/// - This is useful if the application wants to export the image to another API that requires the DRM format modifier +/// - The application can query the chosen DRM format modifier for the image. +/// - The application can use this information to choose a DRM format modifier for the image during creation +typedef struct _ze_intel_image_selected_format_modifier_exp_properties_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint64_t drmFormatModifier; ///< [out] DRM format modifier +} ze_intel_image_selected_format_modifier_exp_properties_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Image DRM format modifier create list +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_image_desc_t, +/// when providing a list of DRM format modifiers to choose from during image creation. +/// - This is a descriptor for creating image with the specified list of drm format modifier +/// - If the user passes a list struct, then implementation chooses one from the list of drm modifiers as it sees fit. +/// - If user wants to pass a single drm modifier then they can set the drmFormatModifierCount to 1 and pass the single drm modifier in pDrmFormatModifiers +typedef struct _ze_intel_image_format_modifier_create_list_exp_desc_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint32_t drmFormatModifierCount; ///< [in] number of DRM format modifiers in the list + uint64_t *pDrmFormatModifiers; ///< [in][range(0, drmFormatModifierCount)] array of DRM format modifiers +} ze_intel_image_format_modifier_create_list_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Image DRM format modifier import descriptor +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_image_desc_t, +/// when importing an image with a specific DRM format modifier. +/// - The pNext chain is setup accordingly in ze_image_desc_t prior to calling zeImageCreate API +typedef struct _ze_intel_image_format_modifier_import_exp_desc_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint64_t drmFormatModifier; ///< [in] DRM format modifier to use for the image +} ze_intel_image_format_modifier_import_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Buffer DRM format modifier create list +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_device_mem_alloc_desc_t, +/// when providing a list of DRM format modifiers to choose from during buffer creation. +/// - This is a descriptor for creating buffer with the specified list of drm format modifier +/// - If the user passes a list struct, then implementation chooses one from the list of drm modifiers as it sees fit. +/// - If user wants to pass a single drm modifier then they can set the drmFormatModifierCount to 1 and pass the single drm modifier in pDrmFormatModifiers +/// - The pNext chain is setup accordingly in ze_device_mem_alloc_desc_t prior to calling zeMemAllocDevice API +typedef struct _ze_intel_mem_format_modifier_create_list_exp_desc_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint32_t drmFormatModifierCount; ///< [in] number of DRM format modifiers in the list + uint64_t *pDrmFormatModifiers; ///< [in][range(0, drmFormatModifierCount)] array of DRM format modifiers +} ze_intel_mem_format_modifier_create_list_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Buffer DRM format modifier import descriptor +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_device_mem_alloc_desc_t, +/// when importing a buffer with a specific DRM format modifier. +/// - This descriptor must be used in conjunction with ze_external_memory_import_fd_t. If not, implementation will return an error. +/// - The pNext chain is setup accordingly in ze_device_mem_alloc_desc_t prior to calling zeMemAllocDevice API +typedef struct _ze_intel_mem_format_modifier_import_exp_desc_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint64_t drmFormatModifier; ///< [in] DRM format modifier to use for the buffer +} ze_intel_mem_format_modifier_import_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Buffer DRM format modifier properties +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_memory_allocation_properties_t, +/// when querying the DRM format modifier of a buffer. +/// - Properties struct for providing user with the selected drm format modifier for the buffer +/// - This is useful if the application wants to export the buffer to another API that requires the DRM format modifier +/// - The application can query the chosen DRM format modifier for the buffer via zeMemGetAllocProperties API +typedef struct _ze_intel_mem_selected_format_modifier_exp_properties_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint64_t drmFormatModifier; ///< [out] DRM format modifier +} ze_intel_mem_selected_format_modifier_exp_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Query for supported DRM format modifiers for a given image descriptor /// /// @details /// - The application may call this function from simultaneous threads. /// - The implementation of this function should be lock-free. -/// - Default context contains all devices within driver instance +/// - This function can be used to query supported DRM format modifiers for a specific image description. +/// - User can use this API in two ways: +/// 1. Set pCount to the address of a uint32_t with value 0 and pDrmFormatModifiers to nullptr +/// to query just the number of supported DRM format modifiers. +/// 2. Set pCount to the address of a uint32_t with the number of elements in the pDrmFormatModifiers +/// array to retrieve the list of supported DRM format modifiers. +/// - The application can use the returned DRM format modifiers to: +/// 1. Create L0 images with supported DRM format modifiers. +/// 2. Compare with DRM format modifiers from other APIs (like Vulkan) to find common +/// modifiers that work for interop scenarios. +/// /// @returns -/// - Context handle associated with driver -ze_context_handle_t ZE_APICALL zeDriverGetDefaultContext(ze_driver_handle_t hDriver); ///> [in] handle of the driver +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_UNINITIALIZED +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `nullptr == hDevice` +/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER +/// + `nullptr == pCount` +/// - ::ZE_RESULT_ERROR_INVALID_IMAGE_DESC +/// + The image description doesn't match the device capabilities +ze_result_t ZE_APICALL +zeIntelImageGetFormatModifiersSupportedExp( + ze_device_handle_t hDevice, ///< [in] handle of the device + const ze_image_desc_t *pImageDesc, ///< [in] pointer to image descriptor + uint32_t *pCount, ///< [in,out] pointer to the number of DRM format modifiers. + ///< if count is zero, then the driver shall update the value with the + ///< total number of supported DRM format modifiers for the image format. + ///< if count is greater than the number of supported DRM format modifiers, + ///< then the driver shall update the value with the correct number of supported DRM format modifiers. + uint64_t *pDrmFormatModifiers ///< [in,out][optional][range(0, *pCount)] array of supported DRM format modifiers +); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Query for supported DRM format modifiers for a memory allocation descriptor +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - This function can be used to query supported DRM format modifiers for a specific memory allocation description. +/// - User can use this API in two ways: +/// 1. Set pCount to the address of a uint32_t with value 0 and pDrmFormatModifiers to nullptr +/// to query just the number of supported DRM format modifiers. +/// 2. Set pCount to the address of a uint32_t with the number of elements in the pDrmFormatModifiers +/// array to retrieve the list of supported DRM format modifiers. +/// - The application can use the returned DRM format modifiers to: +/// 1. Create L0 memory allocations with supported DRM format modifiers. +/// 2. Compare with DRM format modifiers from other APIs (like Vulkan) to find common +/// modifiers that work for interop scenarios. +/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_UNINITIALIZED +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `nullptr == hContext` +ze_result_t ZE_APICALL +zeIntelMemGetFormatModifiersSupportedExp( + ze_context_handle_t hContext, ///< [in] handle of the context + const ze_device_mem_alloc_desc_t *pDeviceDesc, ///< [in] pointer to device memory allocation descriptor + size_t size, ///< [in] size in bytes to allocate + size_t alignment, ///< [in] minimum alignment in bytes for the allocation + ze_device_handle_t hDevice, ///< [in] handle of the device + uint32_t *pCount, ///< [in,out] pointer to the number of DRM format modifiers. + ///< if count is zero, then the driver shall update the value with the + ///< total number of supported DRM format modifiers for the memory allocation. + ///< if count is greater than the number of supported DRM format modifiers, + ///< then the driver shall update the value with the correct number of supported DRM format modifiers. + uint64_t *pDrmFormatModifiers ///< [in,out][optional][range(0, *pCount)] array of supported DRM format modifiers +); + +/// @brief Get priority levels +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Returns priority levels supported by the device +/// - lowestPriority reports the numerical value that corresponds to lowest queue priority +/// - highesPriority reports the numerical value that corresponds to highest queue priority +/// - Lower numbers indicate greater priorities +/// - The range of meaningful queue properties is represented by [*highestPriority, *lowestPriority] +/// - Priority passed upon queue creation would automatically clamp down or up to the nearest supported value +/// - 0 means default priority +/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER +ze_result_t ZE_APICALL zeDeviceGetPriorityLevels( + ze_device_handle_t hDevice, + int32_t *lowestPriority, + int32_t *highestPriority); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Descriptor used for setting priority on command queues and immediate command lists. +/// This structure may be passed as pNext member of ::ze_command_queue_desc_t. +typedef struct _ze_queue_priority_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific structure + int priority; ///< [in] priority of the queue +} ze_queue_priority_desc_t; /// @brief Get default context associated with default driver /// @@ -294,7 +590,7 @@ ze_context_handle_t ZE_APICALL zeDriverGetDefaultContext(ze_driver_handle_t hDri /// - Default context contains all devices within default driver instance /// @returns /// - Context handle associated with default driver -ze_context_handle_t ZE_APICALL zerDriverGetDefaultContext(); +ZE_APIEXPORT ze_context_handle_t ZE_APICALL zerGetDefaultContext(); /// @brief Get Device Identifier /// @@ -302,10 +598,10 @@ ze_context_handle_t ZE_APICALL zerDriverGetDefaultContext(); /// - The application may call this function from simultaneous threads. /// - The implementation of this function should be lock-free. /// - Returned identifier is a 32-bit unsigned integer that is unique to the driver. -/// - The identifier can be used then in zerIdentifierTranslateToDeviceHandle to get the device handle. +/// - The identifier can be used then in zerTranslateIdentifierToDeviceHandle to get the device handle. /// @returns /// - 32-bit unsigned integer identifier -uint32_t ZE_APICALL zerDeviceTranslateToIdentifier(ze_device_handle_t hDevice); ///< [in] handle of the device +ZE_APIEXPORT uint32_t ZE_APICALL zerTranslateDeviceHandleToIdentifier(ze_device_handle_t hDevice); ///< [in] handle of the device /// @brief Translate Device Identifier to Device Handle from default Driver /// @@ -315,7 +611,46 @@ uint32_t ZE_APICALL zerDeviceTranslateToIdentifier(ze_device_handle_t hDevice); /// - Returned device is associated to default driver handle. /// @returns /// - device handle associated with the identifier -ze_device_handle_t ZE_APICALL zerIdentifierTranslateToDeviceHandle(uint32_t identifier); ///< [in] integer identifier of the device +ZE_APIEXPORT ze_device_handle_t ZE_APICALL zerTranslateIdentifierToDeviceHandle(uint32_t identifier); ///< [in] integer identifier of the device + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Retrieves a string describing the last error code returned by the +/// default driver in the current thread. +/// +/// @details +/// - String returned is thread local. +/// - String is only updated on calls returning an error, i.e., not on calls +/// returning ::ZE_RESULT_SUCCESS. +/// - String may be empty if driver considers error code is already explicit +/// enough to describe cause. +/// - Memory pointed to by ppString is owned by the driver. +/// - String returned is null-terminated. +/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_UNINITIALIZED +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER +/// + `nullptr == ppString` +ZE_APIEXPORT ze_result_t ZE_APICALL +zerGetLastErrorDescription( + const char **ppString ///< [in,out] pointer to a null-terminated array of characters describing + ///< cause of error. +); + +#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) + +/// @brief Get default context associated with driver +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Default context contains all devices within driver instance +/// @returns +/// - Context handle associated with driver +ze_context_handle_t ZE_APICALL zeDriverGetDefaultContext(ze_driver_handle_t hDriver); ///> [in] handle of the driver /// @brief Global device synchronization /// @@ -333,36 +668,6 @@ ze_device_handle_t ZE_APICALL zerIdentifierTranslateToDeviceHandle(uint32_t iden /// - ::ZE_RESULT_ERROR_DEVICE_LOST ze_result_t ZE_APICALL zeDeviceSynchronize(ze_device_handle_t hDevice); ///> [in] handle of the device -/// @brief Get priority levels -/// -/// @details -/// - The application may call this function from simultaneous threads. -/// - The implementation of this function should be lock-free. -/// - Returns priority levels supported by the device -/// - lowestPriority reports the numerical value that corresponds to lowest queue priority -/// - highesPriority reports the numerical value that corresponds to highest queue priority -/// - Lower numbers indicate greater priorities -/// - The range of meaningful queue properties is represented by [*highestPriority, *lowestPriority] -/// - Priority passed upon queue creation would automatically clamp down or up to the nearest supported value -/// - 0 means default priority -/// -/// @returns -/// - ::ZE_RESULT_SUCCESS -/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER -ze_result_t ZE_APICALL zeDeviceGetPriorityLevels( - ze_device_handle_t hDevice, - int *lowestPriority, - int *highestPriority); - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Descriptor used for setting priority on command queues and immediate command lists. -/// This structure may be passed as pNext member of ::ze_command_queue_desc_t. -typedef struct _ze_queue_priority_desc_t { - ze_structure_type_t stype; ///< [in] type of this structure - const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific structure - int priority; ///< [in] priority of the queue -} ze_queue_priority_desc_t; - /// @brief Append with arguments /// /// @details @@ -400,13 +705,115 @@ ze_result_t ZE_APICALL zeCommandListAppendLaunchKernelWithArguments( const ze_group_count_t groupCounts, ///< [in] thread group counts const ze_group_size_t groupSizes, ///< [in] thread group sizes void **pArguments, ///< [in] kernel arguments; pointer to list where each argument represents a pointer to the argument value on specific index - void *pNext, ///< [in][optional] extensions + const void *pNext, ///< [in][optional] extensions ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching ze_event_handle_t *phWaitEvents); ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait on before launching +/////////////////////////////////////////////////////////////////////////////// +/// @brief Extension descriptor for cooperative kernel launch via pNext chain. +/// @details +/// - This structure can be passed through pNext to zeCommandListAppendLaunchKernelWithParameters +typedef struct _ze_command_list_append_launch_kernel_param_cooperative_desc_t { + ze_structure_type_ext_t stype; ///< [in] Type of this structure (e.g. ZE_STRUCTURE_TYPE_COMMAND_LIST_APPEND_PARAM_COOPERATIVE_DESC) + const void *pNext; ///< [in][optional] Pointer to the next extension-specific structure + ze_bool_t isCooperative; ///< [in] Indicates if the kernel should be launched as cooperative +} ze_command_list_append_launch_kernel_param_cooperative_desc_t; +/// @brief Append with parameters +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Appends kernel to command list with additional parameters via pNext chain. +/// - Allows passing core and extension descriptors (e.g. cooperative kernel). +/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE +/// - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `nullptr == hCommandList` +/// + `nullptr == hKernel` +/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER +/// + `nullptr == pGroupCounts` +/// - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT +/// - ::ZE_RESULT_ERROR_INVALID_SIZE +/// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` +ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendLaunchKernelWithParameters( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object + const ze_group_count_t *pGroupCounts, ///< [in] thread group launch arguments + const void *pNext, ///< [in][optional] additional parameters (pNext chain) + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait on before launching +); + +#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) + #if defined(__cplusplus) } // extern "C" #endif +static const ze_device_mem_alloc_desc_t defaultIntelDeviceMemDesc = { + ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, // stype + nullptr, // pNext + ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED, // flags + 0 // ordinal +}; + +static const ze_host_mem_alloc_desc_t defaultIntelHostMemDesc = { + ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, // stype + nullptr, // pNext + ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED | ZE_HOST_MEM_ALLOC_FLAG_BIAS_INITIAL_PLACEMENT // flags +}; + +static const ze_command_queue_desc_t defaultIntelCommandQueueDesc = { + ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, // stype + nullptr, // pNext + 0, // ordinal + 0, // index + ZE_COMMAND_QUEUE_FLAG_IN_ORDER | ZE_COMMAND_QUEUE_FLAG_COPY_OFFLOAD_HINT, // flags + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, // mode + ZE_COMMAND_QUEUE_PRIORITY_NORMAL // priority +}; + +#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) + +/////////////////////////////////////////////////////////////////////////////// +#ifndef ZE_EXTERNAL_MEMORY_MAPPING_EXT_NAME +/// @brief External Memory Mapping Extension Name +#define ZE_EXTERNAL_MEMORY_MAPPING_EXT_NAME "ZE_extension_external_memmap_sysmem" + +/////////////////////////////////////////////////////////////////////////////// +/// @brief External Memory Mapping Extension Version(s) +typedef enum _ze_external_memmap_sysmem_ext_version_t { + ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_FORCE_UINT32 = 0x7fffffff ///< Value marking end of ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_* ENUMs + +} ze_external_memmap_sysmem_ext_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Maps external system memory for an allocation +/// +/// @details +/// - This structure may be passed to ::zeMemAllocHost, via the `pNext` +/// member of ::ze_host_mem_alloc_desc_t to map system memory for a host +/// allocation. +/// - The system memory pointer and size being mapped must be page aligned +/// based on the supported page sizes on the device. +typedef struct _ze_external_memmap_sysmem_ext_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + const void *pSystemMemory; ///< [in] system memory pointer to map; must be page-aligned. + const uint64_t size; ///< [in] size of the system memory to map; must be page-aligned. + +} ze_external_memmap_sysmem_ext_desc_t; +#endif // ZE_EXTERNAL_MEMORY_MAPPING_EXT_NAME + +#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) #endif diff --git a/thirdparty/level_zero/ze_stypes.h b/thirdparty/level_zero/ze_stypes.h index 19b1efac292854..f0557d8dbc0c83 100644 --- a/thirdparty/level_zero/ze_stypes.h +++ b/thirdparty/level_zero/ze_stypes.h @@ -1,4 +1,4 @@ -// intel/compute-runtime e96840a03ec41659772ca0bea3338bdd688ae4b5 +// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac /* * Copyright (C) 2024-2025 Intel Corporation * @@ -9,17 +9,28 @@ #ifndef _ZE_STYPES_H #define _ZE_STYPES_H -#include -#include +#include +#include +#include #include using ze_structure_type_ext_t = uint32_t; using zet_structure_type_ext_t = uint32_t; +using zes_structure_type_ext_t = uint32_t; #define ZE_STRUCTURE_TYPE_SYNCHRONIZED_DISPATCH_EXP_DESC static_cast(0x00020020) #define ZE_STRUCTURE_TYPE_INTEL_MEDIA_COMMUNICATION_DESC static_cast(0x00020021) #define ZE_STRUCTURE_TYPE_INTEL_MEDIA_DOORBELL_HANDLE_DESC static_cast(0x00020022) #define ZE_STRUCTURE_TYPE_INTEL_DEVICE_MEDIA_EXP_PROPERTIES static_cast(0x00020023) +#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) +#define ZE_STRUCTURE_TYPE_COMMAND_LIST_APPEND_PARAM_COOPERATIVE_DESC static_cast(0x00020036) +#define ZE_STRUCTURE_TYPE_EXTERNAL_MEMMAP_SYSMEM_EXT_DESC static_cast(0x00020037) +#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) +#define ZEX_STRUCTURE_TYPE_LABEL_DESCRIPTOR static_cast(0x00030002) +#define ZEX_STRUCTURE_TYPE_OPERAND_DESCRIPTOR static_cast(0x00030003) +#define ZEX_STRUCTURE_TYPE_VARIABLE_DESCRIPTOR static_cast(0x00030004) +#define ZEX_STRUCTURE_TYPE_TEMP_VARIABLE_DESCRIPTOR static_cast(0x00030005) +#define ZEX_STRUCTURE_TYPE_VARIABLE_INFO static_cast(0x00030006) #define ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES static_cast(0x00030007) #define ZEX_STRUCTURE_DEVICE_MODULE_REGISTER_FILE_EXP static_cast(0x00030010) #define ZEX_STRUCTURE_KERNEL_REGISTER_FILE_SIZE_EXP static_cast(0x00030012) @@ -33,11 +44,22 @@ using zet_structure_type_ext_t = uint32_t; #define ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_SYNC_ALLOC_PROPERTIES static_cast(0x0003001D) #define ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_STORAGE_ALLOC_PROPERTIES static_cast(0x00030027) #define ZE_STRUCTURE_TYPE_QUEUE_PRIORITY_DESC static_cast(0x00030028) +#ifndef ZE_RECORD_REPLAY_GRAPH_EXP_NAME +#define ZE_STRUCTURE_TYPE_RECORD_REPLAY_GRAPH_EXP_PROPERTIES static_cast(0x00030029) +#endif // ZE_RECORD_REPLAY_GRAPH_EXP_NAME +#define ZEX_STRUCTURE_TYPE_MEMORY_FREE_CALLBACK_EXT_DESC static_cast(0x00030030) // Metric structure types -#define ZET_INTEL_STRUCTURE_TYPE_METRIC_GROUP_CALCULATE_EXP_PROPERTIES static_cast(0x00010008) -#define ZET_INTEL_STRUCTURE_TYPE_METRIC_CALCULATE_DESC_EXP static_cast(0x00010009) +#define ZET_STRUCTURE_TYPE_INTEL_METRIC_SCOPE_PROPERTIES_EXP static_cast(0x00010006) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_HW_BUFFER_SIZE_EXP_DESC static_cast(0x00010007) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_GROUP_CALCULATION_EXP_PROPERTIES static_cast(0x00010008) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_CALCULATION_DESC_EXP static_cast(0x00010009) #define ZET_INTEL_STRUCTURE_TYPE_METRIC_SOURCE_ID_EXP static_cast(0x0001000a) #define ZET_INTEL_STRUCTURE_TYPE_METRIC_DECODED_BUFFER_PROPERTIES_EXP static_cast(0x0001000b) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_CALCULABLE_PROPERTIES_EXP static_cast(0x0001000c) + +// Sysman structure types +#define ZES_INTEL_PCI_LINK_SPEED_DOWNGRADE_EXP_STATE static_cast(0x00040001) +#define ZES_INTEL_PCI_LINK_SPEED_DOWNGRADE_EXP_PROPERTIES static_cast(0x00040002) #endif diff --git a/thirdparty/level_zero/zex_common.h b/thirdparty/level_zero/zex_common.h new file mode 100644 index 00000000000000..8a45573aaa164a --- /dev/null +++ b/thirdparty/level_zero/zex_common.h @@ -0,0 +1,241 @@ +// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac +/* + * Copyright (C) 2022-2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#ifndef _ZEX_COMMON_H +#define _ZEX_COMMON_H +#if defined(__cplusplus) +#pragma once +#endif +#include "level_zero/ze_stypes.h" +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Handle of command list object +typedef ze_command_list_handle_t zex_command_list_handle_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Handle of event object +typedef ze_event_handle_t zex_event_handle_t; + +#define ZEX_BIT(_i) (1 << _i) + +typedef uint32_t zex_mem_action_scope_flags_t; +typedef enum _zex_mem_action_scope_flag_t { + ZEX_MEM_ACTION_SCOPE_FLAG_SUBDEVICE = ZEX_BIT(0), + ZEX_MEM_ACTION_SCOPE_FLAG_DEVICE = ZEX_BIT(1), + ZEX_MEM_ACTION_SCOPE_FLAG_HOST = ZEX_BIT(2), + ZEX_MEM_ACTION_SCOPE_FLAG_FORCE_UINT32 = 0x7fffffff +} zex_mem_action_scope_flag_t; + +typedef uint32_t zex_wait_on_mem_action_flags_t; +typedef enum _zex_wait_on_mem_action_flag_t { + ZEX_WAIT_ON_MEMORY_FLAG_EQUAL = ZEX_BIT(0), + ZEX_WAIT_ON_MEMORY_FLAG_NOT_EQUAL = ZEX_BIT(1), + ZEX_WAIT_ON_MEMORY_FLAG_GREATER_THAN = ZEX_BIT(2), + ZEX_WAIT_ON_MEMORY_FLAG_GREATER_THAN_EQUAL = ZEX_BIT(3), + ZEX_WAIT_ON_MEMORY_FLAG_LESSER_THAN = ZEX_BIT(4), + ZEX_WAIT_ON_MEMORY_FLAG_LESSER_THAN_EQUAL = ZEX_BIT(5), + ZEX_WAIT_ON_MEMORY_FLAG_FORCE_UINT32 = 0x7fffffff +} zex_wait_on_mem_action_flag_t; + +typedef struct _zex_wait_on_mem_desc_t { + zex_wait_on_mem_action_flags_t actionFlag; + zex_mem_action_scope_flags_t waitScope; +} zex_wait_on_mem_desc_t; + +typedef struct _zex_write_to_mem_desc_t { + zex_mem_action_scope_flags_t writeScope; +} zex_write_to_mem_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +#ifndef ZE_SYNCHRONIZED_DISPATCH_EXP_NAME +/// @brief Synchronized Dispatch extension name +#define ZE_SYNCHRONIZED_DISPATCH_EXP_NAME "ZE_experimental_synchronized_dispatch" +#endif // ZE_SYNCHRONIZED_DISPATCH_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Synchronized Dispatch extension version(s) +typedef enum _ze_synchronized_dispatch_exp_version_t { + ZE_SYNCHRONIZED_DISPATCH_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_SYNCHRONIZED_DISPATCH_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_SYNCHRONIZED_DISPATCH_EXP_VERSION_FORCE_UINT32 = 0x7fffffff + +} ze_synchronized_dispatch_exp_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported synchronized dispatch flags +typedef uint32_t ze_synchronized_dispatch_exp_flags_t; +typedef enum _ze_synchronized_dispatch_exp_flag_t { + ZE_SYNCHRONIZED_DISPATCH_DISABLED_EXP_FLAG = ZE_BIT(0), ///< Non-synchronized dispatch. Must synchronize only with other synchronized dispatches + ZE_SYNCHRONIZED_DISPATCH_ENABLED_EXP_FLAG = ZE_BIT(1), ///< Synchronized dispatch. Must synchronize with all synchronized and non-synchronized dispatches + ZE_SYNCHRONIZED_DISPATCH_EXP_FLAG_FORCE_UINT32 = 0x7fffffff + +} ze_synchronized_dispatch_exp_flag_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Forward-declare ze_synchronized_dispatch_exp_desc_t +typedef struct _ze_synchronized_dispatch_exp_desc_t ze_synchronized_dispatch_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Command queue or command list descriptor for synchronized dispatch. This structure may be +/// passed as pNext member of ::ze_command_queue_desc_t. or ::ze_command_list_desc_t. +typedef struct _ze_synchronized_dispatch_exp_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + ze_synchronized_dispatch_exp_flags_t flags; ///< [in] mode flags. + ///< must be valid value of ::ze_synchronized_dispatch_exp_flag_t + +} ze_synchronized_dispatch_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Forward-declare ze_intel_media_communication_desc_t +typedef struct _ze_intel_media_communication_desc_t ze_intel_media_communication_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief ze_intel_media_communication_desc_t +typedef struct _ze_intel_media_communication_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific, this will be used to extend this in future + void *controlSharedMemoryBuffer; ///< [in] control shared memory buffer pointer, must be USM address + uint32_t controlSharedMemoryBufferSize; ///< [in] control shared memory buffer size + void *controlBatchBuffer; ///< [in] control batch buffer pointer, must be USM address + uint32_t controlBatchBufferSize; ///< [in] control batch buffer size +} ze_intel_media_communication_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Forward-declare ze_intel_media_doorbell_handle_desc_t +typedef struct _ze_intel_media_doorbell_handle_desc_t ze_intel_media_doorbell_handle_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief ze_intel_media_doorbell_handle_desc_t +/// @details Handle of the doorbell. This structure is passed as argument of zeIntelMediaCommunicationCreate and zeIntelMediaCommunicationDestroy +typedef struct _ze_intel_media_doorbell_handle_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific, this will be used to extend this in future + void *doorbell; ///< [in,out] handle of the doorbell +} ze_intel_media_doorbell_handle_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported device media flags +typedef uint32_t ze_intel_device_media_exp_flags_t; +typedef enum _ze_intel_device_media_exp_flag_t { + ZE_INTEL_DEVICE_MEDIA_SUPPORTS_ENCODING_EXP_FLAG = ZE_BIT(0), ///< Supports encoding + ZE_INTEL_DEVICE_MEDIA_SUPPORTS_DECODING_EXP_FLAG = ZE_BIT(1), ///< Supports decoding + ZE_INTEL_DEVICE_MEDIA_EXP_FLAG_FORCE_UINT32 = 0x7fffffff +} ze_intel_device_media_exp_flag_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Forward-declare ze_intel_device_media_exp_properties_t +typedef struct _ze_intel_device_media_exp_properties_t ze_intel_device_media_exp_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief May be passed to ze_device_properties_t through pNext. +typedef struct _ze_intel_device_media_exp_properties_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ze_intel_device_media_exp_flags_t flags; ///< [out] device media flags + uint32_t numEncoderCores; ///< [out] number of encoder cores + uint32_t numDecoderCores; ///< [out] number of decoder cores +} ze_intel_device_media_exp_properties_t; + +#ifndef ZEX_COUNTER_BASED_EVENT_EXT_NAME +/// @brief Counter Based Event Extension Name +#define ZEX_COUNTER_BASED_EVENT_EXT_NAME "ZEX_counter_based_event" +#endif // ZEX_COUNTER_BASED_EVENT_EXT_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Counter Based Event Extension Version(s) +typedef enum _zex_counter_based_event_version_t { + ZEX_COUNTER_BASED_EVENT_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZEX_COUNTER_BASED_EVENT_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZEX_COUNTER_BASED_EVENT_VERSION_FORCE_UINT32 = 0x7fffffff + +} zex_counter_based_event_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief IPC handle to counter based event +typedef struct _zex_ipc_counter_based_event_handle_t { + char data[ZE_MAX_IPC_HANDLE_SIZE]; ///< [out] Opaque data representing an IPC handle +} zex_ipc_counter_based_event_handle_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported event flags for defining counter-based event +typedef uint32_t zex_counter_based_event_exp_flags_t; +typedef enum _zex_counter_based_event_exp_flag_t { + ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE = ZE_BIT(0), ///< Counter-based event is used for immediate command lists (default) + ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE = ZE_BIT(1), ///< Counter-based event is used for non-immediate command lists + ZEX_COUNTER_BASED_EVENT_FLAG_HOST_VISIBLE = ZE_BIT(2), ///< Signals and waits are also visible to host + ZEX_COUNTER_BASED_EVENT_FLAG_IPC = ZE_BIT(3), ///< Event can be shared across processes for waiting + ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP = ZE_BIT(4), ///< Event contains kernel timestamps + ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_MAPPED_TIMESTAMP = ZE_BIT(5), ///< Event contains kernel timestamps synchronized to host time domain. + ///< Cannot be combined with::ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP + ZEX_COUNTER_BASED_EVENT_FLAG_GRAPH_EXTERNAL_EVENT = ZE_BIT(6), ///< Event when is used in graph record & replay, can be used outside + ///< recorded graph for synchronization (using as wait event or for host synchronization) + ZEX_COUNTER_BASED_EVENT_FLAG_FORCE_UINT32 = 0x7fffffff + +} zex_counter_based_event_exp_flag_t; + +typedef struct _zex_counter_based_event_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + zex_counter_based_event_exp_flags_t flags; ///< [in] counter based event flags. + ///< Must be 0 (default) or a valid combination of ::zex_counter_based_event_exp_flag_t + ze_event_scope_flags_t signalScope; ///< [in] defines the scope of relevant cache hierarchies to flush on a + ///< signal action before the event is triggered. + ///< must be 0 (default) or a valid combination of ::ze_event_scope_flag_t; + ///< default behavior is synchronization within the command list only, no + ///< additional cache hierarchies are flushed. + ze_event_scope_flags_t waitScope; ///< [in] defines the scope of relevant cache hierarchies to invalidate on + ///< a wait action after the event is complete. + ///< must be 0 (default) or a valid combination of ::ze_event_scope_flag_t; + ///< default behavior is synchronization within the command list only, no + ///< additional cache hierarchies are invalidated. +} zex_counter_based_event_desc_t; + +static const zex_counter_based_event_desc_t defaultIntelCounterBasedEventDesc = { + ZEX_STRUCTURE_COUNTER_BASED_EVENT_DESC, // stype + nullptr, // pNext + ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE | + ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE | + ZEX_COUNTER_BASED_EVENT_FLAG_HOST_VISIBLE, // flags + ZE_EVENT_SCOPE_FLAG_HOST, // signalScope + ZE_EVENT_SCOPE_FLAG_DEVICE // waitScope +}; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Initial Counter Based Event synchronization parameters. This structure may be +/// passed as pNext member of ::zex_counter_based_event_desc_t. +typedef struct _zex_counter_based_event_external_sync_alloc_properties_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + uint64_t *deviceAddress; ///< [in] device address for external synchronization allocation + uint64_t *hostAddress; ///< [in] host address for external synchronization allocation + uint64_t completionValue; ///< [in] completion value for external synchronization allocation +} zex_counter_based_event_external_sync_alloc_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Initial Counter Based Event synchronization parameters. This structure may be +/// passed as pNext member of ::zex_counter_based_event_desc_t. +typedef struct _zex_counter_based_event_external_storage_properties_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + uint64_t *deviceAddress; ///< [in] device address that would be updated with atomic_add upon signaling of this event, must be device USM memory + uint64_t incrementValue; ///< [in] value which would by atomically added upon each completion + uint64_t completionValue; ///< [in] final completion value, when value under deviceAddress is equal or greater then this value then event is considered as completed +} zex_counter_based_event_external_storage_properties_t; + +#if defined(__cplusplus) +} // extern "C" +#endif + +#endif // _ZEX_COMMON_EXTENDED_H diff --git a/thirdparty/level_zero/zex_event.h b/thirdparty/level_zero/zex_event.h new file mode 100644 index 00000000000000..68b0bfe18bd573 --- /dev/null +++ b/thirdparty/level_zero/zex_event.h @@ -0,0 +1,58 @@ +// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac +/* + * Copyright (C) 2023-2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#ifndef _ZEX_EVENT_H +#define _ZEX_EVENT_H +#if defined(__cplusplus) +#pragma once +#endif + +#include + +#include "zex_common.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +ZE_APIEXPORT ze_result_t ZE_APICALL +zexEventGetDeviceAddress( + ze_event_handle_t event, + uint64_t *completionValue, + uint64_t *address); + +// deprecated +ZE_APIEXPORT ze_result_t ZE_APICALL +zexCounterBasedEventCreate( + ze_context_handle_t hContext, + ze_device_handle_t hDevice, + uint64_t *deviceAddress, + uint64_t *hostAddress, + uint64_t completionValue, + const ze_event_desc_t *desc, + ze_event_handle_t *phEvent); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexIntelAllocateNetworkInterrupt(ze_context_handle_t hContext, uint32_t &networkInterruptId); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexIntelReleaseNetworkInterrupt(ze_context_handle_t hContext, uint32_t networkInterruptId); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventCreate2(ze_context_handle_t hContext, ze_device_handle_t hDevice, const zex_counter_based_event_desc_t *desc, ze_event_handle_t *phEvent); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventGetIpcHandle(ze_event_handle_t hEvent, zex_ipc_counter_based_event_handle_t *phIpc); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventOpenIpcHandle(ze_context_handle_t hContext, zex_ipc_counter_based_event_handle_t hIpc, ze_event_handle_t *phEvent); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventCloseIpcHandle(ze_event_handle_t hEvent); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexDeviceGetAggregatedCopyOffloadIncrementValue(ze_device_handle_t hDevice, uint32_t *incrementValue); + +#if defined(__cplusplus) +} // extern "C" +#endif + +#endif // _ZEX_EVENT_H From 3993c80ef44838345e78ed9a3c19c33b17af8ec8 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 26 Sep 2025 11:33:57 +0000 Subject: [PATCH 24/74] Fix includes --- thirdparty/level_zero/ze_intel_gpu.h | 2 +- thirdparty/level_zero/ze_stypes.h | 6 +++--- thirdparty/level_zero/zex_common.h | 4 ++-- thirdparty/level_zero/zex_event.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/thirdparty/level_zero/ze_intel_gpu.h b/thirdparty/level_zero/ze_intel_gpu.h index 350be4de550d74..d7c9f3f56aa6bf 100644 --- a/thirdparty/level_zero/ze_intel_gpu.h +++ b/thirdparty/level_zero/ze_intel_gpu.h @@ -9,7 +9,7 @@ #ifndef _ZE_INTEL_GPU_H #define _ZE_INTEL_GPU_H -#include +#include #include "ze_stypes.h" diff --git a/thirdparty/level_zero/ze_stypes.h b/thirdparty/level_zero/ze_stypes.h index f0557d8dbc0c83..50c9dbaa4e35c2 100644 --- a/thirdparty/level_zero/ze_stypes.h +++ b/thirdparty/level_zero/ze_stypes.h @@ -9,9 +9,9 @@ #ifndef _ZE_STYPES_H #define _ZE_STYPES_H -#include -#include -#include +#include +#include +#include #include using ze_structure_type_ext_t = uint32_t; diff --git a/thirdparty/level_zero/zex_common.h b/thirdparty/level_zero/zex_common.h index 8a45573aaa164a..7c8f050c38166e 100644 --- a/thirdparty/level_zero/zex_common.h +++ b/thirdparty/level_zero/zex_common.h @@ -11,8 +11,8 @@ #if defined(__cplusplus) #pragma once #endif -#include "level_zero/ze_stypes.h" -#include +#include "ze_stypes.h" +#include #if defined(__cplusplus) extern "C" { diff --git a/thirdparty/level_zero/zex_event.h b/thirdparty/level_zero/zex_event.h index 68b0bfe18bd573..ec38aaeba9bd80 100644 --- a/thirdparty/level_zero/zex_event.h +++ b/thirdparty/level_zero/zex_event.h @@ -12,7 +12,7 @@ #pragma once #endif -#include +#include #include "zex_common.h" From 078d3a3397ceb93dd88ca03b7390a76a021cfc42 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 26 Sep 2025 14:49:42 +0000 Subject: [PATCH 25/74] Add initial counter based event implementation --- .../src/runtime/ze/ze_cb_event_manager.cpp | 43 ++++++++++++++++ .../src/runtime/ze/ze_cb_event_manager.hpp | 22 ++++++++ .../intel_gpu/src/runtime/ze/ze_event.cpp | 16 +++--- .../intel_gpu/src/runtime/ze/ze_event.hpp | 9 ++-- .../src/runtime/ze/ze_event_manager.hpp | 32 ++++++++++++ .../src/runtime/ze/ze_event_pool.cpp | 50 ------------------- .../src/runtime/ze/ze_event_pool.hpp | 20 +------- .../src/runtime/ze/ze_event_pool_manager.cpp | 45 +++++++++++++++++ .../src/runtime/ze/ze_event_pool_manager.hpp | 26 ++++++++++ .../intel_gpu/src/runtime/ze/ze_stream.cpp | 19 +++++-- .../intel_gpu/src/runtime/ze/ze_stream.hpp | 5 +- 11 files changed, 198 insertions(+), 89 deletions(-) create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp new file mode 100644 index 00000000000000..20cc68518a37a6 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_cb_event_manager.hpp" +#include "ze_common.hpp" +#include "ze_event.hpp" + +#include "zex_event.h" + +using namespace cldnn; +using namespace ze; +namespace { + decltype(zexCounterBasedEventCreate2) *func_zexCounterBasedEventCreate2 = nullptr; + void find_function_address(ze_driver_handle_t driver) { + ZE_CHECK(zeDriverGetExtensionFunctionAddress(driver, + "zexCounterBasedEventCreate2", + reinterpret_cast(&func_zexCounterBasedEventCreate2))); + } +} + +ze_cb_event_manager::ze_cb_event_manager(const ze_engine &engine, bool enable_profiling) + : ze_event_manager(engine, enable_profiling) { + if (func_zexCounterBasedEventCreate2 == nullptr) { + find_function_address(engine.get_driver()); + } +} + +ze_cb_event_manager::~ze_cb_event_manager() {} + +std::shared_ptr ze_cb_event_manager::create_event(uint64_t queue_stamp) { + ze_event_handle_t event; + auto desc = defaultIntelCounterBasedEventDesc; + if (m_enable_profiling) { + desc.flags |= ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP; + } + ZE_CHECK(func_zexCounterBasedEventCreate2(m_engine.get_context(), m_engine.get_device(), &desc, &event)); + return std::make_shared(this, event, queue_stamp); +} + +void ze_cb_event_manager::destroy_event(ze_event *event) { + zeEventDestroy(event->get()); +} diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp new file mode 100644 index 00000000000000..5e0c2a753cf271 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_event_manager.hpp" + +namespace cldnn { +namespace ze { + +// Interface for creating and destroying l0 counter based events +// Should only be used with in-order queue +struct ze_cb_event_manager : public ze_event_manager { +public: + ze_cb_event_manager(const ze_engine &engine, bool enable_profiling); + ~ze_cb_event_manager(); + std::shared_ptr create_event(uint64_t queue_stamp) override; + void destroy_event(ze_event *event) override; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp index c5e39be435798f..fd65c57805dffb 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp @@ -47,11 +47,8 @@ bool ze_event::is_set_impl() { } bool ze_event::is_profiled() const { - if (m_event != nullptr) { - ze_event_pool_flags_t event_pool_flags; - auto ev_pool = m_event_pool.get()->m_handle; - ZE_CHECK(zeEventPoolGetFlags(ev_pool, &event_pool_flags)); - return (event_pool_flags & ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP) != 0; + if (m_event_manager) { + return m_event_manager->is_profiling_enabled(); } return false; } @@ -61,7 +58,7 @@ bool ze_event::get_profiling_info_impl(std::listm_engine; + const auto& engine = m_event_manager->get_engine(); auto device_info = engine.get_device_info(); ze_kernel_timestamp_result_t timestamp{}; @@ -146,7 +143,7 @@ bool ze_events::get_profiling_info_impl(std::list(_events.front().get())->m_event_pool->m_engine; + const auto& engine = downcast(_events.front().get())->m_event_manager->get_engine(); auto device_info = engine.get_device_info(); auto get_total_exec_time = [&device_info](std::vector& all_timestamps) { @@ -210,7 +207,6 @@ bool ze_events::get_profiling_info_impl(std::listdestroy_event(this); } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp index 5120cf9120ec29..b10d2808698775 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp @@ -5,6 +5,7 @@ #pragma once #include "ze_base_event.hpp" +#include "ze_event_manager.hpp" #include "ze_event_pool.hpp" #include @@ -15,9 +16,10 @@ namespace ze { struct ze_event : public ze_base_event { public: - ze_event(ze_event_pool::ptr ev_pool, ze_event_handle_t ev, uint64_t queue_stamp = 0) + ze_event(ze_event_manager *ev_manager, ze_event_handle_t ev, uint64_t queue_stamp = 0, std::shared_ptr event_pool = nullptr) : ze_base_event(queue_stamp) - , m_event_pool(ev_pool) + , m_event_manager(ev_manager) + , m_event_pool(event_pool) , m_event(ev) {} ze_event_handle_t get() override { return m_event; } @@ -34,7 +36,8 @@ struct ze_event : public ze_base_event { friend struct ze_events; protected: - ze_event_pool::ptr m_event_pool; + ze_event_manager *m_event_manager; + std::shared_ptr m_event_pool = nullptr; ze_event_handle_t m_event; }; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp new file mode 100644 index 00000000000000..f2bc0975d7b181 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "ze_engine.hpp" + +namespace cldnn { +namespace ze { + +struct ze_event; + +// Interface for creating and destroying Level Zero events +struct ze_event_manager { +public: + using ptr = std::shared_ptr; + ze_event_manager(const ze_engine &engine, bool enable_profiling) : m_engine(engine), m_enable_profiling(enable_profiling) {} + const ze_engine& get_engine() const { return m_engine; } + bool is_profiling_enabled() const { return m_enable_profiling; } + + virtual ~ze_event_manager() {} + virtual std::shared_ptr create_event(uint64_t queue_stamp) = 0; + virtual void destroy_event(ze_event *event) = 0; +protected: + const ze_engine& m_engine; + bool m_enable_profiling; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp index 1ae696e0d10824..c83179bf6d720d 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp @@ -24,55 +24,5 @@ ze_event_pool::ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_even ze_event_pool::~ze_event_pool() { zeEventPoolDestroy(m_handle); } - -ze_events_pool::ze_events_pool(const ze_engine& engine, bool enable_profiling) - : m_engine(engine) - , m_enable_profiling(enable_profiling) { } - -std::shared_ptr ze_events_pool::create_event(uint64_t queue_stamp) { - if (m_num_used >= m_capacity || !m_current_pool) { - m_num_used = 0; - ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0; - flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; - m_current_pool = std::make_shared(m_engine, m_capacity, flags); - } - - ze_event_handle_t event; - // set signal scope to host to allow wait on the host - // TODO: avoid setting scope when event is not used for wait on the host - ze_event_desc_t event_desc = { - ZE_STRUCTURE_TYPE_EVENT_DESC, - nullptr, - m_num_used++, - ZE_EVENT_SCOPE_FLAG_HOST, - 0 - }; - ZE_CHECK(zeEventCreate(m_current_pool->m_handle, &event_desc, &event)); - - return std::make_shared(m_current_pool, event, queue_stamp); -} - -std::shared_ptr ze_events_pool::create_user_event() { - if (m_num_used_user >= m_capacity || !m_current_user_pool) { - m_num_used_user = 0; - ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0; - flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; - m_current_user_pool = std::make_shared(m_engine, m_capacity, flags); - } - // set signal scope to host to allow wait on the host - // TODO: avoid setting scope when event is not used for wait on the host - ze_event_handle_t event; - ze_event_desc_t event_desc = { - ZE_STRUCTURE_TYPE_EVENT_DESC, - nullptr, - m_num_used_user++, - ZE_EVENT_SCOPE_FLAG_HOST, - 0 - }; - ZE_CHECK(zeEventCreate(m_current_user_pool->m_handle, &event_desc, &event)); - - return std::make_shared(m_current_user_pool, event); -} - } // namespace ze } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp index 197e41b862a422..cec1448cf8be8d 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp @@ -23,24 +23,6 @@ struct ze_event_pool { const ze_engine& m_engine; }; -// Helper for events pool management -// Can hold multiple ze_event_pool objects and track their capacity with realloc when it's needed -struct ze_events_pool { -public: - ze_events_pool(const ze_engine& engine, bool enable_profiling); - - std::shared_ptr create_event(uint64_t queue_stamp = 0); - std::shared_ptr create_user_event(); - -protected: - const ze_engine& m_engine; - std::shared_ptr m_current_user_pool = nullptr; - std::shared_ptr m_current_pool = nullptr; - const uint32_t m_capacity = 100; - uint32_t m_num_used = 0; - uint32_t m_num_used_user = 0; - const bool m_enable_profiling; -}; - } // namespace ze } // namespace cldnn + diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp new file mode 100644 index 00000000000000..4cd744feb77c21 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp @@ -0,0 +1,45 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_event_pool_manager.hpp" +#include "ze_common.hpp" +#include "ze_event.hpp" + +#include "zex_event.h" + +using namespace cldnn; +using namespace ze; + +ze_event_pool_manager::ze_event_pool_manager(const ze_engine &engine, bool enable_profiling, uint32_t capacity) +: ze_event_manager(engine, enable_profiling) +, m_current_pool(nullptr) +, m_capacity(capacity) +, m_num_used(0) {} + +ze_event_pool_manager::~ze_event_pool_manager() {} + +std::shared_ptr ze_event_pool_manager::create_event(uint64_t queue_stamp) { + if (m_num_used >= m_capacity || !m_current_pool) { + m_num_used = 0; + ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0; + flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + m_current_pool = std::make_shared(m_engine, m_capacity, flags); + } + + ze_event_handle_t event; + ze_event_desc_t event_desc = { + ZE_STRUCTURE_TYPE_EVENT_DESC, + nullptr, + m_num_used++, + ZE_EVENT_SCOPE_FLAG_HOST, + 0 + }; + ZE_CHECK(zeEventCreate(m_current_pool->m_handle, &event_desc, &event)); + + return std::make_shared(this, event, queue_stamp, m_current_pool); +} + +void ze_event_pool_manager::destroy_event(ze_event *event) { + zeEventDestroy(event->get()); +} diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp new file mode 100644 index 00000000000000..1fde599001c7fd --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_event_manager.hpp" +#include "ze_event_pool.hpp" + +namespace cldnn { +namespace ze { + +// Interface for creating and destroying l0 events using event pools +struct ze_event_pool_manager : public ze_event_manager { +public: + ze_event_pool_manager(const ze_engine &engine, bool enable_profiling, uint32_t capacity = 255); + ~ze_event_pool_manager(); + std::shared_ptr create_event(uint64_t queue_stamp) override; + void destroy_event(ze_event *event) override; +protected: + std::shared_ptr m_current_pool; + const uint32_t m_capacity; + uint32_t m_num_used; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index 1e865cb421d252..42450c7f6256c5 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -8,7 +8,11 @@ #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/properties.hpp" -#include "ze_event_pool.hpp" + +#include "ze_event_manager.hpp" +#include "ze_cb_event_manager.hpp" +#include "ze_event_pool_manager.hpp" + #include "ze_event.hpp" #include "ze_kernel.hpp" #include "ze_memory.hpp" @@ -185,8 +189,7 @@ void set_arguments_impl(ze_kernel_handle_t kernel, ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) : stream(config.get_queue_type(), stream::get_expected_sync_method(config)) - , _engine(engine) - , m_pool(engine, config.get_enable_profiling()) { + , _engine(engine) { const auto &info = engine.get_device_info(); ze_command_queue_desc_t command_queue_desc = {}; @@ -201,6 +204,11 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); command_queue_desc.ordinal = info.copy_queue_group_ordinal; ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list)); + if (m_queue_type == QueueTypes::in_order) { + m_ev_manager = std::make_unique(engine, config.get_enable_profiling()); + } else { + m_ev_manager = std::make_unique(engine, config.get_enable_profiling()); + } } ze_stream::~ze_stream() { @@ -306,7 +314,8 @@ void ze_stream::wait() { } event::ptr ze_stream::create_user_event(bool set) { - auto ev = m_pool.create_user_event(); + // user event should use different api + auto ev = m_ev_manager->create_event(++m_queue_counter); if (set) ev->set(); @@ -314,7 +323,7 @@ event::ptr ze_stream::create_user_event(bool set) { } event::ptr ze_stream::create_base_event() { - return m_pool.create_event(++m_queue_counter); + return m_ev_manager->create_event(++m_queue_counter); } void ze_stream::flush() const { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp index e490ee67e864ec..2888ab696cec0d 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp @@ -9,6 +9,7 @@ #include "ze_common.hpp" #include "ze_engine.hpp" #include "ze_event.hpp" +#include "ze_event_manager.hpp" namespace cldnn { namespace ze { @@ -27,7 +28,7 @@ class ze_stream : public stream { , m_queue_counter(other.m_queue_counter.load()) , m_last_barrier(other.m_last_barrier.load()) , m_last_barrier_ev(other.m_last_barrier_ev) - , m_pool(other.m_pool) { + , m_ev_manager(other.m_ev_manager.release()) { other.m_command_list = nullptr; other.m_copy_command_list = nullptr; } @@ -64,7 +65,7 @@ class ze_stream : public stream { mutable std::atomic m_queue_counter{0}; std::atomic m_last_barrier{0}; std::shared_ptr m_last_barrier_ev = nullptr; - ze_events_pool m_pool; + std::unique_ptr m_ev_manager; #ifdef ENABLE_ONEDNN_FOR_GPU std::shared_ptr _onednn_stream = nullptr; From 08c1555d02fae79cc8d5813f5ad87c1aaf45d969 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 3 Oct 2025 15:19:43 +0000 Subject: [PATCH 26/74] workaround for cb event host signal --- .../intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp | 4 ++-- .../intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp | 2 +- src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp | 6 +++++- src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp | 5 ++++- .../intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp | 4 ++-- .../intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp | 2 +- src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 4 ++-- 7 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp index 20cc68518a37a6..936f25a6a70822 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp @@ -19,8 +19,8 @@ namespace { } } -ze_cb_event_manager::ze_cb_event_manager(const ze_engine &engine, bool enable_profiling) - : ze_event_manager(engine, enable_profiling) { +ze_cb_event_manager::ze_cb_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling) + : ze_event_manager(engine, cmd_list, enable_profiling) { if (func_zexCounterBasedEventCreate2 == nullptr) { find_function_address(engine.get_driver()); } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp index 5e0c2a753cf271..a5b43580da2b44 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp @@ -13,7 +13,7 @@ namespace ze { // Should only be used with in-order queue struct ze_cb_event_manager : public ze_event_manager { public: - ze_cb_event_manager(const ze_engine &engine, bool enable_profiling); + ze_cb_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling); ~ze_cb_event_manager(); std::shared_ptr create_event(uint64_t queue_stamp) override; void destroy_event(ze_event *event) override; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp index fd65c57805dffb..d7df10b74371c7 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp @@ -35,7 +35,11 @@ void ze_event::wait_impl() { void ze_event::set_impl() { if (m_event != nullptr) { - ZE_CHECK(zeEventHostSignal(m_event)); + if (m_event_manager != nullptr) { + zeCommandListAppendSignalEvent(m_event_manager->get_cmd_list(), m_event); + } else { + ZE_CHECK(zeEventHostSignal(m_event));// Does not work with cb events + } } } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp index f2bc0975d7b181..c852fc295b885c 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp @@ -17,15 +17,18 @@ struct ze_event; struct ze_event_manager { public: using ptr = std::shared_ptr; - ze_event_manager(const ze_engine &engine, bool enable_profiling) : m_engine(engine), m_enable_profiling(enable_profiling) {} + ze_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling) + : m_engine(engine), m_cmd_list(cmd_list), m_enable_profiling(enable_profiling) {} const ze_engine& get_engine() const { return m_engine; } bool is_profiling_enabled() const { return m_enable_profiling; } + ze_command_list_handle_t get_cmd_list() { return m_cmd_list; } virtual ~ze_event_manager() {} virtual std::shared_ptr create_event(uint64_t queue_stamp) = 0; virtual void destroy_event(ze_event *event) = 0; protected: const ze_engine& m_engine; + ze_command_list_handle_t m_cmd_list; bool m_enable_profiling; }; } // namespace ze diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp index 4cd744feb77c21..a437eb69d54c2a 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp @@ -11,8 +11,8 @@ using namespace cldnn; using namespace ze; -ze_event_pool_manager::ze_event_pool_manager(const ze_engine &engine, bool enable_profiling, uint32_t capacity) -: ze_event_manager(engine, enable_profiling) +ze_event_pool_manager::ze_event_pool_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling, uint32_t capacity) +: ze_event_manager(engine, cmd_list, enable_profiling) , m_current_pool(nullptr) , m_capacity(capacity) , m_num_used(0) {} diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp index 1fde599001c7fd..1cac8585e2ea19 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp @@ -13,7 +13,7 @@ namespace ze { // Interface for creating and destroying l0 events using event pools struct ze_event_pool_manager : public ze_event_manager { public: - ze_event_pool_manager(const ze_engine &engine, bool enable_profiling, uint32_t capacity = 255); + ze_event_pool_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling, uint32_t capacity = 255); ~ze_event_pool_manager(); std::shared_ptr create_event(uint64_t queue_stamp) override; void destroy_event(ze_event *event) override; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index 42450c7f6256c5..374f5dbb342985 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -205,9 +205,9 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) command_queue_desc.ordinal = info.copy_queue_group_ordinal; ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list)); if (m_queue_type == QueueTypes::in_order) { - m_ev_manager = std::make_unique(engine, config.get_enable_profiling()); + m_ev_manager = std::make_unique(engine, m_command_list, config.get_enable_profiling()); } else { - m_ev_manager = std::make_unique(engine, config.get_enable_profiling()); + m_ev_manager = std::make_unique(engine, m_command_list, config.get_enable_profiling()); } } From 31311c77b050cc81006ecffd3e7ba6fed4956fcb Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 9 Oct 2025 10:04:45 +0000 Subject: [PATCH 27/74] Rework level zero event implementation --- .../include/intel_gpu/runtime/device_info.hpp | 3 +- .../intel_gpu/src/runtime/ocl/ocl_device.cpp | 1 + .../src/runtime/ze/ze_base_event.hpp | 30 ++- .../src/runtime/ze/ze_base_event_factory.hpp | 28 +++ .../intel_gpu/src/runtime/ze/ze_cb_event.cpp | 72 ++++++ .../intel_gpu/src/runtime/ze/ze_cb_event.hpp | 47 ++++ ...nt_manager.cpp => ze_cb_event_factory.cpp} | 21 +- .../src/runtime/ze/ze_cb_event_factory.hpp | 20 ++ .../src/runtime/ze/ze_cb_event_manager.hpp | 22 -- .../intel_gpu/src/runtime/ze/ze_common.hpp | 7 + .../intel_gpu/src/runtime/ze/ze_device.cpp | 3 + .../src/runtime/ze/ze_empty_event.hpp | 31 +++ .../intel_gpu/src/runtime/ze/ze_event.cpp | 205 +++--------------- .../intel_gpu/src/runtime/ze/ze_event.hpp | 88 ++------ ..._pool_manager.cpp => ze_event_factory.cpp} | 20 +- .../src/runtime/ze/ze_event_factory.hpp | 24 ++ .../src/runtime/ze/ze_event_manager.hpp | 35 --- .../src/runtime/ze/ze_event_pool.hpp | 9 +- .../src/runtime/ze/ze_event_pool_manager.hpp | 26 --- .../intel_gpu/src/runtime/ze/ze_events.cpp | 155 +++++++++++++ .../intel_gpu/src/runtime/ze/ze_events.hpp | 70 ++++++ .../intel_gpu/src/runtime/ze/ze_memory.cpp | 10 +- .../intel_gpu/src/runtime/ze/ze_stream.cpp | 44 ++-- .../intel_gpu/src/runtime/ze/ze_stream.hpp | 6 +- 24 files changed, 588 insertions(+), 389 deletions(-) create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp rename src/plugins/intel_gpu/src/runtime/ze/{ze_cb_event_manager.cpp => ze_cb_event_factory.cpp} (63%) create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp delete mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp rename src/plugins/intel_gpu/src/runtime/ze/{ze_event_pool_manager.cpp => ze_event_factory.cpp} (52%) create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp delete mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp delete mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index 227b51d9b2ecb3..48ca01cb64f1cf 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -115,12 +115,13 @@ struct device_info { bool supports_imad; ///< Does engine support int8 mad. bool supports_immad; ///< Does engine support int8 multi mad. - bool supports_mutable_command_list; ///< Does the target runtime/device support mutable command list feature + bool supports_mutable_command_list; ///< [L0] Does the target runtime/device support mutable command list feature bool supports_usm; ///< Does engine support unified shared memory. bool has_separate_cache; ///< Does the target hardware has separate cache for usm_device and usm_host bool supports_cp_offload; ///< [L0] Does the command queue support copy offload + bool supports_cb_events; ///< [L0] Does the target runtime support counter based events std::vector supported_simd_sizes; ///< List of SIMD sizes supported by current device and compiler diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index 09a8d8794d1c68..bacddd616d948e 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -349,6 +349,7 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex info.compute_queue_group_ordinal = 0; info.device_memory_ordinal = 0; info.supports_cp_offload = false; + info.supports_cb_events = false; #if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT) using namespace dnnl::impl::gpu::intel::jit; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp index 51c69202678fba..0f121e0b8e45b6 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp @@ -5,20 +5,40 @@ #pragma once #include "intel_gpu/runtime/event.hpp" +#include "ze_base_event_factory.hpp" + #include +#include +#include namespace cldnn { namespace ze { +// Base interface for Level Zero events struct ze_base_event : public event { public: - explicit ze_base_event(uint64_t queue_stamp = 0) : event(), _queue_stamp(queue_stamp) { } - uint64_t get_queue_stamp() const { return _queue_stamp; } - void set_queue_stamp(uint64_t val) { _queue_stamp = val; } - virtual ze_event_handle_t get() = 0; + explicit ze_base_event(uint64_t queue_stamp) + : event() + , m_queue_stamp(queue_stamp) { } + uint64_t get_queue_stamp() const { return m_queue_stamp; } + void set_queue_stamp(uint64_t val) { m_queue_stamp = val; } + + virtual ze_event_handle_t get_handle() const = 0; + virtual std::optional query_timestamp() = 0; protected: - uint64_t _queue_stamp = 0; + uint64_t m_queue_stamp = 0; + + static std::chrono::nanoseconds timestamp_to_duration(const device_info &info, const ze_kernel_timestamp_data_t& timestamp) { + constexpr double NS_IN_SEC = 1000000000.0; + const double timestamp_freq = NS_IN_SEC / info.timer_resolution; + const uint64_t timestamp_max_value = ~(-1L << info.kernel_timestamp_valid_bits); + + auto d = (timestamp.kernelEnd >= timestamp.kernelStart) ? + (timestamp.kernelEnd - timestamp.kernelStart) * timestamp_freq + : ((timestamp_max_value - timestamp.kernelStart) + timestamp.kernelEnd + 1) * timestamp_freq; + return std::chrono::nanoseconds(static_cast(d)); + } }; } // namespace ze diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp new file mode 100644 index 00000000000000..8a3febc28f0393 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_engine.hpp" +#include "intel_gpu/runtime/event.hpp" + +namespace cldnn { +namespace ze { + +// Interface for creating Level Zero events +struct ze_base_event_factory { +public: + ze_base_event_factory(const ze_engine &engine, bool enable_profiling) + : m_engine(engine), m_profiling_enabled(enable_profiling) {} + const ze_engine& get_engine() const { return m_engine; } + bool is_profiling_enabled() const { return m_profiling_enabled; } + + virtual ~ze_base_event_factory() {} + virtual event::ptr create_event(uint64_t queue_stamp) = 0; +protected: + const ze_engine& m_engine; + const bool m_profiling_enabled; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp new file mode 100644 index 00000000000000..c7f71f00a394dc --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_cb_event.hpp" +#include "ze/ze_common.hpp" + +#include +#include +#include + +using namespace cldnn; +using namespace ze; + +void ze_cb_event::wait_impl() { + ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout)); +} + +void ze_cb_event::set_impl() { + // Counter based events start in signaled state and can not be signaled from host +} + +bool ze_cb_event::is_set_impl() { + auto ret = zeEventQueryStatus(m_event); + switch (ret) { + case ZE_RESULT_SUCCESS: + return true; + break; + case ZE_RESULT_NOT_READY: + return false; + break; + default: + OPENVINO_THROW("[GPU] Query event returned unexpected value: ", std::to_string(ret)); + break; + } +} + +ze_event_handle_t ze_cb_event::get_handle() const { + return m_event; +} + +std::optional ze_cb_event::query_timestamp() { + if (!m_factory.is_profiling_enabled()) { + return std::nullopt; + } + ze_kernel_timestamp_result_t timestamp{}; + ZE_CHECK(zeEventQueryKernelTimestamp(m_event, ×tamp)); + return timestamp; +} + +bool ze_cb_event::get_profiling_info_impl(std::list& info) { + auto opt_timestamp = query_timestamp(); + if (!opt_timestamp.has_value()) { + return true; + } + ze_kernel_timestamp_result_t timestamp = opt_timestamp.value(); + auto &dev_info = m_factory.get_engine().get_device_info(); + auto wallclock_time = timestamp_to_duration(dev_info, timestamp.global); + auto exec_time = timestamp_to_duration(dev_info, timestamp.context); + + auto period_exec = std::make_shared(timestamp_to_duration(dev_info, timestamp.context)); + auto period_submit = std::make_shared(wallclock_time - exec_time); + + info.push_back({ instrumentation::profiling_stage::executing, period_exec }); + info.push_back({ instrumentation::profiling_stage::submission, period_submit }); + + return true; +} + +ze_cb_event::~ze_cb_event() { + ZE_WARN(zeEventDestroy(m_event)); +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp new file mode 100644 index 00000000000000..08cc5ea34b8d9c --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event.hpp" + +namespace cldnn { +namespace ze { + + +// L0 counter based event. +// Signaled state is inferred from the number of tasks completed on device. +// Resetting counter based event is not allowed. +// Signaling counter based event from host is not allowed. +// Can only be used with in-order command lists. +struct ze_cb_event : public ze_base_event { +public: + // Take ownership of counter based event handle + ze_cb_event(uint64_t queue_stamp, const ze_base_event_factory& factory, ze_event_handle_t ev) + : ze_base_event(queue_stamp) + , m_factory(factory) + , m_event(ev) { + // Ensure event handle is not null + if (ev == nullptr) { + OPENVINO_THROW("[GPU] Trying to create event with null handle"); + } + } + ze_cb_event(const ze_cb_event&) = delete; + ze_cb_event& operator=(const ze_cb_event&) = delete; + ~ze_cb_event(); + + void wait_impl() override; + void set_impl() override; + bool is_set_impl() override; + ze_event_handle_t get_handle() const override; + std::optional query_timestamp() override; + bool get_profiling_info_impl(std::list& info) override; + +protected: + const ze_base_event_factory& m_factory; + ze_event_handle_t m_event; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp similarity index 63% rename from src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp rename to src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp index 936f25a6a70822..d42727f6584a9d 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.cpp @@ -2,9 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ze_cb_event_manager.hpp" +#include "ze_cb_event_factory.hpp" #include "ze_common.hpp" -#include "ze_event.hpp" +#include "ze_cb_event.hpp" #include "zex_event.h" @@ -19,25 +19,20 @@ namespace { } } -ze_cb_event_manager::ze_cb_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling) - : ze_event_manager(engine, cmd_list, enable_profiling) { +ze_cb_event_factory::ze_cb_event_factory(const ze_engine &engine, bool enable_profiling) + : ze_base_event_factory(engine, enable_profiling) { if (func_zexCounterBasedEventCreate2 == nullptr) { find_function_address(engine.get_driver()); } } -ze_cb_event_manager::~ze_cb_event_manager() {} - -std::shared_ptr ze_cb_event_manager::create_event(uint64_t queue_stamp) { +event::ptr ze_cb_event_factory::create_event(uint64_t queue_stamp) { ze_event_handle_t event; auto desc = defaultIntelCounterBasedEventDesc; - if (m_enable_profiling) { + if (is_profiling_enabled()) { desc.flags |= ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP; } ZE_CHECK(func_zexCounterBasedEventCreate2(m_engine.get_context(), m_engine.get_device(), &desc, &event)); - return std::make_shared(this, event, queue_stamp); -} - -void ze_cb_event_manager::destroy_event(ze_event *event) { - zeEventDestroy(event->get()); + auto cb_event = std::make_shared(queue_stamp, *this, event); + return cb_event; } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp new file mode 100644 index 00000000000000..0c6bceaacbd57b --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp @@ -0,0 +1,20 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event_factory.hpp" + +namespace cldnn { +namespace ze { + +// Interface for creating l0 counter based events +// Should only be used with in-order queue +struct ze_cb_event_factory : public ze_base_event_factory { +public: + ze_cb_event_factory(const ze_engine &engine, bool enable_profiling); + event::ptr create_event(uint64_t queue_stamp) override; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp deleted file mode 100644 index a5b43580da2b44..00000000000000 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_manager.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ze_event_manager.hpp" - -namespace cldnn { -namespace ze { - -// Interface for creating and destroying l0 counter based events -// Should only be used with in-order queue -struct ze_cb_event_manager : public ze_event_manager { -public: - ze_cb_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling); - ~ze_cb_event_manager(); - std::shared_ptr create_event(uint64_t queue_stamp) override; - void destroy_event(ze_event *event) override; -}; -} // namespace ze -} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp index fc7f98810611e7..36d820139be816 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp @@ -15,6 +15,13 @@ } \ } while (false) +#define ZE_WARN(f) \ + do { \ + ze_result_t res_ = (f); \ + if (res_ != ZE_RESULT_SUCCESS) { \ + GPU_DEBUG_COUT << ("[Warning] [GPU] " #f " command failed with code " + std::to_string(res_)); \ + } \ + } while (false) namespace cldnn { namespace ze { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index 6cef4bf50c0e5c..0300ba9ff93f5b 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -4,6 +4,7 @@ #include "ze_device.hpp" #include "ze_common.hpp" +#include "zex_common.h" #include #include @@ -76,6 +77,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0); bool supports_mutable_list = supports_extension(extensions, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, ZE_MUTABLE_COMMAND_LIST_EXP_VERSION_1_0); bool supports_pci_properties = supports_extension(extensions, ZE_PCI_PROPERTIES_EXT_NAME, ZE_PCI_PROPERTIES_EXT_VERSION_1_0); + bool supports_cb_events = supports_extension(extensions, ZEX_COUNTER_BASED_EVENT_EXT_NAME, ZEX_COUNTER_BASED_EVENT_VERSION_1_0); bool supports_cp_offload = supports_extension(extensions, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_1_0); bool supports_dp_properties = @@ -175,6 +177,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.supports_intel_subgroups_char = true; info.supports_intel_required_subgroup_size = true; info.supports_cp_offload = supports_cp_offload; + info.supports_cb_events = supports_cb_events; info.supports_imad = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_DP4A) != 0; info.supports_immad = supports_dp_properties && (dp_properties.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS) != 0; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp new file mode 100644 index 00000000000000..0d16bfeb9d2280 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event.hpp" + +namespace cldnn { +namespace ze { + + +// Event that does not have underlying Level Zero event object. +// It is always in signaled state. +struct ze_empty_event : public ze_base_event { +public: + ze_empty_event(uint64_t queue_stamp) + : ze_base_event(queue_stamp) { } + + void wait_impl() override { } + void set_impl() override { } + bool is_set_impl() override { return true; } + ze_event_handle_t get_handle() const override { return nullptr; } + std::optional query_timestamp() override { return std::nullopt; } + bool get_profiling_info_impl(std::list& info) override { + return true; + } +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp index d7df10b74371c7..35069b0692904f 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp @@ -12,197 +12,59 @@ using namespace cldnn; using namespace ze; -namespace { - -std::chrono::nanoseconds timestamp_to_duration(const device_info& device_info, const ze_kernel_timestamp_data_t& timestamp) { - constexpr double NS_IN_SEC = 1000000000.0; - const double timestamp_freq = NS_IN_SEC / device_info.timer_resolution; - const uint64_t timestamp_max_value = ~(-1L << device_info.kernel_timestamp_valid_bits); - - auto d = (timestamp.kernelEnd >= timestamp.kernelStart) ? (timestamp.kernelEnd - timestamp.kernelStart) * timestamp_freq - : ((timestamp_max_value - timestamp.kernelStart) + timestamp.kernelEnd + 1) * timestamp_freq; - - return std::chrono::nanoseconds(static_cast(d)); +void ze_event::reset() { + event::reset(); + ZE_CHECK(zeEventHostReset(m_event)); } -} // namespace - void ze_event::wait_impl() { - if (m_event != nullptr) { - ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout)); - } + ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout)); } void ze_event::set_impl() { - if (m_event != nullptr) { - if (m_event_manager != nullptr) { - zeCommandListAppendSignalEvent(m_event_manager->get_cmd_list(), m_event); - } else { - ZE_CHECK(zeEventHostSignal(m_event));// Does not work with cb events - } - } + ZE_CHECK(zeEventHostSignal(m_event)); } bool ze_event::is_set_impl() { - if (m_event != nullptr) { - return zeEventQueryStatus(m_event) == ZE_RESULT_SUCCESS; + auto ret = zeEventQueryStatus(m_event); + switch (ret) { + case ZE_RESULT_SUCCESS: + return true; + break; + case ZE_RESULT_NOT_READY: + return false; + break; + default: + OPENVINO_THROW("[GPU] Query event returned unexpected value: ", std::to_string(ret)); + break; } - return true; } -bool ze_event::is_profiled() const { - if (m_event_manager) { - return m_event_manager->is_profiling_enabled(); +std::optional ze_event::query_timestamp() { + if (!m_factory.is_profiling_enabled()) { + return std::nullopt; } - return false; -} - -bool ze_event::get_profiling_info_impl(std::list& info) { - if (!is_profiled()) { - return true; - } - - const auto& engine = m_event_manager->get_engine(); - auto device_info = engine.get_device_info(); - ze_kernel_timestamp_result_t timestamp{}; ZE_CHECK(zeEventQueryKernelTimestamp(m_event, ×tamp)); - - auto wallclock_time = timestamp_to_duration(device_info, timestamp.global); - auto exec_time = timestamp_to_duration(device_info, timestamp.context); - - auto period_exec = std::make_shared(timestamp_to_duration(device_info, timestamp.context)); - auto period_submit = std::make_shared(wallclock_time - exec_time); - - info.push_back({ instrumentation::profiling_stage::executing, period_exec }); - info.push_back({ instrumentation::profiling_stage::submission, period_submit }); - - return true; + return timestamp; } -void ze_events::wait_impl() { - if (_last_ze_event != nullptr) { - ZE_CHECK(zeEventHostSynchronize(_last_ze_event, UINT32_MAX)); - } +ze_event_handle_t ze_event::get_handle() const { + return m_event; } -void ze_events::set_impl() { - wait_impl(); -} - -bool ze_events::is_set_impl() { - if (_last_ze_event != nullptr) { - return zeEventQueryStatus(_last_ze_event) == ZE_RESULT_SUCCESS; - } - return true; -} - -bool ze_events::get_profiling_info_impl(std::list& info) { - // The goal is to sum up all disjoint durations of its projection on the time axis - std::vector all_global_timestamps; - std::vector all_context_timestamps; - - auto add_or_merge = [](std::vector& all_timestamps, const ze_kernel_timestamp_data_t& ts) { - auto it = all_timestamps.begin(); - bool merged = false; - auto target_timestamp = ts; - while (it != all_timestamps.end()) { - auto& timestamp = *it; - bool disjoint = timestamp.kernelEnd < target_timestamp.kernelStart || timestamp.kernelStart > target_timestamp.kernelEnd; - bool equal = timestamp.kernelEnd == target_timestamp.kernelEnd && timestamp.kernelStart == target_timestamp.kernelStart; - if (!disjoint) { - if (equal) { - if (!merged) { - merged = true; - break; - } else { - it = all_timestamps.erase(it); - } - } else { - if (!merged) { - timestamp.kernelStart = std::min(timestamp.kernelStart, target_timestamp.kernelStart); - timestamp.kernelEnd = std::max(timestamp.kernelEnd, target_timestamp.kernelEnd); - target_timestamp = timestamp; - merged = true; - it++; - } else { - if (timestamp.kernelEnd > target_timestamp.kernelEnd) { - it--; - it->kernelEnd = target_timestamp.kernelEnd; - it++; - } - it = all_timestamps.erase(it); - } - } - } else { - it++; - } - } - - if (!merged) { - all_timestamps.push_back(target_timestamp); - } - }; - - if (_events.empty()) - return false; - - const auto& engine = downcast(_events.front().get())->m_event_manager->get_engine(); - auto device_info = engine.get_device_info(); - - auto get_total_exec_time = [&device_info](std::vector& all_timestamps) { - std::chrono::nanoseconds total_time{0}; - for (const auto& ts : all_timestamps) { - total_time += timestamp_to_duration(device_info, ts); - } - - return total_time; - }; - - // Submission time is calculated as difference between merged context and wallclock intervals - // May probably be more accurate if we sum all sub-intervals of wallclock timestamps not covered by execution intervals - using intervals_t = std::vector; - auto get_submission_time = [&device_info](const intervals_t& s_timestamps, - const intervals_t& e_timestamps) { - auto get_minmax = [](const intervals_t& timestamps) { - uint64_t min_val = std::min(timestamps.begin(), timestamps.end(), - [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) { - return lhs->kernelStart < rhs->kernelStart; - })->kernelStart; - uint64_t max_val = std::max(timestamps.begin(), timestamps.end(), - [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) { - return lhs->kernelEnd < rhs->kernelEnd; - })->kernelEnd; - - return ze_kernel_timestamp_data_t{min_val, max_val}; - }; - - auto submission_interval = get_minmax(s_timestamps); - auto exec_interval = get_minmax(e_timestamps); - - auto wallclock_time = timestamp_to_duration(device_info, submission_interval); - auto exec_time = timestamp_to_duration(device_info, exec_interval); - - return wallclock_time - exec_time; - }; - - for (size_t i = 0; i < _events.size(); i++) { - auto be = downcast(_events[i].get()); - if (!be->is_profiled()) { - continue; - } - ze_kernel_timestamp_result_t timestamp{}; - ZE_CHECK(zeEventQueryKernelTimestamp(be->get(), ×tamp)); - - add_or_merge(all_global_timestamps, timestamp.global); - add_or_merge(all_context_timestamps, timestamp.context); +bool ze_event::get_profiling_info_impl(std::list& info) { + auto opt_timestamp = query_timestamp(); + if (!opt_timestamp.has_value()) { + return true; } + ze_kernel_timestamp_result_t timestamp = opt_timestamp.value(); + auto &dev_info = m_factory.get_engine().get_device_info(); + auto wallclock_time = timestamp_to_duration(dev_info, timestamp.global); + auto exec_time = timestamp_to_duration(dev_info, timestamp.context); - auto submit_time = get_submission_time(all_global_timestamps, all_context_timestamps); - auto exec_time = get_total_exec_time(all_context_timestamps); - - auto period_exec = std::make_shared(exec_time); - auto period_submit = std::make_shared(submit_time); + auto period_exec = std::make_shared(timestamp_to_duration(dev_info, timestamp.context)); + auto period_submit = std::make_shared(wallclock_time - exec_time); info.push_back({ instrumentation::profiling_stage::executing, period_exec }); info.push_back({ instrumentation::profiling_stage::submission, period_submit }); @@ -211,6 +73,5 @@ bool ze_events::get_profiling_info_impl(std::listdestroy_event(this); + ZE_WARN(zeEventDestroy(m_event)); } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp index b10d2808698775..4c7e02ed32ee08 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp @@ -5,92 +5,44 @@ #pragma once #include "ze_base_event.hpp" -#include "ze_event_manager.hpp" #include "ze_event_pool.hpp" -#include -#include - namespace cldnn { namespace ze { +// L0 event. Can be either in signaled state or not signaled state. struct ze_event : public ze_base_event { public: - ze_event(ze_event_manager *ev_manager, ze_event_handle_t ev, uint64_t queue_stamp = 0, std::shared_ptr event_pool = nullptr) + // Take ownership of event handle + ze_event(uint64_t queue_stamp, const ze_base_event_factory& factory, ze_event_handle_t ev, std::shared_ptr event_pool) : ze_base_event(queue_stamp) - , m_event_manager(ev_manager) , m_event_pool(event_pool) - , m_event(ev) {} - - ze_event_handle_t get() override { return m_event; } - bool is_profiled() const; - + , m_factory(factory) + , m_event(ev) { + // Ensure event handle is not null + if (ev == nullptr) { + OPENVINO_THROW("[GPU] Trying to create event with null handle"); + } + } + ze_event(const ze_event &) = delete; + ze_event& operator=(const ze_event &) = delete; ~ze_event(); + void reset() override; -private: - void wait_impl() override; - void set_impl() override; - bool is_set_impl() override; + std::optional query_timestamp() override; + ze_event_handle_t get_handle() const override; bool get_profiling_info_impl(std::list& info) override; - friend struct ze_events; - protected: - ze_event_manager *m_event_manager; - std::shared_ptr m_event_pool = nullptr; - ze_event_handle_t m_event; -}; - -struct ze_events : public ze_base_event { -public: - ze_events(std::vector const& ev) - : ze_base_event(0) { - process_events(ev); - } - - ze_event_handle_t get() override { return _last_ze_event; } - - void reset() override { - event::reset(); - _events.clear(); - } - -private: void wait_impl() override; void set_impl() override; bool is_set_impl() override; + // TODO: Implement add_event_handler_impl + // bool add_event_handler_impl(event_handler, void*) override; - void process_events(const std::vector& ev) { - for (size_t i = 0; i < ev.size(); i++) { - auto multiple_events = dynamic_cast(ev[i].get()); - if (multiple_events) { - for (size_t j = 0; j < multiple_events->_events.size(); j++) { - if (auto base_ev = dynamic_cast(multiple_events->_events[j].get())) { - auto current_ev_queue_stamp = base_ev->get_queue_stamp(); - if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) { - _queue_stamp = current_ev_queue_stamp; - _last_ze_event = base_ev->get(); - } - } - _events.push_back(multiple_events->_events[j]); - } - } else { - if (auto base_ev = dynamic_cast(ev[i].get())) { - auto current_ev_queue_stamp = base_ev->get_queue_stamp(); - if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) { - _queue_stamp = current_ev_queue_stamp; - _last_ze_event = base_ev->get(); - } - } - _events.push_back(ev[i]); - } - } - } - - bool get_profiling_info_impl(std::list& info) override; - - ze_event_handle_t _last_ze_event; - std::vector _events; + std::shared_ptr m_event_pool; + const ze_base_event_factory& m_factory; + ze_event_handle_t m_event; }; } // namespace ze diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp similarity index 52% rename from src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp rename to src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp index a437eb69d54c2a..6ec7fc4e8ab3c4 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ze_event_pool_manager.hpp" +#include "ze_event_factory.hpp" #include "ze_common.hpp" #include "ze_event.hpp" @@ -11,18 +11,16 @@ using namespace cldnn; using namespace ze; -ze_event_pool_manager::ze_event_pool_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling, uint32_t capacity) -: ze_event_manager(engine, cmd_list, enable_profiling) +ze_event_factory::ze_event_factory(const ze_engine &engine, bool enable_profiling, uint32_t capacity) +: ze_base_event_factory(engine, enable_profiling) , m_current_pool(nullptr) , m_capacity(capacity) -, m_num_used(0) {} +, m_num_used(0) { } -ze_event_pool_manager::~ze_event_pool_manager() {} - -std::shared_ptr ze_event_pool_manager::create_event(uint64_t queue_stamp) { +event::ptr ze_event_factory::create_event(uint64_t queue_stamp) { if (m_num_used >= m_capacity || !m_current_pool) { m_num_used = 0; - ze_event_pool_flags_t flags = m_enable_profiling ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0; + ze_event_pool_flags_t flags = is_profiling_enabled() ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0; flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; m_current_pool = std::make_shared(m_engine, m_capacity, flags); } @@ -37,9 +35,5 @@ std::shared_ptr ze_event_pool_manager::create_event(uint64_t queue_sta }; ZE_CHECK(zeEventCreate(m_current_pool->m_handle, &event_desc, &event)); - return std::make_shared(this, event, queue_stamp, m_current_pool); -} - -void ze_event_pool_manager::destroy_event(ze_event *event) { - zeEventDestroy(event->get()); + return std::make_shared(queue_stamp, *this, event, m_current_pool); } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp new file mode 100644 index 00000000000000..9ed0244446a6ee --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp @@ -0,0 +1,24 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event_factory.hpp" +#include "ze_event_pool.hpp" + +namespace cldnn { +namespace ze { + +// Interface for creating l0 events using event pools +struct ze_event_factory : public ze_base_event_factory { +public: + ze_event_factory(const ze_engine &engine, bool enable_profiling, uint32_t capacity = 255); + event::ptr create_event(uint64_t queue_stamp) override; +protected: + std::shared_ptr m_current_pool; + const uint32_t m_capacity; + uint32_t m_num_used; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp deleted file mode 100644 index c852fc295b885c..00000000000000 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_manager.hpp +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -#include "ze_engine.hpp" - -namespace cldnn { -namespace ze { - -struct ze_event; - -// Interface for creating and destroying Level Zero events -struct ze_event_manager { -public: - using ptr = std::shared_ptr; - ze_event_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling) - : m_engine(engine), m_cmd_list(cmd_list), m_enable_profiling(enable_profiling) {} - const ze_engine& get_engine() const { return m_engine; } - bool is_profiling_enabled() const { return m_enable_profiling; } - ze_command_list_handle_t get_cmd_list() { return m_cmd_list; } - - virtual ~ze_event_manager() {} - virtual std::shared_ptr create_event(uint64_t queue_stamp) = 0; - virtual void destroy_event(ze_event *event) = 0; -protected: - const ze_engine& m_engine; - ze_command_list_handle_t m_cmd_list; - bool m_enable_profiling; -}; -} // namespace ze -} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp index cec1448cf8be8d..98d5f485b645cb 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp @@ -9,14 +9,13 @@ namespace cldnn { namespace ze { -struct ze_event; - -// Wrapper for ze events pool which is needed to track lifetime of the pool. -// I.e. the object is destoyed if no ze_events alive which refer to this pool -// and ze_events_pool doesn't refer to it as well +// RAII wrapper for Level Zero event pool struct ze_event_pool { ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_event_pool_flags_t flags); ~ze_event_pool(); + ze_event_pool(const ze_event_pool&) = delete; + ze_event_pool& operator=(const ze_event_pool&) = delete; + using ptr = std::shared_ptr; ze_event_pool_handle_t m_handle; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp deleted file mode 100644 index 1cac8585e2ea19..00000000000000 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool_manager.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ze_event_manager.hpp" -#include "ze_event_pool.hpp" - -namespace cldnn { -namespace ze { - -// Interface for creating and destroying l0 events using event pools -struct ze_event_pool_manager : public ze_event_manager { -public: - ze_event_pool_manager(const ze_engine &engine, ze_command_list_handle_t cmd_list, bool enable_profiling, uint32_t capacity = 255); - ~ze_event_pool_manager(); - std::shared_ptr create_event(uint64_t queue_stamp) override; - void destroy_event(ze_event *event) override; -protected: - std::shared_ptr m_current_pool; - const uint32_t m_capacity; - uint32_t m_num_used; -}; -} // namespace ze -} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp new file mode 100644 index 00000000000000..1ac2d04a69757d --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp @@ -0,0 +1,155 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_events.hpp" +#include "ze_common.hpp" + +#include +#include +#include + +using namespace cldnn; +using namespace ze; + +void ze_events::wait_impl() { + if (m_last_event) { + ZE_CHECK(zeEventHostSynchronize(m_last_event, default_timeout)); + } +} + +void ze_events::set_impl() { + // Call wait_impl to be in line with ocl_events + wait_impl(); +} + +bool ze_events::is_set_impl() { + if (!m_last_event) { + return true; + } + + auto ret = zeEventQueryStatus(m_last_event); + switch (ret) { + case ZE_RESULT_SUCCESS: + return true; + break; + case ZE_RESULT_NOT_READY: + return false; + break; + default: + OPENVINO_THROW("[GPU] Query event returned unexpected value: ", std::to_string(ret)); + break; + } +} + +bool ze_events::get_profiling_info_impl(std::list& info) { + // The goal is to sum up all disjoint durations of its projection on the time axis + std::vector all_global_timestamps; + std::vector all_context_timestamps; + + auto add_or_merge = [](std::vector& all_timestamps, const ze_kernel_timestamp_data_t& ts) { + auto it = all_timestamps.begin(); + bool merged = false; + auto target_timestamp = ts; + while (it != all_timestamps.end()) { + auto& timestamp = *it; + bool disjoint = timestamp.kernelEnd < target_timestamp.kernelStart || timestamp.kernelStart > target_timestamp.kernelEnd; + bool equal = timestamp.kernelEnd == target_timestamp.kernelEnd && timestamp.kernelStart == target_timestamp.kernelStart; + if (!disjoint) { + if (equal) { + if (!merged) { + merged = true; + break; + } else { + it = all_timestamps.erase(it); + } + } else { + if (!merged) { + timestamp.kernelStart = std::min(timestamp.kernelStart, target_timestamp.kernelStart); + timestamp.kernelEnd = std::max(timestamp.kernelEnd, target_timestamp.kernelEnd); + target_timestamp = timestamp; + merged = true; + it++; + } else { + if (timestamp.kernelEnd > target_timestamp.kernelEnd) { + it--; + it->kernelEnd = target_timestamp.kernelEnd; + it++; + } + it = all_timestamps.erase(it); + } + } + } else { + it++; + } + } + + if (!merged) { + all_timestamps.push_back(target_timestamp); + } + }; + + if (m_events.empty()) + return false; + + auto device_info = m_engine.get_device_info(); + + auto get_total_exec_time = [&device_info](std::vector& all_timestamps) { + std::chrono::nanoseconds total_time{0}; + for (const auto& ts : all_timestamps) { + total_time += timestamp_to_duration(device_info, ts); + } + + return total_time; + }; + + // Submission time is calculated as difference between merged context and wallclock intervals + // May probably be more accurate if we sum all sub-intervals of wallclock timestamps not covered by execution intervals + using intervals_t = std::vector; + auto get_submission_time = [&device_info](const intervals_t& s_timestamps, + const intervals_t& e_timestamps) { + auto get_minmax = [](const intervals_t& timestamps) { + uint64_t min_val = std::min(timestamps.begin(), timestamps.end(), + [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) { + return lhs->kernelStart < rhs->kernelStart; + })->kernelStart; + uint64_t max_val = std::max(timestamps.begin(), timestamps.end(), + [](const intervals_t::const_iterator& lhs, const intervals_t::const_iterator& rhs) { + return lhs->kernelEnd < rhs->kernelEnd; + })->kernelEnd; + + return ze_kernel_timestamp_data_t{min_val, max_val}; + }; + + auto submission_interval = get_minmax(s_timestamps); + auto exec_interval = get_minmax(e_timestamps); + + auto wallclock_time = timestamp_to_duration(device_info, submission_interval); + auto exec_time = timestamp_to_duration(device_info, exec_interval); + + return wallclock_time - exec_time; + }; + + for (size_t i = 0; i < m_events.size(); i++) { + auto be = downcast(m_events[i].get()); + auto opt_timestamp = be->query_timestamp(); + if (!opt_timestamp.has_value()) { + continue; + } + ze_kernel_timestamp_result_t timestamp = opt_timestamp.value(); + + add_or_merge(all_global_timestamps, timestamp.global); + add_or_merge(all_context_timestamps, timestamp.context); + } + + auto submit_time = get_submission_time(all_global_timestamps, all_context_timestamps); + auto exec_time = get_total_exec_time(all_context_timestamps); + + auto period_exec = std::make_shared(exec_time); + auto period_submit = std::make_shared(submit_time); + + info.push_back({ instrumentation::profiling_stage::executing, period_exec }); + info.push_back({ instrumentation::profiling_stage::submission, period_submit }); + + return true; +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp new file mode 100644 index 00000000000000..680753d6409084 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event.hpp" +#include "ze_engine.hpp" + +#include + +namespace cldnn { +namespace ze { + +struct ze_events : public ze_base_event { +public: + ze_events(std::vector const& ev, const ze_engine &engine) + : ze_base_event(0) + , m_engine(engine) { + process_events(ev); + } + + void reset() override { + event::reset(); + m_events.clear(); + } + + std::optional query_timestamp() override { return std::nullopt; } + ze_event_handle_t get_handle() const { return m_last_event; } + bool get_profiling_info_impl(std::list& info) override; + +protected: + void wait_impl() override; + void set_impl() override; + bool is_set_impl() override; + + void process_events(const std::vector& ev) { + for (size_t i = 0; i < ev.size(); i++) { + auto multiple_events = dynamic_cast(ev[i].get()); + if (multiple_events) { + for (size_t j = 0; j < multiple_events->m_events.size(); j++) { + if (auto base_ev = dynamic_cast(multiple_events->m_events[j].get())) { + auto current_ev_queue_stamp = base_ev->get_queue_stamp(); + if ((m_queue_stamp == 0) || (current_ev_queue_stamp > m_queue_stamp)) { + m_queue_stamp = current_ev_queue_stamp; + m_last_event = base_ev->get_handle(); + } + } + m_events.push_back(multiple_events->m_events[j]); + } + } else { + if (auto base_ev = dynamic_cast(ev[i].get())) { + auto current_ev_queue_stamp = base_ev->get_queue_stamp(); + if ((m_queue_stamp == 0) || (current_ev_queue_stamp > m_queue_stamp)) { + m_queue_stamp = current_ev_queue_stamp; + m_last_event = base_ev->get_handle(); + } + } + m_events.push_back(ev[i]); + } + } + } + + ze_event_handle_t m_last_event = nullptr; + std::vector m_events; + const ze_engine &m_engine; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp index 218c3390a5c566..55903ad3104b48 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -31,7 +31,7 @@ std::vector get_ze_events(const std::vector& even std::vector ze_events; ze_events.reserve(events.size()); for (const auto& ev : events) { - auto ze_event = downcast(ev.get())->get(); + auto ze_event = downcast(ev.get())->get_handle(); if (ze_event != nullptr) { ze_events.push_back(ze_event); } @@ -142,7 +142,7 @@ void gpu_usm::unlock(const stream& /* stream */) { event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vector& dep_events, bool blocking) { auto& _ze_stream = downcast(stream); auto ev = _ze_stream.create_base_event(); - auto ev_ze = downcast(ev.get())->get(); + auto ev_ze = downcast(ev.get())->get_handle(); std::vector temp_buffer(_bytes_count, pattern); auto ze_dep_events = get_ze_events(dep_events); ZE_CHECK(zeCommandListAppendMemoryFill(_ze_stream.get_queue(), @@ -170,7 +170,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_o return result_event; auto _ze_stream = downcast(&stream); - auto _ze_event = downcast(result_event.get())->get(); + auto _ze_event = downcast(result_event.get())->get_handle(); auto src_ptr = reinterpret_cast(data_ptr) + src_offset; auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; @@ -195,7 +195,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_ return result_event; auto _ze_stream = downcast(&stream); - auto _ze_event = downcast(result_event.get())->get(); + auto _ze_event = downcast(result_event.get())->get_handle(); OPENVINO_ASSERT(memory_capabilities::is_usm_type(src_mem.get_allocation_type())); auto usm_mem = downcast(&src_mem); @@ -222,7 +222,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s return result_event; auto _ze_stream = downcast(&stream); - auto _ze_event = downcast(result_event.get())->get(); + auto _ze_event = downcast(result_event.get())->get_handle(); auto src_ptr = reinterpret_cast(buffer_ptr()) + src_offset; auto dst_ptr = reinterpret_cast(data_ptr) + dst_offset; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index 374f5dbb342985..76968653ea7e81 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -9,9 +9,10 @@ #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/properties.hpp" -#include "ze_event_manager.hpp" -#include "ze_cb_event_manager.hpp" -#include "ze_event_pool_manager.hpp" +#include "ze_cb_event_factory.hpp" +#include "ze_event_factory.hpp" +#include "ze_events.hpp" +#include "ze_empty_event.hpp" #include "ze_event.hpp" #include "ze_kernel.hpp" @@ -204,16 +205,18 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); command_queue_desc.ordinal = info.copy_queue_group_ordinal; ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list)); - if (m_queue_type == QueueTypes::in_order) { - m_ev_manager = std::make_unique(engine, m_command_list, config.get_enable_profiling()); + if (false/*m_queue_type == QueueTypes::in_order && info.supports_cb_events*/) { + m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); } else { - m_ev_manager = std::make_unique(engine, m_command_list, config.get_enable_profiling()); + m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); } } ze_stream::~ze_stream() { +#ifdef ENABLE_ONEDNN_FOR_GPU // Destroy OneDNN stream before destroying command list _onednn_stream.reset(); +#endif if (m_command_list != nullptr) zeCommandListDestroy(m_command_list); if (m_copy_command_list != nullptr) @@ -243,8 +246,8 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel, if (m_sync_method == SyncMethods::events) { for (auto& dep : deps) { if (auto ze_base_ev = std::dynamic_pointer_cast(dep)) { - if (ze_base_ev->get() != nullptr) - dep_events.push_back(ze_base_ev->get()); + if (ze_base_ev->get_handle() != nullptr) + dep_events.push_back(ze_base_ev->get_handle()); } } dep_events_ptr = &dep_events; @@ -253,7 +256,7 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel, } bool set_output_event = m_sync_method == SyncMethods::events || is_output; - auto ev = set_output_event ? create_base_event() : std::make_shared(nullptr, nullptr, ++m_queue_counter); + auto ev = set_output_event ? create_base_event() : std::make_shared(++m_queue_counter); auto global = to_group_count(args_desc.workGroups.global); auto local = to_group_count(args_desc.workGroups.local); ze_group_count_t args = { global.groupCountX / local.groupCountX, global.groupCountY / local.groupCountY, global.groupCountZ / local.groupCountZ }; @@ -261,7 +264,7 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel, ZE_CHECK(zeCommandListAppendLaunchKernel(m_command_list, kern, &args, - set_output_event ? std::dynamic_pointer_cast(ev)->get() : nullptr, + set_output_event ? std::dynamic_pointer_cast(ev)->get_handle() : nullptr, dep_events_ptr == nullptr ? 0 : static_cast(dep_events_ptr->size()), dep_events_ptr == nullptr ? 0 : &dep_events_ptr->front())); @@ -275,7 +278,7 @@ void ze_stream::enqueue_barrier() { event::ptr ze_stream::enqueue_marker(std::vector const& deps, bool is_output) { if (deps.empty()) { auto ev = create_base_event(); - ZE_CHECK(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast(ev)->get(), 0, nullptr)); + ZE_CHECK(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast(ev)->get_handle(), 0, nullptr)); return ev; } @@ -283,8 +286,8 @@ event::ptr ze_stream::enqueue_marker(std::vector const& deps, boo std::vector dep_events; for (auto& dep : deps) { if (auto ze_base_ev = std::dynamic_pointer_cast(dep)) { - if (ze_base_ev->get() != nullptr) - dep_events.push_back(ze_base_ev->get()); + if (ze_base_ev->get_handle() != nullptr) + dep_events.push_back(ze_base_ev->get_handle()); } } if (dep_events.empty()) @@ -292,7 +295,7 @@ event::ptr ze_stream::enqueue_marker(std::vector const& deps, boo auto ev = create_base_event(); ZE_CHECK(zeCommandListAppendBarrier(m_command_list, - std::dynamic_pointer_cast(ev)->get(), + std::dynamic_pointer_cast(ev)->get_handle(), static_cast(dep_events.size()), &dep_events.front())); return ev; @@ -305,8 +308,8 @@ event::ptr ze_stream::enqueue_marker(std::vector const& deps, boo } } -ze_event::ptr ze_stream::group_events(std::vector const& deps) { - return std::make_shared(deps); +ze_event::ptr ze_stream::group_events(std::vector const& deps) { + return std::make_shared(deps, _engine); } void ze_stream::wait() { @@ -314,8 +317,7 @@ void ze_stream::wait() { } event::ptr ze_stream::create_user_event(bool set) { - // user event should use different api - auto ev = m_ev_manager->create_event(++m_queue_counter); + auto ev = m_ev_factory->create_event(++m_queue_counter); if (set) ev->set(); @@ -323,7 +325,7 @@ event::ptr ze_stream::create_user_event(bool set) { } event::ptr ze_stream::create_base_event() { - return m_ev_manager->create_event(++m_queue_counter); + return m_ev_factory->create_event(++m_queue_counter); } void ze_stream::flush() const { @@ -338,7 +340,7 @@ void ze_stream::wait_for_events(const std::vector& events) { bool needs_sync = false; for (auto& ev : events) { auto* ze_base_ev = dynamic_cast(ev.get()); - if (ze_base_ev->get() != nullptr) { + if (ze_base_ev->get_handle() != nullptr) { ze_base_ev->wait(); } else { needs_sync = true; @@ -366,7 +368,7 @@ void ze_stream::sync_events(std::vector const& deps, bool is_output) if (is_output) { m_last_barrier_ev = std::dynamic_pointer_cast(create_base_event()); m_last_barrier_ev->set_queue_stamp(m_queue_counter.load()); - ZE_CHECK(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get(), 0, nullptr)); + ZE_CHECK(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get_handle(), 0, nullptr)); } else { ZE_CHECK(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr)); } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp index 2888ab696cec0d..68e481c5bf3018 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp @@ -9,7 +9,7 @@ #include "ze_common.hpp" #include "ze_engine.hpp" #include "ze_event.hpp" -#include "ze_event_manager.hpp" +#include "ze_base_event_factory.hpp" namespace cldnn { namespace ze { @@ -28,7 +28,7 @@ class ze_stream : public stream { , m_queue_counter(other.m_queue_counter.load()) , m_last_barrier(other.m_last_barrier.load()) , m_last_barrier_ev(other.m_last_barrier_ev) - , m_ev_manager(other.m_ev_manager.release()) { + , m_ev_factory(other.m_ev_factory.release()) { other.m_command_list = nullptr; other.m_copy_command_list = nullptr; } @@ -65,7 +65,7 @@ class ze_stream : public stream { mutable std::atomic m_queue_counter{0}; std::atomic m_last_barrier{0}; std::shared_ptr m_last_barrier_ev = nullptr; - std::unique_ptr m_ev_manager; + std::unique_ptr m_ev_factory; #ifdef ENABLE_ONEDNN_FOR_GPU std::shared_ptr _onednn_stream = nullptr; From e8dce1572c7b5fedd5d4b9ce3f6dc5f08a82729d Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 9 Oct 2025 12:05:12 +0000 Subject: [PATCH 28/74] Enable cb events for in-order queue type --- src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index 76968653ea7e81..1194ef8565781b 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -205,7 +205,7 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); command_queue_desc.ordinal = info.copy_queue_group_ordinal; ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list)); - if (false/*m_queue_type == QueueTypes::in_order && info.supports_cb_events*/) { + if (m_queue_type == QueueTypes::in_order && info.supports_cb_events) { m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); } else { m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); From ae161777576be8bc3dc35212cc22724cbc380428 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 9 Oct 2025 15:41:15 +0000 Subject: [PATCH 29/74] Restore pooling impl and convolution ref impl --- src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp | 5 ++--- .../kernels/convolution/convolution_kernel_ref.cpp | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp b/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp index 81c4c27189cb83..8a308d505cf4b1 100644 --- a/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp +++ b/src/plugins/intel_gpu/src/graph/registry/pooling_impls.cpp @@ -16,8 +16,7 @@ using namespace cldnn; const std::vector>& Registry::get_implementations() { static const std::vector> impls = { - //FIXME: Disable for now as there is some issue when creating OneDNN descriptor - returns unimplemented - /*OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::PoolingImplementationManager, shape_types::static_shape, [](const program_node& node) { + OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::PoolingImplementationManager, shape_types::static_shape, [](const program_node& node) { const auto& in_layout = node.get_input_layout(0); const auto& out_layout = node.get_output_layout(0); // Disable this case due to sporadic hang for the following case: @@ -28,7 +27,7 @@ const std::vector>& Registry Date: Fri, 10 Oct 2025 10:05:22 +0000 Subject: [PATCH 30/74] Fix OneDNN include paths --- src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index 0f0a77d83b24e1..0faac9003405b6 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -170,7 +170,6 @@ if(ENABLE_ONEDNN_FOR_GPU) "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/ngen" "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/config" "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include" - "${ONEDNN_GPU_DIR}/third_party" "${ONEDNN_GPU_DIR}/third_party/ngen") set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU DNNL_GPU_RUNTIME={$DNNL_GPU_RUNTIME_VALUE} From 0e7c3638e653e67460727e69fab5f8ef08e9f0a3 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 10 Oct 2025 13:34:28 +0000 Subject: [PATCH 31/74] Move compute runtime headers to separate target --- src/plugins/intel_gpu/cmake/utils.cmake | 2 +- thirdparty/level_zero/CMakeLists.txt | 5 ++++- thirdparty/level_zero/{ => compute-runtime}/ze_intel_gpu.h | 0 thirdparty/level_zero/{ => compute-runtime}/ze_stypes.h | 0 thirdparty/level_zero/{ => compute-runtime}/zex_common.h | 0 thirdparty/level_zero/{ => compute-runtime}/zex_event.h | 0 6 files changed, 5 insertions(+), 2 deletions(-) rename thirdparty/level_zero/{ => compute-runtime}/ze_intel_gpu.h (100%) rename thirdparty/level_zero/{ => compute-runtime}/ze_stypes.h (100%) rename thirdparty/level_zero/{ => compute-runtime}/zex_common.h (100%) rename thirdparty/level_zero/{ => compute-runtime}/zex_event.h (100%) diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake index 5b61368a5d4742..e407fd9b3f04fb 100644 --- a/src/plugins/intel_gpu/cmake/utils.cmake +++ b/src/plugins/intel_gpu/cmake/utils.cmake @@ -5,7 +5,7 @@ function(ov_gpu_set_runtime_interface_for TARGET_NAME) if(GPU_RT_TYPE STREQUAL "L0") target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_ZE_RT=1) - target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero) + target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero ze_compute_runtime_headers) elseif(GPU_RT_TYPE STREQUAL "OCL") target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1) # Do not link OpenCL as It is already linked to the targets that require it diff --git a/thirdparty/level_zero/CMakeLists.txt b/thirdparty/level_zero/CMakeLists.txt index 87b76c339a5233..416e7f48005400 100644 --- a/thirdparty/level_zero/CMakeLists.txt +++ b/thirdparty/level_zero/CMakeLists.txt @@ -30,7 +30,10 @@ endif() set(CMAKE_COMPILE_WARNING_AS_ERROR OFF) add_subdirectory(level-zero EXCLUDE_FROM_ALL) -set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $) +set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $) + +add_library(ze_compute_runtime_headers INTERFACE) +target_include_directories(ze_compute_runtime_headers INTERFACE $) # This VERSION file created by L0 may cause compilation issue of oneTBB headers, so remove it file(REMOVE "${CMAKE_BINARY_DIR}/VERSION") diff --git a/thirdparty/level_zero/ze_intel_gpu.h b/thirdparty/level_zero/compute-runtime/ze_intel_gpu.h similarity index 100% rename from thirdparty/level_zero/ze_intel_gpu.h rename to thirdparty/level_zero/compute-runtime/ze_intel_gpu.h diff --git a/thirdparty/level_zero/ze_stypes.h b/thirdparty/level_zero/compute-runtime/ze_stypes.h similarity index 100% rename from thirdparty/level_zero/ze_stypes.h rename to thirdparty/level_zero/compute-runtime/ze_stypes.h diff --git a/thirdparty/level_zero/zex_common.h b/thirdparty/level_zero/compute-runtime/zex_common.h similarity index 100% rename from thirdparty/level_zero/zex_common.h rename to thirdparty/level_zero/compute-runtime/zex_common.h diff --git a/thirdparty/level_zero/zex_event.h b/thirdparty/level_zero/compute-runtime/zex_event.h similarity index 100% rename from thirdparty/level_zero/zex_event.h rename to thirdparty/level_zero/compute-runtime/zex_event.h From 7b5829fe93e64a3d89ae1c27bb74d237afc5ef1f Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Mon, 13 Oct 2025 15:27:14 +0000 Subject: [PATCH 32/74] Remove redundant DNNL macro --- src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index 0faac9003405b6..d876d52f97baed 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -160,11 +160,6 @@ if(ENABLE_ONEDNN_FOR_GPU) DEPENDEES install # Ensures this runs after install ) endif() - if(GPU_RUNTIME STREQUAL "L0") - set(DNNL_GPU_RUNTIME_VALUE DNNL_RUNTIME_L0) - elseif(GPU_RUNTIME STREQUAL "OCL") - set(DNNL_GPU_RUNTIME_VALUE DNNL_RUNTIME_OCL) - endif() set(LIB_INCLUDE_DIRS "${ONEDNN_INSTALL_DIR}/include" "${ONEDNN_GPU_DIR}/src" "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/ngen" @@ -172,7 +167,6 @@ if(ENABLE_ONEDNN_FOR_GPU) "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include" "${ONEDNN_GPU_DIR}/third_party/ngen") set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU - DNNL_GPU_RUNTIME={$DNNL_GPU_RUNTIME_VALUE} DNNL_DLL DNNL_DLL_EXPORTS DNNL_ENABLE_CPU_ISA_HINTS From debf47f265e0e44ede1b988fce55be80baeee02b Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 15 Oct 2025 11:31:26 +0000 Subject: [PATCH 33/74] Add copy offload --- .../include/intel_gpu/runtime/device_info.hpp | 1 - src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp | 5 ----- src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp | 8 ++++---- src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 12 ++++++++---- src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp | 4 ---- 5 files changed, 12 insertions(+), 18 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index 48ca01cb64f1cf..bf6388e9730898 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -147,7 +147,6 @@ struct device_info { uint64_t timer_resolution; ///< [L0] Resolution of device timer used for profiling in cycles/sec uint32_t kernel_timestamp_valid_bits; ///< [L0] Number of valid bits in the kernel timestamp values uint32_t compute_queue_group_ordinal; ///< [L0] Ordinal of the command queue group to use for compute - uint32_t copy_queue_group_ordinal; ///< [L0] Ordinal of the command queue group to use for copy uint32_t device_memory_ordinal; ///< [L0] Ordinal of the selected global device memory ov::device::UUID uuid; ///< UUID of the gpu device diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index 0300ba9ff93f5b..d889371b8ecfb1 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -103,12 +103,8 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic auto compute_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) { return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0; }); - auto copy_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) { - return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0 && (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0; - }); OPENVINO_ASSERT(compute_queue_props != queue_properties.end()); - OPENVINO_ASSERT(copy_queue_props != queue_properties.end()); uint32_t memory_properties_count = 0; ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr)); @@ -200,7 +196,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.kernel_timestamp_valid_bits = device_properties.kernelTimestampValidBits; info.timer_resolution = device_properties.timerResolution; info.compute_queue_group_ordinal = std::distance(queue_properties.begin(), compute_queue_props); - info.copy_queue_group_ordinal = std::distance(queue_properties.begin(), copy_queue_props); static_assert(ZE_MAX_DEVICE_UUID_SIZE == ov::device::UUID::MAX_UUID_SIZE, ""); static_assert(ZE_MAX_DEVICE_LUID_SIZE_EXT == ov::device::LUID::MAX_LUID_SIZE, ""); diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp index 55903ad3104b48..3411c5489361a8 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -111,7 +111,7 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::re } GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl; _host_buffer.allocateHost(_bytes_count); - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_copy_queue(), + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(), _host_buffer.get(), _buffer.get(), _bytes_count, @@ -174,7 +174,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_o auto src_ptr = reinterpret_cast(data_ptr) + src_offset; auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(), + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), dst_ptr, src_ptr, _bytes_count, @@ -202,7 +202,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_ auto src_ptr = reinterpret_cast(usm_mem->buffer_ptr()) + src_offset; auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(), + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), dst_ptr, src_ptr, _bytes_count, @@ -226,7 +226,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s auto src_ptr = reinterpret_cast(buffer_ptr()) + src_offset; auto dst_ptr = reinterpret_cast(data_ptr) + dst_offset; - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_copy_queue(), + ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), dst_ptr, src_ptr, _bytes_count, diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index 1194ef8565781b..aa3f90dbf169dd 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -202,9 +202,15 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL; + zex_intel_queue_copy_operations_offload_hint_exp_desc_t cp_offload_desc = {}; + cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES; + cp_offload_desc.copyOffloadEnabled = true; + cp_offload_desc.pNext = nullptr; + if (info.supports_cp_offload) { + command_queue_desc.pNext = &cp_offload_desc; + } + ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); - command_queue_desc.ordinal = info.copy_queue_group_ordinal; - ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_copy_command_list)); if (m_queue_type == QueueTypes::in_order && info.supports_cb_events) { m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); } else { @@ -219,8 +225,6 @@ ze_stream::~ze_stream() { #endif if (m_command_list != nullptr) zeCommandListDestroy(m_command_list); - if (m_copy_command_list != nullptr) - zeCommandListDestroy(m_copy_command_list); } void ze_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp index 68e481c5bf3018..03e9c12a578a9a 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp @@ -17,20 +17,17 @@ namespace ze { class ze_stream : public stream { public: ze_command_list_handle_t get_queue() const { return m_command_list; } - ze_command_list_handle_t get_copy_queue() const { return m_copy_command_list; } ze_stream(const ze_engine& engine, const ExecutionConfig& config); ze_stream(ze_stream&& other) : stream(other.m_queue_type, other.m_sync_method) , _engine(other._engine) , m_command_list(other.m_command_list) - , m_copy_command_list(other.m_copy_command_list) , m_queue_counter(other.m_queue_counter.load()) , m_last_barrier(other.m_last_barrier.load()) , m_last_barrier_ev(other.m_last_barrier_ev) , m_ev_factory(other.m_ev_factory.release()) { other.m_command_list = nullptr; - other.m_copy_command_list = nullptr; } ~ze_stream(); @@ -61,7 +58,6 @@ class ze_stream : public stream { const ze_engine& _engine; mutable ze_command_list_handle_t m_command_list = 0; - mutable ze_command_list_handle_t m_copy_command_list = 0; mutable std::atomic m_queue_counter{0}; std::atomic m_last_barrier{0}; std::shared_ptr m_last_barrier_ev = nullptr; From 33a173683e8565be5a01d4a67a74edcb60c2c26f Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 15 Oct 2025 11:49:42 +0000 Subject: [PATCH 34/74] warn if copy offload is not supported --- src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index aa3f90dbf169dd..08bfea5e15f343 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -208,6 +208,8 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) cp_offload_desc.pNext = nullptr; if (info.supports_cp_offload) { command_queue_desc.pNext = &cp_offload_desc; + } else { + GPU_DEBUG_INFO << "Copy offload hint is not supported" << std::endl; } ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); From 37cfa3975aaeb7f2c15ddcd1dd72b893de4b5908 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 15 Oct 2025 13:46:34 +0000 Subject: [PATCH 35/74] Copy level zero headers to build dir --- src/plugins/intel_gpu/cmake/utils.cmake | 2 +- thirdparty/level_zero/CMakeLists.txt | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake index e407fd9b3f04fb..5b61368a5d4742 100644 --- a/src/plugins/intel_gpu/cmake/utils.cmake +++ b/src/plugins/intel_gpu/cmake/utils.cmake @@ -5,7 +5,7 @@ function(ov_gpu_set_runtime_interface_for TARGET_NAME) if(GPU_RT_TYPE STREQUAL "L0") target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_ZE_RT=1) - target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero ze_compute_runtime_headers) + target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero) elseif(GPU_RT_TYPE STREQUAL "OCL") target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1) # Do not link OpenCL as It is already linked to the targets that require it diff --git a/thirdparty/level_zero/CMakeLists.txt b/thirdparty/level_zero/CMakeLists.txt index 416e7f48005400..0fbbd2ad303106 100644 --- a/thirdparty/level_zero/CMakeLists.txt +++ b/thirdparty/level_zero/CMakeLists.txt @@ -30,10 +30,24 @@ endif() set(CMAKE_COMPILE_WARNING_AS_ERROR OFF) add_subdirectory(level-zero EXCLUDE_FROM_ALL) -set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $) +set(ZE_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/include/") +file(GLOB_RECURSE COMPUTE_RUNTIME_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/compute-runtime/*.h") +file(GLOB_RECURSE LEVEL_ZERO_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include") +add_custom_command(OUTPUT "${ZE_INCLUDE_DIR}" + COMMAND "${CMAKE_COMMAND}" -E copy_directory "${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include" "${ZE_INCLUDE_DIR}/level_zero" + COMMAND "${CMAKE_COMMAND}" -E copy_directory "${CMAKE_CURRENT_SOURCE_DIR}/compute-runtime" "${ZE_INCLUDE_DIR}/level_zero" + DEPENDS "${COMPUTE_RUNTIME_HEADERS}" "${LEVEL_ZERO_HEADERS}" + COMMENT "Copying Level Zero and compute-runtime headers..." +) +add_custom_target(prepare_ze_headers ALL DEPENDS "${ZE_INCLUDE_DIR}") +add_dependencies(ze_loader prepare_ze_headers) + +# Allow include patterns with and without level-zero/ prefix +set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES + $ + $ +) -add_library(ze_compute_runtime_headers INTERFACE) -target_include_directories(ze_compute_runtime_headers INTERFACE $) # This VERSION file created by L0 may cause compilation issue of oneTBB headers, so remove it file(REMOVE "${CMAKE_BINARY_DIR}/VERSION") From 55cb68e936c11f2dca1a5bd9b8c4e7bfc2e40639 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 17 Oct 2025 15:34:50 +0000 Subject: [PATCH 36/74] Update l0 onednn submodule --- src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu index ce0e98bf72a6c7..633a03d736a265 160000 --- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu @@ -1 +1 @@ -Subproject commit ce0e98bf72a6c79642424e4d097b09a096b3b37f +Subproject commit 633a03d736a2656eab1de8d7e7032711bdc4a30b From 4893a284dfba619a81d6f554bf5b4484c3e5e0b3 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 17 Oct 2025 15:54:19 +0000 Subject: [PATCH 37/74] Change L0 macro check names --- .../intel_gpu/src/runtime/ze/ze_cb_event.cpp | 6 ++-- .../src/runtime/ze/ze_cb_event_factory.cpp | 4 +-- .../intel_gpu/src/runtime/ze/ze_common.hpp | 6 ++-- .../intel_gpu/src/runtime/ze/ze_device.cpp | 30 +++++++++---------- .../src/runtime/ze/ze_device_detector.cpp | 16 +++++----- .../intel_gpu/src/runtime/ze/ze_event.cpp | 10 +++---- .../src/runtime/ze/ze_event_factory.cpp | 2 +- .../src/runtime/ze/ze_event_pool.cpp | 2 +- .../intel_gpu/src/runtime/ze/ze_events.cpp | 2 +- .../intel_gpu/src/runtime/ze/ze_kernel.hpp | 6 ++-- .../intel_gpu/src/runtime/ze/ze_memory.cpp | 14 ++++----- .../intel_gpu/src/runtime/ze/ze_memory.hpp | 6 ++-- .../intel_gpu/src/runtime/ze/ze_stream.cpp | 18 +++++------ 13 files changed, 62 insertions(+), 60 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp index c7f71f00a394dc..3746c92fe00bbc 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp @@ -13,7 +13,7 @@ using namespace cldnn; using namespace ze; void ze_cb_event::wait_impl() { - ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout)); + OV_ZE_EXPECT(zeEventHostSynchronize(m_event, default_timeout)); } void ze_cb_event::set_impl() { @@ -44,7 +44,7 @@ std::optional ze_cb_event::query_timestamp() { return std::nullopt; } ze_kernel_timestamp_result_t timestamp{}; - ZE_CHECK(zeEventQueryKernelTimestamp(m_event, ×tamp)); + OV_ZE_EXPECT(zeEventQueryKernelTimestamp(m_event, ×tamp)); return timestamp; } @@ -68,5 +68,5 @@ bool ze_cb_event::get_profiling_info_impl(std::list(&func_zexCounterBasedEventCreate2))); } @@ -32,7 +32,7 @@ event::ptr ze_cb_event_factory::create_event(uint64_t queue_stamp) { if (is_profiling_enabled()) { desc.flags |= ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP; } - ZE_CHECK(func_zexCounterBasedEventCreate2(m_engine.get_context(), m_engine.get_device(), &desc, &event)); + OV_ZE_EXPECT(func_zexCounterBasedEventCreate2(m_engine.get_context(), m_engine.get_device(), &desc, &event)); auto cb_event = std::make_shared(queue_stamp, *this, event); return cb_event; } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp index 36d820139be816..3961bd5776a11e 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp @@ -7,7 +7,8 @@ #include -#define ZE_CHECK(f) \ +// Expect success of level zero command, throw runtime error otherwise +#define OV_ZE_EXPECT(f) \ do { \ ze_result_t res_ = (f); \ if (res_ != ZE_RESULT_SUCCESS) { \ @@ -15,7 +16,8 @@ } \ } while (false) -#define ZE_WARN(f) \ +// Prints warning if level zero command does not return success result +#define OV_ZE_WARN(f) \ do { \ ze_result_t res_ = (f); \ if (res_ != ZE_RESULT_SUCCESS) { \ diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index d889371b8ecfb1..d3557f9f30a914 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -65,13 +65,13 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic device_info info; uint32_t num_ext = 0; - ZE_CHECK(zeDriverGetExtensionProperties(driver, &num_ext, nullptr)); + OV_ZE_EXPECT(zeDriverGetExtensionProperties(driver, &num_ext, nullptr)); std::vector extensions(num_ext); - ZE_CHECK(zeDriverGetExtensionProperties(driver, &num_ext, &extensions[0])); + OV_ZE_EXPECT(zeDriverGetExtensionProperties(driver, &num_ext, &extensions[0])); ze_driver_properties_t driver_properties{ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES}; - ZE_CHECK(zeDriverGetProperties(driver, &driver_properties)); + OV_ZE_EXPECT(zeDriverGetProperties(driver, &driver_properties)); bool supports_luid = supports_extension(extensions, ZE_DEVICE_LUID_EXT_NAME, ZE_DEVICE_LUID_EXT_VERSION_1_0); bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0); @@ -85,20 +85,20 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, nullptr, 0}; ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, supports_ip_version ? &ip_version_properties : nullptr}; - ZE_CHECK(zeDeviceGetProperties(device, &device_properties)); + OV_ZE_EXPECT(zeDeviceGetProperties(device, &device_properties)); ze_device_compute_properties_t device_compute_properties{ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES}; - ZE_CHECK(zeDeviceGetComputeProperties(device, &device_compute_properties)); + OV_ZE_EXPECT(zeDeviceGetComputeProperties(device, &device_compute_properties)); uint32_t queue_properties_count = 0; - ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, nullptr)); + OV_ZE_EXPECT(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, nullptr)); std::vector queue_properties(queue_properties_count); for (auto& mp : queue_properties) { mp.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES; } - ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, &queue_properties[0])); + OV_ZE_EXPECT(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, &queue_properties[0])); auto compute_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) { return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0; @@ -107,16 +107,16 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic OPENVINO_ASSERT(compute_queue_props != queue_properties.end()); uint32_t memory_properties_count = 0; - ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr)); + OV_ZE_EXPECT(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr)); std::vector device_memory_properties(memory_properties_count); for (auto& mp : device_memory_properties) { mp.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES; } - ZE_CHECK(zeDeviceGetMemoryProperties(device, &memory_properties_count, &device_memory_properties[0])); + OV_ZE_EXPECT(zeDeviceGetMemoryProperties(device, &memory_properties_count, &device_memory_properties[0])); ze_device_memory_access_properties_t device_memory_access_properties{ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES}; - ZE_CHECK(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties)); + OV_ZE_EXPECT(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties)); auto mem_properties = std::find_if(device_memory_properties.begin(), device_memory_properties.end(), [](const ze_device_memory_properties_t& p) { auto name = std::string(p.name); @@ -128,10 +128,10 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic if (supports_dp_properties) { device_module_properties.pNext = &dp_properties; } - ZE_CHECK(zeDeviceGetModuleProperties(device, &device_module_properties)); + OV_ZE_EXPECT(zeDeviceGetModuleProperties(device, &device_module_properties)); ze_device_image_properties_t device_image_properties{ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES}; - ZE_CHECK(zeDeviceGetImageProperties(device, &device_image_properties)); + OV_ZE_EXPECT(zeDeviceGetImageProperties(device, &device_image_properties)); info.vendor_id = device_properties.vendorId; info.dev_name = device_properties.name; @@ -240,7 +240,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic // Create temporary context just for OneDNN HW detection ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 }; ze_context_handle_t context; - ZE_CHECK(zeContextCreate(driver, &context_desc, &context)); + OV_ZE_EXPECT(zeContextCreate(driver, &context_desc, &context)); ngen::Product product = ngen::LevelZeroCodeGenerator::detectHWInfo(context, device); zeContextDestroy(context); info.arch = convert_ngen_arch(ngen::getCore(product.family)); @@ -259,7 +259,7 @@ memory_capabilities init_memory_caps(ze_device_handle_t device, const device_inf std::vector memory_caps; ze_device_memory_access_properties_t device_memory_access_properties{ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES}; - ZE_CHECK(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties)); + OV_ZE_EXPECT(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties)); if (info.supports_usm) { if (device_memory_access_properties.hostAllocCapabilities) { @@ -294,7 +294,7 @@ void ze_device::initialize() { return; ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 }; - ZE_CHECK(zeContextCreate(_driver, &context_desc, &_context)); + OV_ZE_EXPECT(zeContextCreate(_driver, &context_desc, &_context)); _is_initialized = true; } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp index 4fede32a322a73..a40f4a7b690185 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp @@ -16,13 +16,13 @@ namespace ze { static std::vector get_sub_devices(ze_device_handle_t root_device) { uint32_t n_subdevices = 0; - ZE_CHECK(zeDeviceGetSubDevices(root_device, &n_subdevices, nullptr)); + OV_ZE_EXPECT(zeDeviceGetSubDevices(root_device, &n_subdevices, nullptr)); if (n_subdevices == 0) return {}; std::vector subdevices(n_subdevices); - ZE_CHECK(zeDeviceGetSubDevices(root_device, &n_subdevices, &subdevices[0])); + OV_ZE_EXPECT(zeDeviceGetSubDevices(root_device, &n_subdevices, &subdevices[0])); return subdevices; } @@ -72,25 +72,25 @@ std::map ze_device_detector::get_available_devices(voi std::vector ze_device_detector::create_device_list(bool initialize_devices) const { std::vector ret; - ZE_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY)); + OV_ZE_EXPECT(zeInit(ZE_INIT_FLAG_GPU_ONLY)); uint32_t driver_count = 0; - ZE_CHECK(zeDriverGet(&driver_count, nullptr)); + OV_ZE_EXPECT(zeDriverGet(&driver_count, nullptr)); std::vector all_drivers(driver_count); - ZE_CHECK(zeDriverGet(&driver_count, &all_drivers[0])); + OV_ZE_EXPECT(zeDriverGet(&driver_count, &all_drivers[0])); for (uint32_t i = 0; i < driver_count; ++i) { uint32_t device_count = 0; - ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, nullptr)); + OV_ZE_EXPECT(zeDeviceGet(all_drivers[i], &device_count, nullptr)); std::vector all_devices(device_count); - ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, &all_devices[0])); + OV_ZE_EXPECT(zeDeviceGet(all_drivers[i], &device_count, &all_devices[0])); for (uint32_t d = 0; d < device_count; ++d) { try { ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; - ZE_CHECK(zeDeviceGetProperties(all_devices[d], &device_properties)); + OV_ZE_EXPECT(zeDeviceGetProperties(all_devices[d], &device_properties)); if (ZE_DEVICE_TYPE_GPU == device_properties.type) { ret.emplace_back(std::make_shared(all_drivers[i], all_devices[d], initialize_devices)); diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp index 35069b0692904f..7d99b06aa619b6 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp @@ -14,15 +14,15 @@ using namespace ze; void ze_event::reset() { event::reset(); - ZE_CHECK(zeEventHostReset(m_event)); + OV_ZE_EXPECT(zeEventHostReset(m_event)); } void ze_event::wait_impl() { - ZE_CHECK(zeEventHostSynchronize(m_event, default_timeout)); + OV_ZE_EXPECT(zeEventHostSynchronize(m_event, default_timeout)); } void ze_event::set_impl() { - ZE_CHECK(zeEventHostSignal(m_event)); + OV_ZE_EXPECT(zeEventHostSignal(m_event)); } bool ze_event::is_set_impl() { @@ -45,7 +45,7 @@ std::optional ze_event::query_timestamp() { return std::nullopt; } ze_kernel_timestamp_result_t timestamp{}; - ZE_CHECK(zeEventQueryKernelTimestamp(m_event, ×tamp)); + OV_ZE_EXPECT(zeEventQueryKernelTimestamp(m_event, ×tamp)); return timestamp; } @@ -73,5 +73,5 @@ bool ze_event::get_profiling_info_impl(std::listm_handle, &event_desc, &event)); + OV_ZE_EXPECT(zeEventCreate(m_current_pool->m_handle, &event_desc, &event)); return std::make_shared(queue_stamp, *this, event, m_current_pool); } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp index c83179bf6d720d..74e040c6831dda 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp @@ -18,7 +18,7 @@ ze_event_pool::ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_even capacity }; auto device = engine.get_device(); - ZE_CHECK(zeEventPoolCreate(engine.get_context(), &event_pool_desc, 1, &device, &m_handle)); + OV_ZE_EXPECT(zeEventPoolCreate(engine.get_context(), &event_pool_desc, 1, &device, &m_handle)); } ze_event_pool::~ze_event_pool() { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp index 1ac2d04a69757d..e54348e7b12c25 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp @@ -14,7 +14,7 @@ using namespace ze; void ze_events::wait_impl() { if (m_last_event) { - ZE_CHECK(zeEventHostSynchronize(m_last_event, default_timeout)); + OV_ZE_EXPECT(zeEventHostSynchronize(m_last_event, default_timeout)); } } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp index 1d9118c4d12d7d..6e046ab3a61ccf 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp @@ -40,7 +40,7 @@ class ze_kernel : public kernel { descriptor.pNext = nullptr; descriptor.flags = 0; descriptor.pKernelName = _kernel_id.c_str(); - ZE_CHECK(zeKernelCreate(_module, &descriptor, &cloned_handle)); + OV_ZE_EXPECT(zeKernelCreate(_module, &descriptor, &cloned_handle)); return std::make_shared(cloned_handle, _module, _kernel_id); } } @@ -49,10 +49,10 @@ class ze_kernel : public kernel { std::vector get_binary() const override { size_t binary_size = 0; - ZE_CHECK(zeModuleGetNativeBinary(_module, &binary_size, nullptr)); + OV_ZE_EXPECT(zeModuleGetNativeBinary(_module, &binary_size, nullptr)); std::vector binary(binary_size); - ZE_CHECK(zeModuleGetNativeBinary(_module, &binary_size, &binary[0])); + OV_ZE_EXPECT(zeModuleGetNativeBinary(_module, &binary_size, &binary[0])); return binary; } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp index 3411c5489361a8..54482661e52705 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -44,7 +44,7 @@ std::vector get_ze_events(const std::vector& even allocation_type gpu_usm::detect_allocation_type(const ze_engine* engine, const void* mem_ptr) { ze_memory_allocation_properties_t props{ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES}; ze_device_handle_t device = nullptr; - ZE_CHECK(zeMemGetAllocProperties(engine->get_context(), mem_ptr, &props, &device)); + OV_ZE_EXPECT(zeMemGetAllocProperties(engine->get_context(), mem_ptr, &props, &device)); switch (props.type) { case ZE_MEMORY_TYPE_DEVICE: return allocation_type::usm_device; @@ -111,14 +111,14 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::re } GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl; _host_buffer.allocateHost(_bytes_count); - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(), + OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(), _host_buffer.get(), _buffer.get(), _bytes_count, nullptr, 0, nullptr)); - ZE_CHECK(zeCommandListHostSynchronize(_ze_stream.get_queue(), default_timeout)); + OV_ZE_EXPECT(zeCommandListHostSynchronize(_ze_stream.get_queue(), default_timeout)); _mapped_ptr = _host_buffer.get(); } else { _mapped_ptr = _buffer.get(); @@ -145,7 +145,7 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vecto auto ev_ze = downcast(ev.get())->get_handle(); std::vector temp_buffer(_bytes_count, pattern); auto ze_dep_events = get_ze_events(dep_events); - ZE_CHECK(zeCommandListAppendMemoryFill(_ze_stream.get_queue(), + OV_ZE_EXPECT(zeCommandListAppendMemoryFill(_ze_stream.get_queue(), _buffer.get(), temp_buffer.data(), 1, @@ -174,7 +174,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_o auto src_ptr = reinterpret_cast(data_ptr) + src_offset; auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), dst_ptr, src_ptr, _bytes_count, @@ -202,7 +202,7 @@ event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_ auto src_ptr = reinterpret_cast(usm_mem->buffer_ptr()) + src_offset; auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), dst_ptr, src_ptr, _bytes_count, @@ -226,7 +226,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s auto src_ptr = reinterpret_cast(buffer_ptr()) + src_offset; auto dst_ptr = reinterpret_cast(data_ptr) + dst_offset; - ZE_CHECK(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), dst_ptr, src_ptr, _bytes_count, diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp index b4368dc43b34e0..9089e8a711758b 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp @@ -68,7 +68,7 @@ class UsmMemory { host_desc.pNext = nullptr; void* memory = nullptr; - ZE_CHECK(zeMemAllocHost(_context, &host_desc, size, 1, &memory)); + OV_ZE_EXPECT(zeMemAllocHost(_context, &host_desc, size, 1, &memory)); _allocate(memory); } @@ -85,7 +85,7 @@ class UsmMemory { host_desc.pNext = nullptr; void* memory = nullptr; - ZE_CHECK(zeMemAllocShared(_context, &device_desc, &host_desc, size, 1, _device, &memory)); + OV_ZE_EXPECT(zeMemAllocShared(_context, &device_desc, &host_desc, size, 1, _device, &memory)); _allocate(memory); } @@ -97,7 +97,7 @@ class UsmMemory { device_desc.pNext = nullptr; void* memory = nullptr; - ZE_CHECK(zeMemAllocDevice(_context, &device_desc, size, 4096, _device, &memory)); + OV_ZE_EXPECT(zeMemAllocDevice(_context, &device_desc, size, 4096, _device, &memory)); _allocate(memory); } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index 08bfea5e15f343..a16ceb097a46e7 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -212,7 +212,7 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) GPU_DEBUG_INFO << "Copy offload hint is not supported" << std::endl; } - ZE_CHECK(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); + OV_ZE_EXPECT(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); if (m_queue_type == QueueTypes::in_order && info.supports_cb_events) { m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); } else { @@ -266,8 +266,8 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel, auto global = to_group_count(args_desc.workGroups.global); auto local = to_group_count(args_desc.workGroups.local); ze_group_count_t args = { global.groupCountX / local.groupCountX, global.groupCountY / local.groupCountY, global.groupCountZ / local.groupCountZ }; - ZE_CHECK(zeKernelSetGroupSize(kern, local.groupCountX, local.groupCountY, local.groupCountZ)); - ZE_CHECK(zeCommandListAppendLaunchKernel(m_command_list, + OV_ZE_EXPECT(zeKernelSetGroupSize(kern, local.groupCountX, local.groupCountY, local.groupCountZ)); + OV_ZE_EXPECT(zeCommandListAppendLaunchKernel(m_command_list, kern, &args, set_output_event ? std::dynamic_pointer_cast(ev)->get_handle() : nullptr, @@ -278,13 +278,13 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel, } void ze_stream::enqueue_barrier() { - ZE_CHECK(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr)); + OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr)); } event::ptr ze_stream::enqueue_marker(std::vector const& deps, bool is_output) { if (deps.empty()) { auto ev = create_base_event(); - ZE_CHECK(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast(ev)->get_handle(), 0, nullptr)); + OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast(ev)->get_handle(), 0, nullptr)); return ev; } @@ -300,7 +300,7 @@ event::ptr ze_stream::enqueue_marker(std::vector const& deps, boo return create_user_event(true); auto ev = create_base_event(); - ZE_CHECK(zeCommandListAppendBarrier(m_command_list, + OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast(ev)->get_handle(), static_cast(dep_events.size()), &dep_events.front())); @@ -339,7 +339,7 @@ void ze_stream::flush() const { } void ze_stream::finish() const { - ZE_CHECK(zeCommandListHostSynchronize(m_command_list, default_timeout)); + OV_ZE_EXPECT(zeCommandListHostSynchronize(m_command_list, default_timeout)); } void ze_stream::wait_for_events(const std::vector& events) { @@ -374,9 +374,9 @@ void ze_stream::sync_events(std::vector const& deps, bool is_output) if (is_output) { m_last_barrier_ev = std::dynamic_pointer_cast(create_base_event()); m_last_barrier_ev->set_queue_stamp(m_queue_counter.load()); - ZE_CHECK(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get_handle(), 0, nullptr)); + OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get_handle(), 0, nullptr)); } else { - ZE_CHECK(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr)); + OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr)); } m_last_barrier = ++m_queue_counter; } From 3007f7b6f250a793983df193bbd6ba65edc0190a Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 24 Oct 2025 13:44:00 +0000 Subject: [PATCH 38/74] Add default supported simd sizes for L0 --- .../intel_gpu/src/runtime/ze/ze_device.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index d3557f9f30a914..37613e8280d547 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -83,8 +83,12 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic bool supports_dp_properties = supports_extension(extensions, ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME, ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_1_0); - ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, nullptr, 0}; - ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, supports_ip_version ? &ip_version_properties : nullptr}; + void *device_properties_next = nullptr; + ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, device_properties_next, 0}; + if (supports_ip_version) { + device_properties_next = &ip_version_properties; + } + ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, device_properties_next}; OV_ZE_EXPECT(zeDeviceGetProperties(device, &device_properties)); ze_device_compute_properties_t device_compute_properties{ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES}; @@ -142,7 +146,9 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.gpu_frequency = device_properties.coreClockRate; - info.supported_simd_sizes = {}; + // Set SIMD values as reasonable default for most of the supported platforms + // Could not find how to retrieve all supported SIMD sizes from L0 + info.supported_simd_sizes = {8, 16, 32}; info.has_separate_cache = true; info.max_work_group_size = device_compute_properties.maxTotalGroupSize; @@ -180,7 +186,8 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities; - info.gfx_ver = {0, 0, 0}; // could find how to retrieve this from L0 so far + // Could not find how to retrieve gfx_ver from L0 + info.gfx_ver = {0, 0, 0}; info.ip_version = ip_version_properties.ipVersion; info.sub_device_idx = (std::numeric_limits::max)(); @@ -209,7 +216,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic } info.supports_mutable_command_list = false; - if (supports_mutable_list) { ze_mutable_command_list_exp_properties_t mutable_list_props = { ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_LIST_EXP_PROPERTIES, nullptr, 0, 0 }; ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, &mutable_list_props}; From ddbe71b6b58855aac15aaf7e24d7888b5a4c5ae1 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Tue, 28 Oct 2025 14:21:51 +0000 Subject: [PATCH 39/74] Add L0 supported simd sizes query --- src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index 37613e8280d547..5fbbe8e4737a9b 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -146,9 +146,8 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.gpu_frequency = device_properties.coreClockRate; - // Set SIMD values as reasonable default for most of the supported platforms - // Could not find how to retrieve all supported SIMD sizes from L0 - info.supported_simd_sizes = {8, 16, 32}; + info.supported_simd_sizes.resize(device_compute_properties.numSubGroupSizes); + std::copy_n(device_compute_properties.subGroupSizes, device_compute_properties.numSubGroupSizes, info.supported_simd_sizes.begin()); info.has_separate_cache = true; info.max_work_group_size = device_compute_properties.maxTotalGroupSize; From 7c01937934215fffb6bfa4f131c68caa5ec12f69 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 29 Oct 2025 15:36:36 +0000 Subject: [PATCH 40/74] Add L0 interface when building kernel selector --- src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt index 34b62d232c7e73..c5311f5be101bc 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt @@ -75,6 +75,7 @@ if(COMMAND add_cpplint_target) endif() target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::OpenCL openvino::runtime PRIVATE openvino::runtime::dev) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) target_include_directories(${TARGET_NAME} PRIVATE $) From 707a23868d02f0be5750a1491798732c9abdedb0 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 30 Oct 2025 15:31:38 +0000 Subject: [PATCH 41/74] Update L0 OneDNN submodule --- src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu index 633a03d736a265..bbba69f0a58439 160000 --- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu @@ -1 +1 @@ -Subproject commit 633a03d736a2656eab1de8d7e7032711bdc4a30b +Subproject commit bbba69f0a584391d0ab25b548ec76d8c62aa11f5 From 7b2d8ae8be840d07e1436c88471225e5e13e5df2 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Mon, 3 Nov 2025 16:06:08 +0000 Subject: [PATCH 42/74] L0 retrieve global cache size --- .../intel_gpu/src/runtime/ze/ze_device.cpp | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index 5fbbe8e4737a9b..ad37f85a4c440b 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -139,6 +139,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.vendor_id = device_properties.vendorId; info.dev_name = device_properties.name; + // L0 returns drivers version in different format than OCL info.driver_version = std::to_string(driver_properties.driverVersion); info.dev_type = (device_properties.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) ? device_type::integrated_gpu : device_type::discrete_gpu; @@ -152,6 +153,15 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.max_work_group_size = device_compute_properties.maxTotalGroupSize; info.max_local_mem_size = device_compute_properties.maxSharedLocalMemory; + uint32_t cache_properties_count = 0; + OV_ZE_EXPECT(zeDeviceGetCacheProperties(device, &cache_properties_count, nullptr)); + info.max_global_cache_size = 0; + if (cache_properties_count > 0) { + std::vector cache_properties(cache_properties_count); + OV_ZE_EXPECT(zeDeviceGetCacheProperties(device, &cache_properties_count, cache_properties.data())); + // Assume first property is L3 cache + info.max_global_cache_size = cache_properties[0].cacheSize; + } if (mem_properties != device_memory_properties.end()) { info.max_global_mem_size = mem_properties->totalSize; @@ -164,7 +174,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.max_alloc_mem_size = device_properties.maxMemAllocSize; info.supports_image = device_image_properties.maxSamplers > 0; - info.supports_intel_planar_yuv = false; info.max_image2d_width = device_image_properties.maxImageDims2D; info.max_image2d_height = device_image_properties.maxImageDims2D; @@ -172,11 +181,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.supports_fp64 = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_FP64) != 0; info.supports_fp16_denorms = info.supports_fp16 && (device_module_properties.fp16flags & ZE_DEVICE_FP_FLAG_DENORM) != 0; - info.supports_khr_subgroups = true; - info.supports_intel_subgroups = true; - info.supports_intel_subgroups_short = true; - info.supports_intel_subgroups_char = true; - info.supports_intel_required_subgroup_size = true; info.supports_cp_offload = supports_cp_offload; info.supports_cb_events = supports_cb_events; @@ -185,8 +189,17 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities; - // Could not find how to retrieve gfx_ver from L0 + // FIXME: Could not find how to retrieve those from L0 info.gfx_ver = {0, 0, 0}; + info.supports_work_group_collective_functions = false; + info.supports_intel_planar_yuv = false; + info.supports_khr_subgroups = true; + info.supports_intel_subgroups = true; + info.supports_intel_subgroups_short = true; + info.supports_intel_subgroups_char = true; + info.supports_intel_required_subgroup_size = true; + info.supports_queue_families = true; + info.ip_version = ip_version_properties.ipVersion; info.sub_device_idx = (std::numeric_limits::max)(); @@ -197,7 +210,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.num_threads_per_eu = device_properties.numThreadsPerEU; info.num_ccs = compute_queue_props->numQueues; - info.supports_queue_families = true; info.kernel_timestamp_valid_bits = device_properties.kernelTimestampValidBits; info.timer_resolution = device_properties.timerResolution; From cba0d29068c7ed29c7421f40a70d73736c08a2d8 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Tue, 4 Nov 2025 16:33:43 +0000 Subject: [PATCH 43/74] Add L0 build kernel API wip --- .../include/intel_gpu/runtime/engine.hpp | 7 +- .../include/intel_gpu/runtime/kernel.hpp | 6 ++ .../intel_gpu/src/runtime/ocl/ocl_engine.cpp | 4 ++ .../intel_gpu/src/runtime/ocl/ocl_engine.hpp | 2 +- .../intel_gpu/src/runtime/ze/ze_common.hpp | 5 ++ .../intel_gpu/src/runtime/ze/ze_engine.cpp | 67 +++++++++++++++---- .../intel_gpu/src/runtime/ze/ze_engine.hpp | 2 +- .../intel_gpu/src/runtime/ze/ze_kernel.hpp | 39 +++++------ .../src/runtime/ze/ze_kernel_holder.hpp | 33 +++++++++ .../src/runtime/ze/ze_module_holder.hpp | 29 ++++++++ 10 files changed, 155 insertions(+), 39 deletions(-) create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp index 67648d15fe9994..657526d6174348 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp @@ -145,6 +145,9 @@ class engine { virtual allocation_type detect_usm_allocation_type(const void* memory) const = 0; + // Build kernels for current engine and append them to output vector. + virtual void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector out) const = 0; + void set_enable_large_allocations(bool enable_large_allocations); bool get_enable_large_allocations() const; @@ -157,10 +160,6 @@ class engine { virtual dnnl::engine& get_onednn_engine() const = 0; #endif - /// This method is intended to create kernel handle for current engine from handle from arbitrary engine - /// For instance, source kernel can be compiled using ocl engine, and then we can build L0 kernel object based on that - virtual kernel::ptr prepare_kernel(const kernel::ptr kernel) const = 0; - /// Factory method which creates engine object with impl configured by @p engine_type /// @param engine_type requested engine type /// @param runtime_type requested execution runtime for the engine. @note some runtime/engine types configurations might be unsupported diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp index 6c55df8507c812..b6a1a2dc817209 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp @@ -11,6 +11,12 @@ namespace cldnn { using kernel_id = std::string; +enum class KernelFormat { + SOURCE, + IL, + NATIVE_BIN, +}; + class kernel { public: using ptr = std::shared_ptr; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp index 5e7b1c63dddc86..9917ddfe6712a1 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp @@ -307,6 +307,10 @@ void* ocl_engine::get_user_context() const { return static_cast(cl_device.get_context().get()); } +std::vector ocl_engine::build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options) const { + cl::Program() +} + kernel::ptr ocl_engine::prepare_kernel(const kernel::ptr kernel) const { OPENVINO_ASSERT(downcast(kernel.get()) != nullptr); return kernel; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp index 0fa77922f03442..ca505d1ce16812 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp @@ -46,7 +46,7 @@ class ocl_engine : public engine { stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override; stream& get_service_stream() const override; - kernel::ptr prepare_kernel(const kernel::ptr kernel) const override; + void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector out) const override; #ifdef ENABLE_ONEDNN_FOR_GPU void create_onednn_engine(const ExecutionConfig& config) override; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp index 3961bd5776a11e..4ff1100a7dd9e9 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp @@ -3,9 +3,13 @@ // #pragma once +#include "intel_gpu/runtime/debug_configuration.hpp" + #include #include +#include + // Expect success of level zero command, throw runtime error otherwise #define OV_ZE_EXPECT(f) \ @@ -29,6 +33,7 @@ namespace cldnn { namespace ze { static constexpr uint64_t default_timeout = std::numeric_limits::max(); +static constexpr ze_module_format_t ze_module_format_oclc = (ze_module_format_t) 3U; } // namespace ze } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index f9e1a4874324ab..48fdade2cdbab9 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -12,6 +12,9 @@ #include "ze_memory.hpp" #include "ze_stream.hpp" #include "ze_device.hpp" +#include "ze_kernel.hpp" +#include "ze_module_holder.hpp" +#include "ze_kernel_holder.hpp" #include #include #include @@ -38,7 +41,7 @@ ze_module_handle_t ze_create_module_with_level_zero(const cldnn::ze::ze_engine& auto ze_device = engine.get_device(); auto ze_ctx = engine.get_context(); - zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr); + OV_ZE_EXPECT(zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr)); return ze_module; } @@ -219,17 +222,57 @@ bool ze_engine::is_the_same_buffer(const memory& mem1, const memory& mem2) { return (reinterpret_cast(mem1).get_buffer().get() == reinterpret_cast(mem2).get_buffer().get()); } -kernel::ptr ze_engine::prepare_kernel(const kernel::ptr kernel) const { - if (std::dynamic_pointer_cast(kernel)) { - return kernel; - } else { - auto binary = kernel->get_binary(); - ze_module_handle_t ze_module = ze_create_module_with_level_zero(*this, binary); - ze_kernel_handle_t ze_kernel; - auto entry_point = kernel->get_id(); - ze_kernel_desc_t desc = {ZE_STRUCTURE_TYPE_KERNEL_DESC , nullptr, 0, entry_point.c_str()}; - zeKernelCreate(ze_module, &desc, &ze_kernel); - return std::make_shared(ze_kernel, ze_module, entry_point); +void ze_engine::build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector out) const { + ze_module_desc_t module_desc = { + ZE_STRUCTURE_TYPE_MODULE_DESC, + nullptr, + ZE_MODULE_FORMAT_NATIVE, + src_bytes, + reinterpret_cast(src), + options.c_str(), + nullptr // specialization constants + }; + switch (src_format) + { + case KernelFormat::SOURCE: + module_desc.format = ze_module_format_oclc; + break; + case KernelFormat::IL: + module_desc.format = ZE_MODULE_FORMAT_IL_SPIRV; + break; + case KernelFormat::NATIVE_BIN: + module_desc.format = ZE_MODULE_FORMAT_NATIVE; + break; + default: + OPENVINO_THROW("[GPU] Trying to build kernel from unexpected format"); + break; + } + ze_module_handle_t module_handle; + ze_module_build_log_handle_t log_handle; + ze_result_t build_result = zeModuleCreate(get_context(), get_device(), &module_desc, &module_handle, &log_handle); + if (build_result != ZE_RESULT_SUCCESS) { + size_t log_size = 0; + OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, nullptr)); + std::string log(log_size, ' '); + OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, log.data())); + OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle)); + OPENVINO_THROW(log); + } + auto module_holder = std::make_shared(module_handle); + OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle)); + uint32_t kernel_count = 0; + OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, nullptr)); + std::vector kernel_names(kernel_count); + OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, kernel_names.data())); + ze_kernel_flags_t flags = 0; + ze_kernel_desc_t kernel_desc = { + ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, flags, nullptr}; + for (auto name : kernel_names) { + kernel_desc.pKernelName = name; + ze_kernel_handle_t kernel_handle; + OV_ZE_EXPECT(zeKernelCreate(module_handle, &kernel_desc, &kernel_handle)); + auto kernel_holder = std::make_shared(kernel_handle, module_holder); + out.push_back(std::make_shared(kernel_holder, std::string(name))); } } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp index b75d2ae0ca67eb..fb86467fb9cc07 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp @@ -41,7 +41,7 @@ class ze_engine : public engine { stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override; stream& get_service_stream() const override; - kernel::ptr prepare_kernel(const kernel::ptr kernel) const override; + void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector out) const override; #ifdef ENABLE_ONEDNN_FOR_GPU void create_onednn_engine(const ExecutionConfig& config) override; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp index 6e046ab3a61ccf..61bc03b46f85e2 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp @@ -7,6 +7,7 @@ #include "intel_gpu/runtime/kernel.hpp" #include "openvino/core/except.hpp" #include "ze_common.hpp" +#include "ze_kernel_holder.hpp" #include @@ -14,48 +15,44 @@ namespace cldnn { namespace ze { class ze_kernel : public kernel { - ze_kernel_handle_t _compiled_kernel; - ze_module_handle_t _module; - std::string _kernel_id; - public: - ze_kernel(ze_kernel_handle_t compiled_kernel, ze_module_handle_t module, const std::string& kernel_id) - : _compiled_kernel(compiled_kernel) - , _module(module) - , _kernel_id(kernel_id) { } + ze_kernel(std::shared_ptr kernel, const std::string& kernel_id) + : m_kernel(kernel) + , m_kernel_id(kernel_id) { } - ~ze_kernel() { - zeKernelDestroy(_compiled_kernel); - } + ze_kernel_handle_t get_kernel() { return m_kernel->get_kernel(); } + ze_module_handle_t get_module() { return m_kernel->get_module(); } + std::string get_id() const override { return m_kernel_id; } - const ze_kernel_handle_t& get_handle() const { return _compiled_kernel; } - ze_kernel_handle_t& get_handle() { return _compiled_kernel; } std::shared_ptr clone(bool reuse_kernel_handle = false) const override { if (reuse_kernel_handle) { - return std::make_shared(_compiled_kernel, _module, _kernel_id); + return std::make_shared(m_kernel, m_kernel_id); } else { ze_kernel_handle_t cloned_handle; + ze_module_handle_t module_handle = m_kernel->get_module(); ze_kernel_desc_t descriptor; descriptor.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC; descriptor.pNext = nullptr; descriptor.flags = 0; - descriptor.pKernelName = _kernel_id.c_str(); - OV_ZE_EXPECT(zeKernelCreate(_module, &descriptor, &cloned_handle)); - return std::make_shared(cloned_handle, _module, _kernel_id); + descriptor.pKernelName = m_kernel_id.c_str(); + OV_ZE_EXPECT(zeKernelCreate(module_handle, &descriptor, &cloned_handle)); + return std::make_shared(cloned_handle, module_handle, m_kernel_id); } } - std::string get_id() const override { return _kernel_id; } - std::vector get_binary() const override { size_t binary_size = 0; - OV_ZE_EXPECT(zeModuleGetNativeBinary(_module, &binary_size, nullptr)); + ze_module_handle_t module_handle = m_kernel->get_module(); + OV_ZE_EXPECT(zeModuleGetNativeBinary(module_handle, &binary_size, nullptr)); std::vector binary(binary_size); - OV_ZE_EXPECT(zeModuleGetNativeBinary(_module, &binary_size, &binary[0])); + OV_ZE_EXPECT(zeModuleGetNativeBinary(module_handle, &binary_size, binary.data())); return binary; } +private: + std::shared_ptr m_kernel; + std::string m_kernel_id; }; } // namespace ze diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp new file mode 100644 index 00000000000000..72c98c4b9283e7 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_common.hpp" +#include "ze_module_holder.hpp" + +#include + +namespace cldnn { +namespace ze { + +// RAII wrapper for Level Zero kernel +class ze_kernel_holder { +public: + // Take ownership of existing kernel handle + explicit ze_kernel_holder(ze_kernel_handle_t kernel, std::shared_ptr module) : m_kernel(kernel), m_module(module) {} + ze_kernel_holder(const ze_kernel_holder& other) = delete; + ze_kernel_holder& operator=(const ze_kernel_holder& other) = delete; + ~ze_kernel_holder() { + OV_ZE_WARN(zeKernelDestroy(m_kernel)); + } + ze_kernel_handle_t get_kernel() { return m_kernel; } + ze_module_handle_t get_module() { return m_module->get_module(); } +private: + ze_kernel_handle_t m_kernel; + std::shared_ptr m_module; +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp new file mode 100644 index 00000000000000..9288584fc5c681 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_common.hpp" + +namespace cldnn { +namespace ze { + +// RAII wrapper for Level Zero module +class ze_module_holder { +public: + // Take ownership of existing module handle + explicit ze_module_holder(ze_module_handle_t handle) : m_handle(handle) {} + + ze_module_holder(const ze_module_holder& other) = delete; + ze_module_holder& operator=(const ze_module_holder& other) = delete; + ~ze_module_holder() { + OV_ZE_WARN(zeModuleDestroy(m_handle)); + } + ze_module_handle_t get_module() { return m_handle; } +private: + ze_module_handle_t m_handle; + +}; +} // namespace ze +} // namespace cldnn From 2f855dafa4a69027b23446c70ec5e8f19e784541 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 6 Nov 2025 16:29:30 +0000 Subject: [PATCH 44/74] Add L0 kernel build logic --- .../util/include/openvino/util/file_util.hpp | 2 +- src/common/util/src/file_util.cpp | 2 +- .../include/intel_gpu/runtime/engine.hpp | 6 +- .../include/intel_gpu/runtime/file_util.hpp | 2 +- .../include/intel_gpu/runtime/kernel.hpp | 8 +- .../intel_gpu/runtime/kernel_builder.hpp | 25 ++ .../src/graph/impls/ocl/kernels_cache.cpp | 292 +++++------------- .../src/graph/impls/ocl/kernels_cache.hpp | 4 +- .../graph/impls/ocl/multi_stage_primitive.hpp | 3 +- .../src/graph/impls/ocl/primitive_base.hpp | 3 +- .../intel_gpu/src/runtime/file_util.cpp | 2 +- .../intel_gpu/src/runtime/ocl/ocl_engine.cpp | 14 +- .../intel_gpu/src/runtime/ocl/ocl_engine.hpp | 2 +- .../intel_gpu/src/runtime/ocl/ocl_kernel.cpp | 11 + .../intel_gpu/src/runtime/ocl/ocl_kernel.hpp | 1 + .../src/runtime/ocl/ocl_kernel_builder.hpp | 78 +++++ .../intel_gpu/src/runtime/ze/ze_engine.cpp | 79 +---- .../intel_gpu/src/runtime/ze/ze_engine.hpp | 2 +- .../intel_gpu/src/runtime/ze/ze_kernel.hpp | 43 ++- .../src/runtime/ze/ze_kernel_builder.hpp | 67 ++++ .../src/runtime/ze/ze_kernel_holder.hpp | 7 +- .../src/runtime/ze/ze_module_holder.hpp | 17 +- .../intel_gpu/src/runtime/ze/ze_stream.cpp | 4 +- 23 files changed, 331 insertions(+), 343 deletions(-) create mode 100644 src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp create mode 100644 src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp diff --git a/src/common/util/include/openvino/util/file_util.hpp b/src/common/util/include/openvino/util/file_util.hpp index 6eee9c3da0c102..88407538a36cc8 100644 --- a/src/common/util/include/openvino/util/file_util.hpp +++ b/src/common/util/include/openvino/util/file_util.hpp @@ -316,7 +316,7 @@ std::vector load_binary(const std::string& path); * @brief save binary data to file * @param path - binary file path to store */ -void save_binary(const std::string& path, std::vector binary); +void save_binary(const std::string& path, const std::vector &binary); void save_binary(const std::string& path, const char* binary, size_t bin_size); /** diff --git a/src/common/util/src/file_util.cpp b/src/common/util/src/file_util.cpp index 5436f26f03014a..db39f3a61749d2 100644 --- a/src/common/util/src/file_util.cpp +++ b/src/common/util/src/file_util.cpp @@ -515,7 +515,7 @@ std::vector ov::util::load_binary(const std::string& path) { return {}; } -void ov::util::save_binary(const std::string& path, std::vector binary) { +void ov::util::save_binary(const std::string& path, const std::vector &binary) { save_binary(path, reinterpret_cast(&binary[0]), binary.size()); return; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp index 657526d6174348..1d99a1ae430f80 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp @@ -12,6 +12,7 @@ #include "layout.hpp" #include "execution_config.hpp" #include "engine_configuration.hpp" +#include "kernel_builder.hpp" #include #include @@ -143,10 +144,9 @@ class engine { /// Returns service stream which can be used during program build and optimizations virtual stream& get_service_stream() const = 0; - virtual allocation_type detect_usm_allocation_type(const void* memory) const = 0; + virtual std::shared_ptr create_kernel_builder() const = 0; - // Build kernels for current engine and append them to output vector. - virtual void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector out) const = 0; + virtual allocation_type detect_usm_allocation_type(const void* memory) const = 0; void set_enable_large_allocations(bool enable_large_allocations); diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/file_util.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/file_util.hpp index 934d9deecd2bd0..91fc07c2ab2555 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/file_util.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/file_util.hpp @@ -12,6 +12,6 @@ namespace ov::intel_gpu { // Version of save_binary that don't trow an exception if attempt to open file fails -void save_binary(const std::string& path, std::vector binary); +void save_binary(const std::string& path, const std::vector &binary); } // namespace ov::intel_gpu diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp index b6a1a2dc817209..54bece374b99f2 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp @@ -6,17 +6,12 @@ #include #include +#include namespace cldnn { using kernel_id = std::string; -enum class KernelFormat { - SOURCE, - IL, - NATIVE_BIN, -}; - class kernel { public: using ptr = std::shared_ptr; @@ -25,6 +20,7 @@ class kernel { virtual std::string get_id() const = 0; virtual std::vector get_binary() const = 0; + virtual std::string get_build_log() const = 0; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp new file mode 100644 index 00000000000000..90c17a77be03cf --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel.hpp" + +#include +#include +#include + +namespace cldnn { + +enum class KernelFormat { + SOURCE, + NATIVE_BIN, +}; + +class kernel_builder { +public: + virtual void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector &out) const = 0; +}; + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp index fc282231b22e28..94e4df64ea8090 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp @@ -20,10 +20,6 @@ #include "intel_gpu/runtime/itt.hpp" #include "intel_gpu/runtime/file_util.hpp" -#include "ocl/ocl_kernel.hpp" -#include "ocl/ocl_common.hpp" -#include "ocl/ocl_device.hpp" - #ifdef WIN32 #include #ifdef NTDDI_WIN10_RS5 @@ -54,41 +50,18 @@ namespace { std::mutex cacheAccessMutex; -static const cldnn::device::ptr get_target_device(const cldnn::engine& engine) { - using namespace cldnn; - if (engine.runtime_type() == runtime_types::ocl) { - return engine.get_device(); - } else { - ocl::ocl_device_detector detector; - auto device_map = detector.get_available_devices(nullptr, nullptr); - auto original_device = engine.get_device(); - - for (auto& d : device_map) { - const auto& target_uuid = d.second->get_info().uuid; - const auto& original_uuid = original_device->get_info().uuid; - if (target_uuid.uuid == original_uuid.uuid) - return d.second; - } +std::string join_strings(const std::vector strings) { + size_t total_size = 0; + for (auto &str : strings) { + total_size += str.size(); } - - OPENVINO_THROW("[GPU] Couldn't find target device for kernels cache"); -} - -#ifdef ENABLE_ONEDNN_FOR_GPU -cl::Program fuse_microkernels(const cl::Context& context, const cl::Device& device, cl::Program& program, const std::string& code) { - using namespace dnnl::impl::gpu::intel; - std::vector> binaries = program.getInfo(); - OPENVINO_ASSERT(binaries.size() == 1); - std::vector binary = binaries[0]; - micro::fuseMicrokernels(binary, code.c_str()); - - cl::Program::Binaries fused_binary = { binary }; - cl::Program fused_program(context, {device}, fused_binary); - fused_program.build({device}); - - return fused_program; + std::string acc_str; + acc_str.reserve(total_size); + for (auto &str : strings) { + acc_str.append(str); + } + return acc_str; } -#endif // ENABLE_ONEDNN_FOR_GPU std::string reorder_options(const std::string& org_options) { std::stringstream ss(org_options); @@ -307,42 +280,21 @@ kernels_cache::kernels_cache(engine& engine, uint32_t prog_id, std::shared_ptr task_executor, const std::map& batch_headers) - : _device(get_target_device(engine)) + : _device(engine.get_device()) + , _builder(engine.create_kernel_builder()) , _task_executor(task_executor) , _config(config) , _prog_id(prog_id) , batch_headers(std::move(batch_headers)) { } -static std::vector getProgramBinaries(cl::Program program) { - // Get the size of the program binary in bytes. - std::vector binary_sizes = program.getInfo(); - - if (binary_sizes.size() != 1) - throw std::runtime_error("Invalid binaries count"); - - size_t binary_size = binary_sizes.front(); - // Binary is not available for the device. - if (binary_size == 0) - throw std::runtime_error("Binary is not avaliable after program build"); - - // Get program binary. - return program.getInfo().front(); -} - -// TODO: This build_batch method should be backend specific void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& compiled_kernels) { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::build_batch"); - - auto& cl_build_device = dynamic_cast(*_device); - bool dump_sources = batch.dump_custom_program; std::string dump_sources_dir = GPU_DEBUG_VALUE_OR(_config.get_dump_sources_path(), ""); GPU_DEBUG_IF(!dump_sources_dir.empty()) { dump_sources = true; } - std::string err_log; // accumulated build log from all program's parts (only contains messages from parts which - std::string current_dump_file_name = ""; if (dump_sources) { current_dump_file_name = std::move(dump_sources_dir); @@ -361,128 +313,71 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co dump_file << s; } } - std::string cached_bin_name = get_cache_path() + std::to_string(batch.hash_value) + ".cl_cache"; - cl::Program::Binaries precompiled_kernels = {}; - + /////////////////////////////////////////////////////////////////////////////////// + std::vector precompiled; if (is_cache_enabled()) { - // Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket - // If read is successful, then remove kernels from compilation bucket - std::vector bin; - { - std::lock_guard lock(cacheAccessMutex); - bin = ov::util::load_binary(cached_bin_name); - } - if (!bin.empty()) { - precompiled_kernels.push_back(bin); - } + std::lock_guard lock(cacheAccessMutex); + precompiled = ov::util::load_binary(cached_bin_name); } - try { - cl::vector kernels; - - // Run compilation - if (precompiled_kernels.empty()) { - cl::Program program(cl_build_device.get_context(), batch.source); - { - OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildProgram::RunCompilation"); - if (program.build({cl_build_device.get_device()}, batch.options.c_str()) != CL_SUCCESS) - throw std::runtime_error("Failed in building program."); - } - - if (dump_sources && dump_file.good()) { - dump_file << "\n/* Build Log:\n"; - for (auto& p : program.getBuildInfo()) - dump_file << p.second << "\n"; - - dump_file << "*/\n"; + std::vector kernels; + if (!precompiled.empty()) { + _builder->build_kernels(precompiled.data(), precompiled.size(), KernelFormat::NATIVE_BIN, "", kernels); + } else { + auto combined_source = join_strings(batch.source); + _builder->build_kernels(combined_source.data(), combined_source.size(), KernelFormat::SOURCE, batch.options, kernels); + if (dump_sources && dump_file.good()) { + dump_file << "\n/* Build Log:\n"; + // Retreive build log from the first kernel only + // It should be the same for all kernels in batch + if (kernels.size() > 1) { + dump_file << kernels[0]->get_build_log(); } - - if (batch.has_microkernels) { + dump_file << "\n*/\n"; + } + if (batch.has_microkernels) { #ifdef ENABLE_ONEDNN_FOR_GPU - OPENVINO_ASSERT(batch.kernels_counter == 1); - // Do we need full source code here (with batch headers)? - program = fuse_microkernels(cl_build_device.get_context(), cl_build_device.get_device(), program, batch.source.back()); + using namespace dnnl::impl::gpu::intel; + OPENVINO_ASSERT(batch.kernels_counter == 1 && kernels.size() == 1); + std::vector binary = kernels[0]->get_binary(); + kernels.clear(); + // Update binary and rebuild kernel + micro::fuseMicrokernels(binary, combined_source.c_str()); + _builder->build_kernels(binary.data(), binary.size(), KernelFormat::NATIVE_BIN, "", kernels); #else // ENABLE_ONEDNN_FOR_GPU - OPENVINO_THROW("[GPU] Can't compile kernel w/ microkernels as onednn is not available"); + OPENVINO_THROW("[GPU] Can't compile kernel w/ microkernels as onednn is not available"); #endif // ENABLE_ONEDNN_FOR_GPU - } - - - program.createKernels(&kernels); - - if (is_cache_enabled()) { + } + if (is_cache_enabled()) { // If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache // Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited // Bucket size can be changed by max_kernels_per_batch config option, but forcing it to 1 will lead to much longer // compile time. + std::vector binary = kernels[0]->get_binary(); std::lock_guard lock(cacheAccessMutex); - ov::intel_gpu::save_binary(cached_bin_name, getProgramBinaries(std::move(program))); - } - } else { - cl::Program program(cl_build_device.get_context(), {cl_build_device.get_device()}, precompiled_kernels); - if (program.build({cl_build_device.get_device()}, batch.options.c_str()) != CL_SUCCESS) - throw std::runtime_error("Failed in building program with a precompiled kernel."); - - program.createKernels(&kernels); + ov::intel_gpu::save_binary(cached_bin_name, binary); } - - { - std::lock_guard lock(_mutex); - for (auto& k : kernels) { - const auto& entry_point = k.getInfo(); - const auto& iter = batch.entry_point_to_id.find(entry_point); - if (iter != batch.entry_point_to_id.end()) { - kernel::ptr kernel = std::make_shared(ocl::ocl_kernel_type(k, cl_build_device.get_usm_helper()), entry_point); - - auto& params = iter->second.first; - auto kernel_part_idx = iter->second.second; - if (compiled_kernels.find(params) != compiled_kernels.end()) { - compiled_kernels[params].push_back(std::make_pair(kernel, kernel_part_idx)); - } else { - compiled_kernels[params] = { std::make_pair(kernel, kernel_part_idx) }; - } - if (_kernel_batch_hash.find(params) == _kernel_batch_hash.end()) { - _kernel_batch_hash[params] = batch.hash_value; - } + } + { + std::lock_guard lock(_mutex); + for (auto& k : kernels) { + auto entry_point = k->get_id(); + const auto& iter = batch.entry_point_to_id.find(entry_point); + if (iter != batch.entry_point_to_id.end()) { + auto& params = iter->second.first; + auto kernel_part_idx = iter->second.second; + if (compiled_kernels.find(params) != compiled_kernels.end()) { + compiled_kernels[params].push_back(std::make_pair(k, kernel_part_idx)); } else { - throw std::runtime_error("Could not find entry point"); + compiled_kernels[params] = { std::make_pair(k, kernel_part_idx) }; } + if (_kernel_batch_hash.find(params) == _kernel_batch_hash.end()) { + _kernel_batch_hash[params] = batch.hash_value; + } + } else { + throw std::runtime_error("Could not find entry point"); } } - } catch (const cl::BuildError& err) { - if (dump_sources && dump_file.good()) - dump_file << "\n/* Build Log:\n"; - - for (auto& p : err.getBuildLog()) { - if (dump_sources && dump_file.good()) - dump_file << p.second << "\n"; - err_log += p.second + '\n'; - } - if (dump_sources && dump_file.good()) - dump_file << "*/\n"; - } - if (!err_log.empty()) { - GPU_DEBUG_INFO << "-------- OpenCL build error" << std::endl; - GPU_DEBUG_INFO << err_log << std::endl; - GPU_DEBUG_INFO << "-------- End of OpenCL build error" << std::endl; - std::stringstream err_ss(err_log); - std::string line; - std::stringstream err; - int cnt = 0; - - while (std::getline(err_ss, line, '\n')) { - if (line.find("error") != std::string::npos) - cnt = 5; - cnt--; - if (cnt > 0) - err << line << std::endl; - else if (cnt == 0) - err << "...." << std::endl; - } - - throw std::runtime_error("Program build failed(" + std::to_string(batch.bucket_id) + + "_part_" - + std::to_string(batch.batch_id) - + "):\n" + err.str()); } } @@ -490,7 +385,7 @@ kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id, const auto res = _cached_kernels.find(id); OPENVINO_ASSERT(_cached_kernels.end() != res, "[GPU] Kernel " + id + " not found in the cached kernel cache!"); - return e.prepare_kernel(res->second->clone(_reuse_kernels)); + return res->second->clone(_reuse_kernels); } std::vector kernels_cache::get_kernels(const kernel_impl_params& params) const { @@ -504,53 +399,15 @@ std::vector kernels_cache::get_kernels(const kernel_impl_params& pa OPENVINO_ASSERT(_kernels.end() != res, "Kernel for {" + current_node_id + "} is not found in the kernel cache!"); OPENVINO_ASSERT(res->second.size() != 0, "Number of kernels should not be zero for " + current_node_id); - auto& engine = params.get_program().get_engine(); - std::vector kernels(res->second.size()); for (auto& k : res->second) { auto& kernel_ptr = k.first; auto kernel_part_idx = k.second; - kernels[kernel_part_idx] = engine.prepare_kernel(kernel_ptr->clone(_reuse_kernels)); + kernels[kernel_part_idx] = kernel_ptr->clone(_reuse_kernels); } return kernels; } -bool kernels_cache::validate_simple_kernel_execution(kernel::ptr krl) { - auto casted = downcast(krl.get()); - auto kernel = casted->get_handle(); - try { - auto casted_dev = dynamic_cast(_device.get()); - OPENVINO_ASSERT(casted_dev != nullptr, "device is nullptr"); - - auto device = casted_dev->get_device(); - cl::Context ctx(device); - - cl::Buffer buffer(ctx, CL_MEM_READ_WRITE, sizeof(uint8_t) * 8); - if (kernel.setArg(0, buffer) != CL_SUCCESS) - return false; - - cl::Event ev; - cl::CommandQueue queue(ctx, device); - if (queue.enqueueNDRangeKernel(kernel, cl::NDRange(), cl::NDRange(8), cl::NDRange(8), nullptr, &ev) != CL_SUCCESS) - return false; - - uint8_t result[8]; - uint8_t expected[8] = { 1, 3, 5, 7, 9, 11, 13, 15 }; - if (queue.enqueueReadBuffer(buffer, CL_TRUE, 0, sizeof(uint8_t) * 8, &result) != CL_SUCCESS) - return false; - - for (int i = 0; i < 8; ++i) { - if (result[i] != expected[i]) - return false; - } - - ev.wait(); - return true; - } catch (...) { - return false; - } -} - void kernels_cache::build_all() { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildAll"); if (!_pending_compilation) @@ -693,7 +550,7 @@ void kernels_cache::save(BinaryOutputBuffer& ob) const { ob << cached_binary.first; ob << is_zebin_binary; if (!is_zebin_binary) { - auto driver_version = downcast(*_device).get_info().driver_version; + auto driver_version = _device->get_info().driver_version; ob << driver_version; } } @@ -702,8 +559,6 @@ void kernels_cache::save(BinaryOutputBuffer& ob) const { void kernels_cache::load(BinaryInputBuffer& ib) { std::unordered_map> precompiled_kernels; - const auto& build_device = downcast(*_device); - size_t num_cached_binaries; ib >> num_cached_binaries; for (size_t i = 0; i < num_cached_binaries; ++i) { @@ -717,7 +572,7 @@ void kernels_cache::load(BinaryInputBuffer& ib) { // Legacy patchtoken path std::string driver_version, current_driver_version; ib >> driver_version; - current_driver_version = build_device.get_info().driver_version; + current_driver_version = _device->get_info().driver_version; if (driver_version != current_driver_version) { OPENVINO_THROW("Driver version mismatch in cached patchtoken kernels"); @@ -725,31 +580,22 @@ void kernels_cache::load(BinaryInputBuffer& ib) { } } - try { + { std::lock_guard lock(_mutex); _cached_kernels.clear(); for (auto& precompiled_kernel : precompiled_kernels) { - cl::vector kernels; - cl::Program program(build_device.get_context(), {build_device.get_device()}, {precompiled_kernel.second}); - program.build({build_device.get_device()}); - program.createKernels(&kernels); - + std::vector kernels; + _builder->build_kernels(precompiled_kernel.second.data(), precompiled_kernel.second.size(), KernelFormat::NATIVE_BIN, "", kernels); for (auto& k : kernels) { - const auto& entry_point = k.getInfo(); + const auto& entry_point = k->get_id(); std::string cached_kernel_id = entry_point + "@" + std::to_string(precompiled_kernel.first); const auto& iter = _cached_kernels.find(cached_kernel_id); if (iter == _cached_kernels.end()) { - _cached_kernels[cached_kernel_id] = std::make_shared(ocl::ocl_kernel_type(k, build_device.get_usm_helper()), entry_point); + _cached_kernels[cached_kernel_id] = k; } } } - } catch (const cl::BuildError& err) { - std::string err_log = ""; - for (auto& p : err.getBuildLog()) { - err_log += p.second + '\n'; - } - OPENVINO_THROW(err_log); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp index e623c547bf691d..ee65b68be0138c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp @@ -7,6 +7,7 @@ #include "intel_gpu/graph/serialization/binary_buffer.hpp" #include "intel_gpu/runtime/device.hpp" #include "intel_gpu/runtime/kernel.hpp" +#include "intel_gpu/runtime/kernel_builder.hpp" #include "intel_gpu/runtime/execution_config.hpp" #include "intel_gpu/graph/kernel_impl_params.hpp" @@ -97,6 +98,7 @@ class kernels_cache { private: static std::mutex _mutex; const device::ptr _device; + std::shared_ptr _builder; std::shared_ptr _task_executor; ExecutionConfig _config; uint32_t _prog_id = 0; @@ -127,8 +129,6 @@ class kernels_cache { void set_kernels_reuse(bool reuse_kernels) { _reuse_kernels = reuse_kernels; } bool get_kernels_reuse() const { return _reuse_kernels; } - bool validate_simple_kernel_execution(kernel::ptr kernel); - // forces compilation of all pending kernels/programs void build_all(); void reset(); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp index 482184340171c5..3da69b2aa48bfc 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp @@ -221,12 +221,11 @@ struct multi_stage_primitive : public typed_primitive_impl { void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override { OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed."); auto& kernel_vec = kernels.begin()->second; - auto& engine = kernels.begin()->first.get_program().get_engine(); _kernels.clear(); _kernels.resize(kernel_vec.size()); for (auto& k : kernel_vec) { auto sub_kernel_idx = k.second; - _kernels[sub_kernel_idx] = engine.prepare_kernel(k.first); + _kernels[sub_kernel_idx] = k.first; } } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index 40f04650658264..b897bbac550196 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -297,12 +297,11 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override { OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed."); auto& kernel_vec = kernels.begin()->second; - auto& engine = kernels.begin()->first.get_program().get_engine(); _kernels.clear(); _kernels.resize(kernel_vec.size()); for (auto& k : kernel_vec) { auto sub_kernel_idx = k.second; - _kernels[sub_kernel_idx] = engine.prepare_kernel(k.first); + _kernels[sub_kernel_idx] = k.first; } } diff --git a/src/plugins/intel_gpu/src/runtime/file_util.cpp b/src/plugins/intel_gpu/src/runtime/file_util.cpp index e73397f1e51844..fc64b8c8af200c 100644 --- a/src/plugins/intel_gpu/src/runtime/file_util.cpp +++ b/src/plugins/intel_gpu/src/runtime/file_util.cpp @@ -7,7 +7,7 @@ namespace ov::intel_gpu { -void save_binary(const std::string &path, std::vector binary) { +void save_binary(const std::string &path, const std::vector &binary) { try { ov::util::save_binary(path, binary); } catch (std::runtime_error&) {} diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp index 9917ddfe6712a1..b3a5c2266f165e 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp @@ -4,7 +4,8 @@ #include "ocl_engine.hpp" #include "intel_gpu/runtime/utils.hpp" -#include "ocl/ocl_kernel.hpp" +#include "ocl_kernel.hpp" +#include "ocl_kernel_builder.hpp" #include "ocl_common.hpp" #include "ocl_memory.hpp" #include "ocl_stream.hpp" @@ -307,13 +308,10 @@ void* ocl_engine::get_user_context() const { return static_cast(cl_device.get_context().get()); } -std::vector ocl_engine::build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options) const { - cl::Program() -} - -kernel::ptr ocl_engine::prepare_kernel(const kernel::ptr kernel) const { - OPENVINO_ASSERT(downcast(kernel.get()) != nullptr); - return kernel; +std::shared_ptr ocl_engine::create_kernel_builder() const { + auto cl_device = std::dynamic_pointer_cast(_device); + OPENVINO_ASSERT(cl_device, "[GPU] Invalid device type for ocl_engine"); + return std::make_shared(*cl_device); } bool ocl_engine::extension_supported(std::string extension) const { diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp index ca505d1ce16812..84908c7addc969 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp @@ -46,7 +46,7 @@ class ocl_engine : public engine { stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override; stream& get_service_stream() const override; - void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector out) const override; + std::shared_ptr create_kernel_builder() const override; #ifdef ENABLE_ONEDNN_FOR_GPU void create_onednn_engine(const ExecutionConfig& config) override; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp index 417a0286df7252..7a59b3c4169b3b 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp @@ -38,5 +38,16 @@ std::vector ocl_kernel::get_binary() const { return binary; } +std::string ocl_kernel::get_build_log() const { + auto program = _compiled_kernel.getInfo(); + auto log = program.getBuildInfo(); + // Assume program was build for only 1 device + // Return first log + if (log.size() > 0) { + return log[0].second; + } + OPENVINO_THROW("[GPU] Failed to retrieve kernel build log"); +} + } // namespace ocl } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp index 035182f664df4d..d46fda1ee02f15 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp @@ -35,6 +35,7 @@ class ocl_kernel : public kernel { } std::vector get_binary() const override; + std::string get_build_log() const override; }; } // namespace ocl diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp new file mode 100644 index 00000000000000..7ded6cf5b8e837 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp @@ -0,0 +1,78 @@ +// Copyright (C) 2016-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/kernel_builder.hpp" +#include "intel_gpu/runtime/device.hpp" + +#include "ocl_device.hpp" +#include "ocl_kernel.hpp" + + +namespace cldnn { +namespace ocl { + +class ocl_kernel_builder : public kernel_builder{ + public: + ocl_kernel_builder(const ocl_device &device) : m_device(device) {} + + void build_kernels(const void *src, + size_t src_bytes, + KernelFormat src_format, + const std::string &options, + std::vector &out) const override { + auto context = m_device.get_context().get(); + + cl_program program_handle; + cl_int err = CL_INVALID_VALUE; + switch (src_format) { + case KernelFormat::SOURCE: { + const char **strings = reinterpret_cast(&src); + const size_t *lenghts = &src_bytes; + const cl_uint count = 1; + program_handle = clCreateProgramWithSource(context, count, strings, lenghts, &err); + break; + } + case KernelFormat::NATIVE_BIN: { + const unsigned char **binaries = reinterpret_cast(&src); + const size_t *lenghts = &src_bytes; + const cl_device_id device_id = m_device.get_device().get(); + const cl_uint count = 1; + program_handle = clCreateProgramWithBinary(context, count, &device_id, lenghts, binaries, nullptr, &err); + break; + } + default: + OPENVINO_THROW("[GPU] Trying to build kernel from unexpected format"); + break; + } + if (err != CL_SUCCESS) { + OPENVINO_THROW("[GPU] Failed to create program during kernel build process"); + } + cl::Program program(program_handle); + if (program.build(m_device.get_device(), options.c_str()) != CL_SUCCESS) { + GPU_DEBUG_INFO << "-------- Kernel build error" << std::endl; + auto log = program.getBuildInfo(); + for (auto &e : log) { + GPU_DEBUG_INFO << e.second; + } + GPU_DEBUG_INFO << "-------- End of Kernel build error" << std::endl; + OPENVINO_THROW("[GPU] Failed to build program"); + } + cl::vector kernels; + if (program.createKernels(&kernels) != CL_SUCCESS) { + OPENVINO_THROW("[GPU] Failed to create kernels"); + } + for (auto& k : kernels) { + const auto &entry_point = k.getInfo(); + out.push_back(std::make_shared(ocl::ocl_kernel_type(k, m_device.get_usm_helper()), entry_point)); + } + } + + private: + const ocl_device &m_device; +}; +} // namespace ocl +} // namespace cldnn + diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index 48fdade2cdbab9..45c453009454e9 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -5,7 +5,7 @@ #include "ze_engine.hpp" #include "intel_gpu/runtime/utils.hpp" #include "openvino/core/except.hpp" -#include "ze/ze_kernel.hpp" +#include "ze_kernel_builder.hpp" #include "ze_api.h" #include "ze_engine_factory.hpp" #include "ze_common.hpp" @@ -26,27 +26,6 @@ namespace cldnn { namespace ze { -namespace { - -ze_module_handle_t ze_create_module_with_level_zero(const cldnn::ze::ze_engine& engine, std::vector binary) { - auto desc = ze_module_desc_t(); - desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; - desc.format = ZE_MODULE_FORMAT_NATIVE; - desc.inputSize = binary.size(); - desc.pInputModule = binary.data(); - desc.pBuildFlags = ""; - desc.pConstants = nullptr; - - ze_module_handle_t ze_module; - - auto ze_device = engine.get_device(); - auto ze_ctx = engine.get_context(); - OV_ZE_EXPECT(zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr)); - return ze_module; -} - -} // namespace - ze_engine::ze_engine(const device::ptr dev, runtime_types runtime_type) : engine(dev) { OPENVINO_ASSERT(runtime_type == runtime_types::ze, "[GPU] Invalid runtime type specified for ZE engine. Only ZE runtime is supported"); @@ -222,58 +201,10 @@ bool ze_engine::is_the_same_buffer(const memory& mem1, const memory& mem2) { return (reinterpret_cast(mem1).get_buffer().get() == reinterpret_cast(mem2).get_buffer().get()); } -void ze_engine::build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector out) const { - ze_module_desc_t module_desc = { - ZE_STRUCTURE_TYPE_MODULE_DESC, - nullptr, - ZE_MODULE_FORMAT_NATIVE, - src_bytes, - reinterpret_cast(src), - options.c_str(), - nullptr // specialization constants - }; - switch (src_format) - { - case KernelFormat::SOURCE: - module_desc.format = ze_module_format_oclc; - break; - case KernelFormat::IL: - module_desc.format = ZE_MODULE_FORMAT_IL_SPIRV; - break; - case KernelFormat::NATIVE_BIN: - module_desc.format = ZE_MODULE_FORMAT_NATIVE; - break; - default: - OPENVINO_THROW("[GPU] Trying to build kernel from unexpected format"); - break; - } - ze_module_handle_t module_handle; - ze_module_build_log_handle_t log_handle; - ze_result_t build_result = zeModuleCreate(get_context(), get_device(), &module_desc, &module_handle, &log_handle); - if (build_result != ZE_RESULT_SUCCESS) { - size_t log_size = 0; - OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, nullptr)); - std::string log(log_size, ' '); - OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, log.data())); - OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle)); - OPENVINO_THROW(log); - } - auto module_holder = std::make_shared(module_handle); - OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle)); - uint32_t kernel_count = 0; - OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, nullptr)); - std::vector kernel_names(kernel_count); - OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, kernel_names.data())); - ze_kernel_flags_t flags = 0; - ze_kernel_desc_t kernel_desc = { - ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, flags, nullptr}; - for (auto name : kernel_names) { - kernel_desc.pKernelName = name; - ze_kernel_handle_t kernel_handle; - OV_ZE_EXPECT(zeKernelCreate(module_handle, &kernel_desc, &kernel_handle)); - auto kernel_holder = std::make_shared(kernel_handle, module_holder); - out.push_back(std::make_shared(kernel_holder, std::string(name))); - } +std::shared_ptr ze_engine::create_kernel_builder() const { + auto casted = std::dynamic_pointer_cast(_device); + OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine"); + return std::make_shared(*casted); } void* ze_engine::get_user_context() const { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp index fb86467fb9cc07..586b830a2de93e 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp @@ -41,7 +41,7 @@ class ze_engine : public engine { stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override; stream& get_service_stream() const override; - void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector out) const override; + std::shared_ptr create_kernel_builder() const override; #ifdef ENABLE_ONEDNN_FOR_GPU void create_onednn_engine(const ExecutionConfig& config) override; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp index 61bc03b46f85e2..4944942ab6c201 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp @@ -16,12 +16,33 @@ namespace ze { class ze_kernel : public kernel { public: + static void create_kernels_from_module(std::shared_ptr module, std::vector &out) { + ze_module_handle_t module_handle = module->get_module_handle(); + uint32_t kernel_count = 0; + OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, nullptr)); + std::vector kernel_names(kernel_count); + // Specification does not mention who is responsible for the returned pointers + // Assume Level Zero owns the pointers and they will remain valid as long as the module resource + OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, kernel_names.data())); + + ze_kernel_flags_t flags = 0; + ze_kernel_desc_t kernel_desc = { + ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, flags, nullptr}; + for (auto name : kernel_names) { + kernel_desc.pKernelName = name; + ze_kernel_handle_t kernel_handle; + OV_ZE_EXPECT(zeKernelCreate(module_handle, &kernel_desc, &kernel_handle)); + auto kernel_holder = std::make_shared(kernel_handle, module); + out.push_back(std::make_shared(kernel_holder, std::string(name))); + } + } + ze_kernel(std::shared_ptr kernel, const std::string& kernel_id) : m_kernel(kernel) , m_kernel_id(kernel_id) { } - ze_kernel_handle_t get_kernel() { return m_kernel->get_kernel(); } - ze_module_handle_t get_module() { return m_kernel->get_module(); } + ze_kernel_handle_t get_kernel_handle() const { return m_kernel->get_kernel_handle(); } + ze_module_handle_t get_module_handle() const { return m_kernel->get_module()->get_module_handle(); } std::string get_id() const override { return m_kernel_id; } std::shared_ptr clone(bool reuse_kernel_handle = false) const override { @@ -29,20 +50,21 @@ class ze_kernel : public kernel { return std::make_shared(m_kernel, m_kernel_id); } else { ze_kernel_handle_t cloned_handle; - ze_module_handle_t module_handle = m_kernel->get_module(); + ze_module_handle_t module_handle = get_module_handle(); ze_kernel_desc_t descriptor; descriptor.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC; descriptor.pNext = nullptr; descriptor.flags = 0; descriptor.pKernelName = m_kernel_id.c_str(); OV_ZE_EXPECT(zeKernelCreate(module_handle, &descriptor, &cloned_handle)); - return std::make_shared(cloned_handle, module_handle, m_kernel_id); + auto kernel_holder = std::make_shared(cloned_handle, m_kernel->get_module()); + return std::make_shared(kernel_holder, m_kernel_id); } } std::vector get_binary() const override { size_t binary_size = 0; - ze_module_handle_t module_handle = m_kernel->get_module(); + ze_module_handle_t module_handle = get_module_handle(); OV_ZE_EXPECT(zeModuleGetNativeBinary(module_handle, &binary_size, nullptr)); std::vector binary(binary_size); @@ -50,6 +72,17 @@ class ze_kernel : public kernel { return binary; } + + std::string get_build_log() const override { + ze_module_build_log_handle_t build_log_handle = m_kernel->get_module()->get_build_log_handle(); + size_t log_size = 0; + OV_ZE_EXPECT(zeModuleBuildLogGetString(build_log_handle, &log_size, nullptr)); + + std::string log(log_size, ' '); + OV_ZE_EXPECT(zeModuleBuildLogGetString(build_log_handle, &log_size, log.data())); + return log; + } + private: std::shared_ptr m_kernel; std::string m_kernel_id; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp new file mode 100644 index 00000000000000..7aae55d6a7f990 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp @@ -0,0 +1,67 @@ +// Copyright (C) 2016-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/kernel_builder.hpp" +#include "intel_gpu/runtime/device.hpp" + +#include "ze_device.hpp" +#include "ze_kernel.hpp" +#include "ze_common.hpp" + +namespace cldnn { +namespace ze { + +class ze_kernel_builder : public kernel_builder{ + public: + ze_kernel_builder(const ze_device &device) : m_device(device) {} + + void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector &out) const override { + ze_module_desc_t module_desc = { + ZE_STRUCTURE_TYPE_MODULE_DESC, + nullptr, + ZE_MODULE_FORMAT_NATIVE, + src_bytes, + reinterpret_cast(src), + options.c_str(), + nullptr // specialization constants + }; + switch (src_format) { + case KernelFormat::SOURCE: { + module_desc.format = ze_module_format_oclc; + break; + } + case KernelFormat::NATIVE_BIN: { + module_desc.format = ZE_MODULE_FORMAT_NATIVE; + break; + } + default: + OPENVINO_THROW("[GPU] Trying to build kernel from unexpected format"); + break; + } + ze_module_handle_t module_handle; + ze_module_build_log_handle_t log_handle; + ze_result_t build_result = zeModuleCreate(m_device.get_context(), m_device.get_device(), &module_desc, &module_handle, &log_handle); + if (build_result != ZE_RESULT_SUCCESS) { + size_t log_size = 0; + OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, nullptr)); + std::string log(log_size, ' '); + OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, log.data())); + OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle)); + GPU_DEBUG_INFO << "-------- Kernel build error" << std::endl; + GPU_DEBUG_INFO << log << std::endl; + GPU_DEBUG_INFO << "-------- End of Kernel build error" << std::endl; + OPENVINO_THROW("[GPU] Failed to build module"); + } + auto module_holder = std::make_shared(module_handle, log_handle); + ze_kernel::create_kernels_from_module(module_holder, out); + } + + private: + const ze_device &m_device; +}; +} // namespace ze +} // namespace cldnn + diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp index 72c98c4b9283e7..5d7ab17b6b8367 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp @@ -22,12 +22,11 @@ class ze_kernel_holder { ~ze_kernel_holder() { OV_ZE_WARN(zeKernelDestroy(m_kernel)); } - ze_kernel_handle_t get_kernel() { return m_kernel; } - ze_module_handle_t get_module() { return m_module->get_module(); } + ze_kernel_handle_t get_kernel_handle() { return m_kernel; } + std::shared_ptr get_module() { return m_module; } private: ze_kernel_handle_t m_kernel; std::shared_ptr m_module; -} - +}; } // namespace ze } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp index 9288584fc5c681..fcd5c5840b935f 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp @@ -5,6 +5,8 @@ #pragma once #include "ze_common.hpp" +#include +#include namespace cldnn { namespace ze { @@ -12,18 +14,21 @@ namespace ze { // RAII wrapper for Level Zero module class ze_module_holder { public: - // Take ownership of existing module handle - explicit ze_module_holder(ze_module_handle_t handle) : m_handle(handle) {} + // Take ownership of existing module and build log handles + explicit ze_module_holder(ze_module_handle_t module, ze_module_build_log_handle_t build_log) : m_module(module), m_build_log(build_log) {} ze_module_holder(const ze_module_holder& other) = delete; ze_module_holder& operator=(const ze_module_holder& other) = delete; ~ze_module_holder() { - OV_ZE_WARN(zeModuleDestroy(m_handle)); + OV_ZE_WARN(zeModuleBuildLogDestroy(m_build_log)); + OV_ZE_WARN(zeModuleDestroy(m_module)); } - ze_module_handle_t get_module() { return m_handle; } -private: - ze_module_handle_t m_handle; + ze_module_handle_t get_module_handle() const { return m_module; } + ze_module_build_log_handle_t get_build_log_handle() const { return m_build_log; } +private: + ze_module_handle_t m_module; + ze_module_build_log_handle_t m_build_log; }; } // namespace ze } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index a16ceb097a46e7..ac081e262383a9 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -234,7 +234,7 @@ void ze_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args_ std::lock_guard guard(m); auto& ze_kernel = downcast(kernel); - auto& kern = ze_kernel.get_handle(); + auto kern = ze_kernel.get_kernel_handle(); set_arguments_impl(kern, args_desc.arguments, args); } @@ -245,7 +245,7 @@ event::ptr ze_stream::enqueue_kernel(kernel& kernel, bool is_output) { auto& ze_kernel = downcast(kernel); - auto& kern = ze_kernel.get_handle(); + auto kern = ze_kernel.get_kernel_handle(); std::vector dep_events; std::vector* dep_events_ptr = nullptr; From 9ba3c9cd28d9313d2bbff5e889623cc771b5238e Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 7 Nov 2025 09:40:03 +0000 Subject: [PATCH 45/74] Fix style --- src/common/util/include/openvino/util/file_util.hpp | 2 +- src/common/util/src/file_util.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/util/include/openvino/util/file_util.hpp b/src/common/util/include/openvino/util/file_util.hpp index 88407538a36cc8..bfb60e8a126b82 100644 --- a/src/common/util/include/openvino/util/file_util.hpp +++ b/src/common/util/include/openvino/util/file_util.hpp @@ -316,7 +316,7 @@ std::vector load_binary(const std::string& path); * @brief save binary data to file * @param path - binary file path to store */ -void save_binary(const std::string& path, const std::vector &binary); +void save_binary(const std::string& path, const std::vector& binary); void save_binary(const std::string& path, const char* binary, size_t bin_size); /** diff --git a/src/common/util/src/file_util.cpp b/src/common/util/src/file_util.cpp index db39f3a61749d2..22dd2daf3ca75f 100644 --- a/src/common/util/src/file_util.cpp +++ b/src/common/util/src/file_util.cpp @@ -515,7 +515,7 @@ std::vector ov::util::load_binary(const std::string& path) { return {}; } -void ov::util::save_binary(const std::string& path, const std::vector &binary) { +void ov::util::save_binary(const std::string& path, const std::vector& binary) { save_binary(path, reinterpret_cast(&binary[0]), binary.size()); return; } From d0f157c9e54162b53e683229f653ad02426331ac Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 7 Nov 2025 16:05:19 +0000 Subject: [PATCH 46/74] Remove OpenCL dependency when running L0 * removed opencl from targets: openvino_intel_gpu_kernels, openvino_intel_gpu_runtime * opencl will still be linked to openvino gpu plugin even with L0 rt --- src/plugins/intel_gpu/CMakeLists.txt | 2 +- src/plugins/intel_gpu/cmake/utils.cmake | 9 ++++----- .../intel_gpu/include/intel_gpu/runtime/event.hpp | 10 ++++++++++ .../intel_gpu/include/intel_gpu/runtime/memory.hpp | 2 -- .../intel_gpu/include/intel_gpu/runtime/stream.hpp | 3 ++- .../src/graph/impls/onednn/primitive_onednn_base.h | 11 ++++------- src/plugins/intel_gpu/src/graph/network.cpp | 6 +++--- .../intel_gpu/src/kernel_selector/CMakeLists.txt | 2 +- src/plugins/intel_gpu/src/runtime/CMakeLists.txt | 7 +++++-- src/plugins/intel_gpu/src/runtime/device_query.cpp | 2 ++ src/plugins/intel_gpu/src/runtime/engine.cpp | 2 ++ src/plugins/intel_gpu/src/runtime/memory.cpp | 13 ------------- src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp | 10 ---------- src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp | 5 ----- .../intel_gpu/src/runtime/ocl/ocl_stream.cpp | 5 +++++ .../intel_gpu/src/runtime/ocl/ocl_stream.hpp | 1 + src/plugins/intel_gpu/src/runtime/stream.cpp | 11 ----------- .../intel_gpu/src/runtime/ze/ze_engine_factory.hpp | 2 +- src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp | 2 +- src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 7 ++++++- src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp | 1 + 21 files changed, 49 insertions(+), 64 deletions(-) diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt index 38e3dafbc3305f..531047df7dbf10 100644 --- a/src/plugins/intel_gpu/CMakeLists.txt +++ b/src/plugins/intel_gpu/CMakeLists.txt @@ -77,7 +77,7 @@ ov_add_plugin(NAME ${TARGET_NAME} target_compile_options(${TARGET_NAME} PRIVATE $<$:$,/Os,-Os>>) -target_link_libraries(${TARGET_NAME} PRIVATE openvino_intel_gpu_graph openvino::pugixml) +target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL openvino_intel_gpu_graph openvino::pugixml) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include/) diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake index 5b61368a5d4742..f346f50f8792cf 100644 --- a/src/plugins/intel_gpu/cmake/utils.cmake +++ b/src/plugins/intel_gpu/cmake/utils.cmake @@ -4,12 +4,11 @@ function(ov_gpu_set_runtime_interface_for TARGET_NAME) if(GPU_RT_TYPE STREQUAL "L0") - target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_ZE_RT=1) - target_link_libraries(${TARGET_NAME} PUBLIC LevelZero::LevelZero) + target_compile_definitions(${TARGET_NAME} PRIVATE OV_GPU_WITH_ZE_RT=1) + target_link_libraries(${TARGET_NAME} PRIVATE LevelZero::LevelZero) elseif(GPU_RT_TYPE STREQUAL "OCL") - target_compile_definitions(${TARGET_NAME} PUBLIC OV_GPU_WITH_OCL_RT=1) - # Do not link OpenCL as It is already linked to the targets that require it - # target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::NewHeaders OpenCL::OpenCL) + target_compile_definitions(${TARGET_NAME} PRIVATE OV_GPU_WITH_OCL_RT=1) + target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL) else() message(FATAL_ERROR "Invalid GPU runtime type: `${GPU_RT_TYPE}` Only `L0` and `OCL` are supported") endif() diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp index dffa64173fe72c..5451cd47c7daf2 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace cldnn { struct user_event; @@ -32,6 +33,15 @@ struct event { _profiling_captured = false; _profiling_info.clear(); } + // Set event profiling data instead of retrieving it from event object + void set_profiling(uint64_t duration_nsec) { + auto stage = instrumentation::profiling_stage::executing; + auto duration = std::chrono::nanoseconds(duration_nsec); + auto period = std::make_shared(duration); + + _profiling_info.push_back({ stage, period }); + _profiling_captured = true; + } // returns true if handler has been successfully added bool add_event_handler(event_handler handler, void* data); diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp index 67843abbe86ec0..4e0a1191982c0b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp @@ -223,8 +223,6 @@ struct surfaces_lock { surfaces_lock(const surfaces_lock& other) = delete; surfaces_lock& operator=(const surfaces_lock& other) = delete; - - static std::unique_ptr create(engine_types engine_type, std::vector mem, const stream& stream); }; template diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp index 1e8300f92135e7..98f6d87066ac34 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp @@ -17,6 +17,7 @@ #endif namespace cldnn { +struct surfaces_lock; // Possible sync methods for kernels in stream enum class SyncMethods { @@ -66,12 +67,12 @@ class stream { virtual void wait_for_events(const std::vector& events) = 0; virtual event::ptr create_user_event(bool set) = 0; virtual event::ptr create_base_event() = 0; + virtual std::unique_ptr create_surfaces_lock(const std::vector &mem) const = 0; virtual event::ptr aggregate_events(const std::vector& events, bool group = false, bool is_output = false); QueueTypes get_queue_type() const { return m_queue_type; } SyncMethods get_sync_method() const { return m_sync_method; } - static QueueTypes detect_queue_type(engine_types engine_type, void* queue_handle); static SyncMethods get_expected_sync_method(const ExecutionConfig& config); #ifdef ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h index 38034280659e0e..122cb04d8c7640 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h @@ -12,7 +12,6 @@ #include "intel_gpu/runtime/file_util.hpp" #include "to_string_utils.h" #include "utils.hpp" -#include "runtime/ocl/ocl_event.hpp" #include "intel_gpu/primitives/reorder.hpp" @@ -544,8 +543,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl { try { _prim.execute(stream.get_onednn_stream(), _args[net_id]); } catch (dnnl::error& err) { - auto err_code = err.status == dnnl_status_t::dnnl_out_of_memory ? CL_OUT_OF_RESOURCES : CL_INVALID_OPERATION; - ocl::rethrow(err.what(), err_code, _engine->get_device_info()); + OPENVINO_THROW(err.what()); } if (_enable_profiling) { @@ -554,12 +552,11 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl { stream.wait(); std::vector duration = dnnl::get_profiling_data(stream.get_onednn_stream(), dnnl::profiling_data_kind::time); - if (duration.empty()) { - event = std::make_shared(0); - } else { + event = stream.create_user_event(true); + if (!duration.empty()) { OPENVINO_ASSERT(duration.size() == 1, "[GPU] oneDNN profiling data is expected to have info only for single primitive ", "actual number is ", duration.size()); - event = std::make_shared(duration[0]); + event->set_profiling(duration[0]); } } else { diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 72310ad7ba1c28..36fce1bc89b44a 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -721,13 +721,13 @@ std::map network::execute(const std::vector) diff --git a/src/plugins/intel_gpu/src/runtime/CMakeLists.txt b/src/plugins/intel_gpu/src/runtime/CMakeLists.txt index 384ee9c0bef7a5..997dcc617791f2 100644 --- a/src/plugins/intel_gpu/src/runtime/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/runtime/CMakeLists.txt @@ -27,7 +27,11 @@ if(NOT OV_COMPILER_IS_INTEL_LLVM) list(REMOVE_ITEM LIBRARY_SOURCES_OCL ${SYCL_SOURCES}) endif() -set(LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_MAIN} ${LIBRARY_SOURCES_OCL}) +set(LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_MAIN}) + +if(GPU_RT_TYPE STREQUAL "OCL") + list(APPEND LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_OCL}) +endif() if(GPU_RT_TYPE STREQUAL "L0") list(APPEND LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_ZE}) @@ -69,7 +73,6 @@ ov_set_threading_interface_for(${TARGET_NAME}) ov_gpu_set_runtime_interface_for(${TARGET_NAME}) target_link_libraries(${TARGET_NAME} PRIVATE - OpenCL::OpenCL openvino::itt openvino::runtime::dev ) diff --git a/src/plugins/intel_gpu/src/runtime/device_query.cpp b/src/plugins/intel_gpu/src/runtime/device_query.cpp index 8a0ee71944483c..7ec5d6d90edabe 100644 --- a/src/plugins/intel_gpu/src/runtime/device_query.cpp +++ b/src/plugins/intel_gpu/src/runtime/device_query.cpp @@ -18,12 +18,14 @@ device_query::device_query(engine_types engine_type, int target_tile_id, bool initialize_devices) { switch (runtime_type) { +#ifdef OV_GPU_WITH_OCL_RT case runtime_types::ocl: { OPENVINO_ASSERT(engine_type == engine_types::ocl || engine_type == engine_types::sycl); ocl::ocl_device_detector ocl_detector; _available_devices = ocl_detector.get_available_devices(user_context, user_device, ctx_device_id, target_tile_id, initialize_devices); break; } +#endif #ifdef OV_GPU_WITH_ZE_RT case runtime_types::ze: { OPENVINO_ASSERT(engine_type == engine_types::ze); diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp index ea7820f57353be..f35d4d577230f4 100644 --- a/src/plugins/intel_gpu/src/runtime/engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/engine.cpp @@ -263,9 +263,11 @@ std::shared_ptr engine::create(engine_types engine_type, runtime_ ret = ocl::create_sycl_engine(device, runtime_type); break; #endif // OV_GPU_WITH_SYCL +#ifdef OV_GPU_WITH_OCL_RT case engine_types::ocl: ret = ocl::create_ocl_engine(device, runtime_type); break; +#endif #ifdef OV_GPU_WITH_ZE_RT case engine_types::ze: ret = ze::create_ze_engine(device, runtime_type); diff --git a/src/plugins/intel_gpu/src/runtime/memory.cpp b/src/plugins/intel_gpu/src/runtime/memory.cpp index a720ee7ec4ea31..f61aa202b9b4bc 100644 --- a/src/plugins/intel_gpu/src/runtime/memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/memory.cpp @@ -7,8 +7,6 @@ #include "intel_gpu/runtime/stream.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" -#include "ocl/ocl_memory.hpp" - #include #include #include @@ -45,15 +43,4 @@ memory::memory(engine* engine, const layout& layout, allocation_type type, std:: : _engine(engine), _layout(layout), _bytes_count(_layout.bytes_count()), m_mem_tracker(mem_tracker), _type(type) { } -std::unique_ptr surfaces_lock::create(engine_types engine_type, std::vector mem, const stream& stream) { - switch (engine_type) { - case engine_types::sycl: - case engine_types::ocl: - return std::unique_ptr(new ocl::ocl_surfaces_lock(mem, stream)); - case engine_types::ze: - return nullptr; // TODO: implement once we have support for surface sharing - default: throw std::runtime_error("Unsupported engine type in surfaces_lock::create"); - } -} - } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp index 29a27e5ea6acee..26fbe6fd272487 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp @@ -100,16 +100,6 @@ static const std::vector profiling_periods{ }; bool ocl_event::get_profiling_info_impl(std::list& info) { - if (duration_nsec.has_value()) { - auto stage = instrumentation::profiling_stage::executing; - auto duration = std::chrono::nanoseconds(duration_nsec.value()); - auto period = std::make_shared(duration); - - info.push_back({ stage, period }); - - return true; - } - if (!is_event_profiled(_event)) return true; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp index d51b7de50167b1..7efb87d8775405 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp @@ -24,10 +24,6 @@ struct ocl_event : public ocl_base_event { : ocl_base_event(queue_stamp) , _event(ev) {} - ocl_event(uint64_t duration_nsec, uint64_t queue_stamp = 0) - : ocl_base_event(queue_stamp) - , duration_nsec(duration_nsec) {} - cl::Event& get() override { return _event; } private: @@ -45,7 +41,6 @@ struct ocl_event : public ocl_base_event { protected: cl::Event _event; - std::optional duration_nsec; }; struct ocl_events : public ocl_base_event { diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp index a0cb17ba40f615..a7a02d69f205f7 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp @@ -11,6 +11,7 @@ #include "intel_gpu/runtime/debug_configuration.hpp" #include "ocl_kernel.hpp" #include "ocl_common.hpp" +#include "ocl_memory.hpp" #include #include @@ -366,6 +367,10 @@ event::ptr ocl_stream::create_base_event() { return std::make_shared(ret_ev, ++_queue_counter); } +std::unique_ptr ocl_stream::create_surfaces_lock(const std::vector &mem) const { + return std::unique_ptr(new ocl::ocl_surfaces_lock(mem, *this)); +} + void ocl_stream::flush() const { try { get_cl_queue().flush(); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp index b9c51ccb046508..c86089bce08b46 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp @@ -47,6 +47,7 @@ class ocl_stream : public stream { void enqueue_barrier() override; event::ptr create_user_event(bool set) override; event::ptr create_base_event() override; + std::unique_ptr create_surfaces_lock(const std::vector &mem) const override; const cl::UsmHelper& get_usm_helper() const { return _engine.get_usm_helper(); } diff --git a/src/plugins/intel_gpu/src/runtime/stream.cpp b/src/plugins/intel_gpu/src/runtime/stream.cpp index 913d84d8f476f5..2bd8a74f857a4d 100644 --- a/src/plugins/intel_gpu/src/runtime/stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/stream.cpp @@ -4,21 +4,10 @@ #include "intel_gpu/runtime/stream.hpp" -#include "ocl/ocl_stream.hpp" - #include namespace cldnn { -QueueTypes stream::detect_queue_type(engine_types engine_type, void* queue_handle) { - switch (engine_type) { - case engine_types::sycl: - case engine_types::ocl: - return ocl::ocl_stream::detect_queue_type(queue_handle); - default: throw std::runtime_error("Invalid engine type"); - } -} - SyncMethods stream::get_expected_sync_method(const ExecutionConfig& config) { auto profiling = config.get_enable_profiling(); auto queue_type = config.get_queue_type(); diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp index 40c944a1ca7512..abd2946a2d8e56 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp @@ -10,7 +10,7 @@ namespace cldnn { namespace ze { -// Factory for ze_engine creation. It's moved outside of ze_engine class to avoid possible CL includes conflict +// Factory for ze_engine creation. It's moved outside of ze_engine class to avoid possible L0 includes conflict // between different engines in engine.cpp file std::shared_ptr create_ze_engine(const device::ptr device, runtime_types runtime_type); diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp index 680753d6409084..9fadd2c6f1a06b 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp @@ -26,7 +26,7 @@ struct ze_events : public ze_base_event { } std::optional query_timestamp() override { return std::nullopt; } - ze_event_handle_t get_handle() const { return m_last_event; } + ze_event_handle_t get_handle() const override { return m_last_event; } bool get_profiling_info_impl(std::list& info) override; protected: diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index ac081e262383a9..920c0ac1aa0c83 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -334,8 +334,13 @@ event::ptr ze_stream::create_base_event() { return m_ev_factory->create_event(++m_queue_counter); } +std::unique_ptr ze_stream::create_surfaces_lock(const std::vector &mem) const { + // Level Zero egnine currently does not support surfaces lock + return nullptr; +} + void ze_stream::flush() const { - //Immediate Command List submits commands immediately - no flush impl + // Immediate Command List submits commands immediately - no flush impl } void ze_stream::finish() const { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp index 03e9c12a578a9a..06647fda325304 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp @@ -48,6 +48,7 @@ class ze_stream : public stream { void enqueue_barrier() override; event::ptr create_user_event(bool set) override; event::ptr create_base_event() override; + std::unique_ptr create_surfaces_lock(const std::vector &mem) const override; #ifdef ENABLE_ONEDNN_FOR_GPU dnnl::stream& get_onednn_stream() override; From 4ec431e29e25573f2cc5654301e5496ca0f83f1c Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 12 Nov 2025 09:58:48 +0000 Subject: [PATCH 47/74] Fix OCL program build --- src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp index 7ded6cf5b8e837..97b624e4915b47 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp @@ -51,7 +51,7 @@ class ocl_kernel_builder : public kernel_builder{ OPENVINO_THROW("[GPU] Failed to create program during kernel build process"); } cl::Program program(program_handle); - if (program.build(m_device.get_device(), options.c_str()) != CL_SUCCESS) { + if (program.build({m_device.get_device()}, options.c_str()) != CL_SUCCESS) { GPU_DEBUG_INFO << "-------- Kernel build error" << std::endl; auto log = program.getBuildInfo(); for (auto &e : log) { From 3e6b368b585c1087a23b1c1db75e12fae5245831 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Mon, 17 Nov 2025 15:33:16 +0000 Subject: [PATCH 48/74] Add gfx_ver parsing for L0 --- .../intel_gpu/src/runtime/ze/ze_device.cpp | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index ad37f85a4c440b..ef09a71950d266 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -55,6 +55,32 @@ gpu_arch convert_ngen_arch(ngen::HW gpu_arch) { } #endif +gfx_version parse_version(uint32_t gmdid) { + union GMDID { + uint32_t value; + struct { + uint32_t revision : 6; + uint32_t reserved : 8; + uint32_t release : 8; + uint32_t architecture : 10; + }; + }; + + GMDID gmd_id = {gmdid}; + if (gmd_id.architecture > 0 && gmd_id.architecture < 100) { + // New format + return { static_cast(gmd_id.architecture), static_cast(gmd_id.release), static_cast(gmd_id.revision)}; + } else { + // Old format + uint32_t ver = gmdid; + uint16_t major = ver >> 16; + uint8_t minor = (ver >> 8) & 0xFF; + uint8_t revision = ver & 0xFF; + + return {major, minor, revision}; + } +} + bool supports_extension(const std::vector& extensions, const std::string& ext_name, uint32_t ext_ver) { return std::find_if(extensions.begin(), extensions.end(), [&ext_name, &ext_ver](const ze_driver_extension_properties_t& ep) { return std::string(ep.name) == ext_name && ep.version == ext_ver; @@ -190,7 +216,6 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities; // FIXME: Could not find how to retrieve those from L0 - info.gfx_ver = {0, 0, 0}; info.supports_work_group_collective_functions = false; info.supports_intel_planar_yuv = false; info.supports_khr_subgroups = true; @@ -200,7 +225,10 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.supports_intel_required_subgroup_size = true; info.supports_queue_families = true; - info.ip_version = ip_version_properties.ipVersion; + if (supports_ip_version) { + info.ip_version = ip_version_properties.ipVersion; + info.gfx_ver = parse_version(ip_version_properties.ipVersion); + } info.sub_device_idx = (std::numeric_limits::max)(); info.device_id = device_properties.deviceId; From cfdbb0263b064dcdff1d24f33f730971c1960a6e Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Tue, 18 Nov 2025 16:38:43 +0000 Subject: [PATCH 49/74] Skip L0 symbol table kernel --- src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp index 4944942ab6c201..3a726e24930d24 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp @@ -28,12 +28,17 @@ class ze_kernel : public kernel { ze_kernel_flags_t flags = 0; ze_kernel_desc_t kernel_desc = { ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, flags, nullptr}; - for (auto name : kernel_names) { - kernel_desc.pKernelName = name; + for (auto name_cstr : kernel_names) { + auto name = std::string(name_cstr); + // L0 returns Intel_Symbol_Table_Void_Program that does not correspond to actual kernel + if (name == "Intel_Symbol_Table_Void_Program") { + continue; + } + kernel_desc.pKernelName = name_cstr; ze_kernel_handle_t kernel_handle; OV_ZE_EXPECT(zeKernelCreate(module_handle, &kernel_desc, &kernel_handle)); auto kernel_holder = std::make_shared(kernel_handle, module); - out.push_back(std::make_shared(kernel_holder, std::string(name))); + out.push_back(std::make_shared(kernel_holder, name)); } } From 04b086a79ad4bbe03ecb15c7e74663455041079b Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 20 Nov 2025 13:44:53 +0000 Subject: [PATCH 50/74] Avoid cl_mem usage with L0 engine --- .../include/intel_gpu/runtime/memory_caps.hpp | 13 ++++++++ .../intel_gpu/src/plugin/remote_tensor.cpp | 9 ++++- .../intel_gpu/src/runtime/ze/ze_engine.cpp | 4 +-- .../intel_gpu/src/runtime/ze/ze_memory.hpp | 33 ++++++++----------- 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp index 78d4b99b32f99a..4d04792bae0bb0 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp @@ -81,6 +81,19 @@ enum class shared_mem_type { shared_mem_usm }; +inline std::ostream& operator<<(std::ostream& out, const shared_mem_type& mem_type) { + switch (mem_type) { + case shared_mem_type::shared_mem_empty: out << "shared_mem_empty"; break; + case shared_mem_type::shared_mem_buffer: out << "shared_mem_buffer"; break; + case shared_mem_type::shared_mem_image: out << "shared_mem_image"; break; + case shared_mem_type::shared_mem_vasurface: out << "shared_mem_vasurface"; break; + case shared_mem_type::shared_mem_dxbuffer: out << "shared_mem_dxbuffer"; break; + case shared_mem_type::shared_mem_usm: out << "shared_mem_usm"; break; + default: out << "unknown"; break; + } + return out; +} + using shared_handle = void*; using shared_surface = uint32_t; diff --git a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp index c2524c8ab3d1a9..b62a95f5f2bd62 100644 --- a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp +++ b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp @@ -317,7 +317,14 @@ void RemoteTensorImpl::allocate() { switch (m_mem_type) { case TensorType::BT_BUF_INTERNAL: { - m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem, reset); + // BT_BUF_INTERNAL should map to cl_mem however L0 engine can not allocate cl_mem + if (engine.supports_allocation(cldnn::allocation_type::cl_mem)) { + m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem, reset); + } else { + // Fall back to usm_host and override memory type + m_mem_type = TensorType::BT_USM_HOST_INTERNAL; + m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_host, reset); + } break; } case TensorType::BT_USM_HOST_INTERNAL: { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index 45c453009454e9..6b64c617560ce1 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -153,7 +153,7 @@ memory::ptr ze_engine::reinterpret_buffer(const memory& memory, const layout& ne memory.get_mem_tracker()); } - return nullptr; + OPENVINO_THROW("[GPU] Trying to reinterpret non usm buffer"); } memory::ptr ze_engine::reinterpret_handle(const layout& new_layout, shared_mem_params params) { @@ -167,7 +167,7 @@ memory::ptr ze_engine::reinterpret_handle(const layout& new_layout, shared_mem_p ") than specified layout (", requested_mem_size, ")"); return std::make_shared(this, new_layout, usm_buffer, nullptr); } else { - return nullptr; + OPENVINO_THROW("[GPU] Unsupported shared memory type: ", params.mem_type); } } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp index 9089e8a711758b..c76d6c46cc31fe 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp @@ -26,16 +26,19 @@ struct lockable_gpu_mem { class UsmHolder { public: - UsmHolder(ze_context_handle_t context, void* ptr, bool shared_memory = false) : _context(context), _ptr(ptr), _shared_memory(shared_memory) { } + UsmHolder(ze_context_handle_t context, void* ptr, bool shared_memory = false) : _context(context), _ptr(ptr), _shared_memory(shared_memory) { + if (ptr == nullptr) + OPENVINO_THROW("[GPU] Can not create UsmHolder with nullptr"); + } + UsmHolder(const UsmHolder&) = delete; + UsmHolder& operator=(const UsmHolder&) = delete; + void* ptr() { return _ptr; } void memFree() { - try { - if (!_shared_memory) - zeMemFree(_context, _ptr); - } catch (...) { - // Exception may happen only when clMemFreeINTEL function is unavailable, thus can't free memory properly + if (!_shared_memory && _ptr != nullptr) { + OV_ZE_WARN(zeMemFree(_context, _ptr)); + _ptr = nullptr; } - _ptr = nullptr; } ~UsmHolder() { @@ -58,7 +61,6 @@ class UsmMemory { , _device(device) , _usm_pointer(std::make_shared(_context, reinterpret_cast(usm_ptr) + offset, true)) {} - // Get methods returns original pointer allocated by openCL. void* get() const { return _usm_pointer->ptr(); } void allocateHost(size_t size) { @@ -69,7 +71,7 @@ class UsmMemory { void* memory = nullptr; OV_ZE_EXPECT(zeMemAllocHost(_context, &host_desc, size, 1, &memory)); - _allocate(memory); + _usm_pointer = std::make_shared(_context, memory); } void allocateShared(size_t size, uint32_t ordinal) { @@ -86,7 +88,7 @@ class UsmMemory { void* memory = nullptr; OV_ZE_EXPECT(zeMemAllocShared(_context, &device_desc, &host_desc, size, 1, _device, &memory)); - _allocate(memory); + _usm_pointer = std::make_shared(_context, memory); } void allocateDevice(size_t size, uint32_t ordinal) { @@ -98,12 +100,12 @@ class UsmMemory { void* memory = nullptr; OV_ZE_EXPECT(zeMemAllocDevice(_context, &device_desc, size, 4096, _device, &memory)); - _allocate(memory); + _usm_pointer = std::make_shared(_context, memory); } void freeMem() { if (!_usm_pointer) - throw std::runtime_error("[CL ext] Can not free memory of empty UsmHolder"); + OPENVINO_THROW("[GPU] Can not free memory of empty UsmHolder"); _usm_pointer->memFree(); } @@ -113,13 +115,6 @@ class UsmMemory { ze_context_handle_t _context; ze_device_handle_t _device; std::shared_ptr _usm_pointer = nullptr; - -private: - void _allocate(void* ptr) { - if (!ptr) - throw std::runtime_error("[CL ext] Can not allocate nullptr for USM type."); - _usm_pointer = std::make_shared(_context, ptr); - } }; struct gpu_usm : public lockable_gpu_mem, public memory { From 6c6664d6f0cbccb01a50988749e2eea12ca97dfa Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Mon, 24 Nov 2025 21:40:45 +0100 Subject: [PATCH 51/74] Fix windows build --- src/plugins/intel_gpu/CMakeLists.txt | 2 +- .../intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp | 4 ++++ src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp | 1 - src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt | 1 - src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 6 ++++++ src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 2 +- 6 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt index 531047df7dbf10..38e3dafbc3305f 100644 --- a/src/plugins/intel_gpu/CMakeLists.txt +++ b/src/plugins/intel_gpu/CMakeLists.txt @@ -77,7 +77,7 @@ ov_add_plugin(NAME ${TARGET_NAME} target_compile_options(${TARGET_NAME} PRIVATE $<$:$,/Os,-Os>>) -target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL openvino_intel_gpu_graph openvino::pugixml) +target_link_libraries(${TARGET_NAME} PRIVATE openvino_intel_gpu_graph openvino::pugixml) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include/) diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp index 66b16345984bc9..9d8bd8aa410e00 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp @@ -8,11 +8,15 @@ # define NOMINMAX #endif + +// Do not include DirectX / VA wrappers when running with L0 runtime as they depend on OCL +#ifndef OV_GPU_WITH_ZE_RT #ifdef _WIN32 # include #else # include #endif +#endif #include "openvino/runtime/iremote_tensor.hpp" #include "intel_gpu/runtime/memory_caps.hpp" diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index 4420f00deae4c9..cdb486c13cbb9d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -5,7 +5,6 @@ #include "utils.hpp" #include #include -#include namespace cldnn { namespace onednn { diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt index fe8582b1a0449c..76652d40e77c1d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt @@ -75,7 +75,6 @@ if(COMMAND add_cpplint_target) endif() target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE openvino::runtime::dev) -ov_gpu_set_runtime_interface_for(${TARGET_NAME}) target_include_directories(${TARGET_NAME} PRIVATE $) diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index d876d52f97baed..916559e401be4c 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -166,6 +166,12 @@ if(ENABLE_ONEDNN_FOR_GPU) "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/config" "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include" "${ONEDNN_GPU_DIR}/third_party/ngen") + if(GPU_RUNTIME STREQUAL "L0") + # OneDNN LevelZero headers can be found at ${ONEDNN_GPU_DIR}\third_party\level_zero\ + # However adding ${ONEDNN_GPU_DIR}\third_party to include dirs will clash with other dependencies like gtest + # Workaround: Use OpenVINO LevelZero target instead + list(APPEND LIB_INCLUDE_DIRS $) + endif() set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU DNNL_DLL DNNL_DLL_EXPORTS diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu index bbba69f0a58439..5d3bb07d57b485 160000 --- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu @@ -1 +1 @@ -Subproject commit bbba69f0a584391d0ab25b548ec76d8c62aa11f5 +Subproject commit 5d3bb07d57b4853d3d9643a11117a4d7c5a380f2 From a3eac100163d7d98d879630cb640a08b75319096 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 27 Nov 2025 15:48:48 +0000 Subject: [PATCH 52/74] Remove unnecessary vector during L0 mem fill --- src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp index 54482661e52705..d86d1fe3db8fc2 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -143,17 +143,16 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vecto auto& _ze_stream = downcast(stream); auto ev = _ze_stream.create_base_event(); auto ev_ze = downcast(ev.get())->get_handle(); - std::vector temp_buffer(_bytes_count, pattern); auto ze_dep_events = get_ze_events(dep_events); OV_ZE_EXPECT(zeCommandListAppendMemoryFill(_ze_stream.get_queue(), _buffer.get(), - temp_buffer.data(), - 1, + &pattern, + sizeof(unsigned char), _bytes_count, ev_ze, ze_dep_events.size(), ze_dep_events.data())); - + // FIXME: when not blocking pattern goes out of scope if (blocking) { ev->wait(); } From c76cdb2a741f6632237203f519ec0057729d67fb Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 28 Nov 2025 12:07:09 +0000 Subject: [PATCH 53/74] Fix memory --- .../intel_gpu/include/intel_gpu/runtime/memory.hpp | 1 - src/plugins/intel_gpu/src/runtime/memory.cpp | 9 --------- 2 files changed, 10 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp index cdb15dbace42d7..9cd00b1c3065e5 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp @@ -224,7 +224,6 @@ struct surfaces_lock { surfaces_lock(const surfaces_lock& other) = delete; surfaces_lock& operator=(const surfaces_lock& other) = delete; - static std::unique_ptr create(engine_types engine_type, std::vector mem, const stream& stream); static bool is_lock_needed(const shared_mem_type& mem_type); }; diff --git a/src/plugins/intel_gpu/src/runtime/memory.cpp b/src/plugins/intel_gpu/src/runtime/memory.cpp index 9718a9b0e35d24..46904030b6ee52 100644 --- a/src/plugins/intel_gpu/src/runtime/memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/memory.cpp @@ -43,15 +43,6 @@ memory::memory(engine* engine, const layout& layout, allocation_type type, std:: : _engine(engine), _layout(layout), _bytes_count(_layout.bytes_count()), m_mem_tracker(mem_tracker), _type(type) { } -std::unique_ptr surfaces_lock::create(engine_types engine_type, std::vector mem, const stream& stream) { - switch (engine_type) { - case engine_types::sycl: - case engine_types::ocl: - return std::unique_ptr(new ocl::ocl_surfaces_lock(mem, stream)); - default: throw std::runtime_error("Unsupported engine type in surfaces_lock::create"); - } -} - bool surfaces_lock::is_lock_needed(const shared_mem_type& mem_type) { return mem_type == shared_mem_type::shared_mem_vasurface || mem_type == shared_mem_type::shared_mem_dxbuffer || From 6473aa0684ad7bf869b264212f3b9c4f1830a47e Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 28 Nov 2025 14:08:53 +0000 Subject: [PATCH 54/74] Set runtime interface for kernel_selector --- src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt index 196f56acd4cc4a..fb82eea70f9d23 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt @@ -71,6 +71,7 @@ if (ENABLE_ONEDNN_FOR_GPU) endif() target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE openvino::runtime::dev) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) target_include_directories(${TARGET_NAME} PRIVATE $) From 93bb04b43d62bd788c47ba62f9eb4bf89710f5cd Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Mon, 1 Dec 2025 17:15:08 +0100 Subject: [PATCH 55/74] Remove onednn ocl include --- .../src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 34c9f2a51f29c1..19fed9fec49402 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -7,7 +7,6 @@ #ifdef ENABLE_ONEDNN_FOR_GPU # include # include -# include # include # include # include From c621131e8a128ab7847b5128eaeb4b402a673176 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Tue, 2 Dec 2025 12:52:44 +0000 Subject: [PATCH 56/74] Fix backend compilation and remote tensor --- src/plugins/intel_gpu/src/graph/CMakeLists.txt | 2 ++ src/plugins/intel_gpu/src/plugin/remote_tensor.cpp | 1 + src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 1 + src/plugins/intel_gpu/thirdparty/CMakeLists.txt | 6 ------ 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/CMakeLists.txt b/src/plugins/intel_gpu/src/graph/CMakeLists.txt index 3cbc144a296238..9abdd2705f4abd 100644 --- a/src/plugins/intel_gpu/src/graph/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/graph/CMakeLists.txt @@ -84,6 +84,8 @@ macro(ov_gpu_add_backend_target) target_include_directories(${ARG_NAME} SYSTEM BEFORE PRIVATE $) add_dependencies(openvino_intel_gpu_${IMPL_TYPE}_obj onednn_gpu_tgt) endif() + # Onednn headers use OCL/L0 headers + ov_gpu_set_runtime_interface_for(openvino_intel_gpu_${IMPL_TYPE}_obj) endmacro() set(CODEGEN_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/common_utils/kernels_db_gen.py") diff --git a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp index b62a95f5f2bd62..9faecbbb6842e4 100644 --- a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp +++ b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp @@ -322,6 +322,7 @@ void RemoteTensorImpl::allocate() { m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem, reset); } else { // Fall back to usm_host and override memory type + GPU_DEBUG_COUT << "[Warning] [GPU] Could not allocate cl_mem, using usm_host allocation instead\n"; m_mem_type = TensorType::BT_USM_HOST_INTERNAL; m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_host, reset); } diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index c7e1498a2e3d30..fcb6004458e940 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -386,6 +386,7 @@ bool does_device_support(int32_t param, const cl::Device& device) { memory_capabilities init_memory_caps(const cl::Device& device, const device_info& info) { std::vector memory_caps; + memory_caps.push_back(allocation_type::cl_mem); if (info.supports_usm) { if (does_device_support(CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL, device)) { memory_caps.push_back(allocation_type::usm_host); diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index 916559e401be4c..d876d52f97baed 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -166,12 +166,6 @@ if(ENABLE_ONEDNN_FOR_GPU) "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/config" "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include" "${ONEDNN_GPU_DIR}/third_party/ngen") - if(GPU_RUNTIME STREQUAL "L0") - # OneDNN LevelZero headers can be found at ${ONEDNN_GPU_DIR}\third_party\level_zero\ - # However adding ${ONEDNN_GPU_DIR}\third_party to include dirs will clash with other dependencies like gtest - # Workaround: Use OpenVINO LevelZero target instead - list(APPEND LIB_INCLUDE_DIRS $) - endif() set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU DNNL_DLL DNNL_DLL_EXPORTS From 8c9e85581cf09bba37011cf9d31f64d97c55ae09 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 7 Jan 2026 14:47:39 +0000 Subject: [PATCH 57/74] Rename counter based events --- .../include/intel_gpu/runtime/device_info.hpp | 2 +- .../intel_gpu/src/runtime/ocl/ocl_device.cpp | 2 +- ...e_cb_event.cpp => ze_counter_based_event.cpp} | 16 ++++++++-------- ...e_cb_event.hpp => ze_counter_based_event.hpp} | 13 ++++++------- ...ry.cpp => ze_counter_based_event_factory.cpp} | 10 +++++----- ...ry.hpp => ze_counter_based_event_factory.hpp} | 4 ++-- .../intel_gpu/src/runtime/ze/ze_device.cpp | 4 ++-- .../intel_gpu/src/runtime/ze/ze_stream.cpp | 6 +++--- 8 files changed, 28 insertions(+), 29 deletions(-) rename src/plugins/intel_gpu/src/runtime/ze/{ze_cb_event.cpp => ze_counter_based_event.cpp} (78%) rename src/plugins/intel_gpu/src/runtime/ze/{ze_cb_event.hpp => ze_counter_based_event.hpp} (73%) rename src/plugins/intel_gpu/src/runtime/ze/{ze_cb_event_factory.cpp => ze_counter_based_event_factory.cpp} (75%) rename src/plugins/intel_gpu/src/runtime/ze/{ze_cb_event_factory.hpp => ze_counter_based_event_factory.hpp} (70%) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index 6b6c0d3caf48f0..94e6ff2605a100 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -122,7 +122,7 @@ struct device_info { bool has_separate_cache; ///< Does the target hardware has separate cache for usm_device and usm_host bool supports_cp_offload; ///< [L0] Does the command queue support copy offload - bool supports_cb_events; ///< [L0] Does the target runtime support counter based events + bool supports_counter_based_events; ///< [L0] Does the target runtime support counter based events std::vector supported_simd_sizes; ///< List of SIMD sizes supported by current device and compiler diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index fcb6004458e940..d935b6ad900e54 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -352,7 +352,7 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex info.compute_queue_group_ordinal = 0; info.device_memory_ordinal = 0; info.supports_cp_offload = false; - info.supports_cb_events = false; + info.supports_counter_based_events = false; #if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT) using namespace dnnl::impl::gpu::intel::jit; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp similarity index 78% rename from src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp rename to src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp index 3746c92fe00bbc..0ca4b0b243d2e0 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ze_cb_event.hpp" +#include "ze_counter_based_event.hpp" #include "ze/ze_common.hpp" #include @@ -12,15 +12,15 @@ using namespace cldnn; using namespace ze; -void ze_cb_event::wait_impl() { +void ze_counter_based_event::wait_impl() { OV_ZE_EXPECT(zeEventHostSynchronize(m_event, default_timeout)); } -void ze_cb_event::set_impl() { +void ze_counter_based_event::set_impl() { // Counter based events start in signaled state and can not be signaled from host } -bool ze_cb_event::is_set_impl() { +bool ze_counter_based_event::is_set_impl() { auto ret = zeEventQueryStatus(m_event); switch (ret) { case ZE_RESULT_SUCCESS: @@ -35,11 +35,11 @@ bool ze_cb_event::is_set_impl() { } } -ze_event_handle_t ze_cb_event::get_handle() const { +ze_event_handle_t ze_counter_based_event::get_handle() const { return m_event; } -std::optional ze_cb_event::query_timestamp() { +std::optional ze_counter_based_event::query_timestamp() { if (!m_factory.is_profiling_enabled()) { return std::nullopt; } @@ -48,7 +48,7 @@ std::optional ze_cb_event::query_timestamp() { return timestamp; } -bool ze_cb_event::get_profiling_info_impl(std::list& info) { +bool ze_counter_based_event::get_profiling_info_impl(std::list& info) { auto opt_timestamp = query_timestamp(); if (!opt_timestamp.has_value()) { return true; @@ -67,6 +67,6 @@ bool ze_cb_event::get_profiling_info_impl(std::list(queue_stamp, *this, event); + auto cb_event = std::make_shared(queue_stamp, *this, event); return cb_event; } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp similarity index 70% rename from src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp rename to src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp index 0c6bceaacbd57b..90e7af9562628a 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_cb_event_factory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp @@ -11,9 +11,9 @@ namespace ze { // Interface for creating l0 counter based events // Should only be used with in-order queue -struct ze_cb_event_factory : public ze_base_event_factory { +struct ze_counter_based_event_factory : public ze_base_event_factory { public: - ze_cb_event_factory(const ze_engine &engine, bool enable_profiling); + ze_counter_based_event_factory(const ze_engine &engine, bool enable_profiling); event::ptr create_event(uint64_t queue_stamp) override; }; } // namespace ze diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp index ef09a71950d266..e6f965d6416929 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -103,7 +103,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0); bool supports_mutable_list = supports_extension(extensions, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, ZE_MUTABLE_COMMAND_LIST_EXP_VERSION_1_0); bool supports_pci_properties = supports_extension(extensions, ZE_PCI_PROPERTIES_EXT_NAME, ZE_PCI_PROPERTIES_EXT_VERSION_1_0); - bool supports_cb_events = supports_extension(extensions, ZEX_COUNTER_BASED_EVENT_EXT_NAME, ZEX_COUNTER_BASED_EVENT_VERSION_1_0); + bool supports_counter_based_events = supports_extension(extensions, ZEX_COUNTER_BASED_EVENT_EXT_NAME, ZEX_COUNTER_BASED_EVENT_VERSION_1_0); bool supports_cp_offload = supports_extension(extensions, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_1_0); bool supports_dp_properties = @@ -208,7 +208,7 @@ device_info init_device_info(ze_driver_handle_t driver, ze_device_handle_t devic info.supports_fp16_denorms = info.supports_fp16 && (device_module_properties.fp16flags & ZE_DEVICE_FP_FLAG_DENORM) != 0; info.supports_cp_offload = supports_cp_offload; - info.supports_cb_events = supports_cb_events; + info.supports_counter_based_events = supports_counter_based_events; info.supports_imad = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_DP4A) != 0; info.supports_immad = supports_dp_properties && (dp_properties.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS) != 0; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index 920c0ac1aa0c83..98bc5358f8a417 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -9,7 +9,7 @@ #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/properties.hpp" -#include "ze_cb_event_factory.hpp" +#include "ze_counter_based_event_factory.hpp" #include "ze_event_factory.hpp" #include "ze_events.hpp" #include "ze_empty_event.hpp" @@ -213,8 +213,8 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) } OV_ZE_EXPECT(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); - if (m_queue_type == QueueTypes::in_order && info.supports_cb_events) { - m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); + if (m_queue_type == QueueTypes::in_order && info.supports_counter_based_events) { + m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); } else { m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); } From d377c4523c6bdbbbf867edff090d33029de8ab8c Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 7 Jan 2026 14:58:44 +0000 Subject: [PATCH 58/74] Adjust event set_profiling_duration --- src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp | 6 +++--- .../src/graph/impls/onednn/primitive_onednn_base.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp index 5451cd47c7daf2..24465c4f2d0569 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp @@ -11,7 +11,6 @@ #include #include #include -#include namespace cldnn { struct user_event; @@ -33,8 +32,9 @@ struct event { _profiling_captured = false; _profiling_info.clear(); } - // Set event profiling data instead of retrieving it from event object - void set_profiling(uint64_t duration_nsec) { + /// @brief Set event profiling data instead of retrieving it from event object + /// @param duration_nsec duration in nanoseconds + void set_profiling_duration(uint64_t duration_nsec) { auto stage = instrumentation::profiling_stage::executing; auto duration = std::chrono::nanoseconds(duration_nsec); auto period = std::make_shared(duration); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h index d223062f2d7f00..33d87ea1c394db 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h @@ -555,7 +555,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl { if (!duration.empty()) { OPENVINO_ASSERT(duration.size() == 1, "[GPU] oneDNN profiling data is expected to have info only for single primitive ", "actual number is ", duration.size()); - event->set_profiling(duration[0]); + event->set_profiling_duration(duration[0]); } } else { From ca7f6cf519260f25bdeafb7a9985e592de39962d Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 7 Jan 2026 16:03:23 +0000 Subject: [PATCH 59/74] Remove unnecessary engine parameter --- .../intel_gpu/src/graph/impls/ocl/custom_primitive.cpp | 4 ++-- src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp | 2 +- src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp | 2 +- .../intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp | 4 ++-- src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp | 4 ++-- .../intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp | 4 ++-- src/plugins/intel_gpu/src/graph/include/primitive_inst.h | 2 +- src/plugins/intel_gpu/src/graph/program.cpp | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp index b83b594949a882..0f583a849fb754 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/custom_primitive.cpp @@ -62,8 +62,8 @@ struct custom_gpu_primitive_impl : typed_primitive_impl { this->can_share_kernels = kernels_cache.get_kernels_reuse(); } - void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids, const engine& e) override { - _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0], e)); + void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids) override { + _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[0])); this->can_share_kernels = kernels_cache.get_kernels_reuse(); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp index 27aa2ce27446bd..a1f5c0f1ac1b63 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp @@ -381,7 +381,7 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co } } -kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id, const engine& e) const { +kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id) const { auto res = _cached_kernels.find(id); OPENVINO_ASSERT(_cached_kernels.end() != res, "[GPU] Kernel " + id + " not found in the cached kernel cache!"); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp index ee65b68be0138c..3b6a5cf78032dd 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp @@ -123,7 +123,7 @@ class kernels_cache { uint32_t prog_id, std::shared_ptr task_executor = nullptr, const std::map& batch_headers = {}); - kernel::ptr get_kernel_from_cached_kernels(std::string id, const engine& e) const; + kernel::ptr get_kernel_from_cached_kernels(std::string id) const; std::vector get_kernels(const kernel_impl_params& params) const; void set_kernels_reuse(bool reuse_kernels) { _reuse_kernels = reuse_kernels; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp index 3da69b2aa48bfc..71e8571467d322 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp @@ -126,12 +126,12 @@ struct multi_stage_primitive : public typed_primitive_impl { this->can_share_kernels = kernels_cache.get_kernels_reuse(); } - void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids, const engine& e) override { + void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids) override { _kernels.clear(); _kernels.reserve(cached_kernel_ids.size()); for (size_t k = 0; k < cached_kernel_ids.size(); ++k) { - _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k], e)); + _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k])); } this->can_share_kernels = kernels_cache.get_kernels_reuse(); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index 9d93413d8016e8..d604d5a8a3467a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -149,12 +149,12 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { this->can_share_kernels = kernels_cache.get_kernels_reuse(); } - void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids, const engine& e) override { + void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids) override { _kernels.clear(); _kernels.reserve(cached_kernel_ids.size()); for (size_t k = 0; k < cached_kernel_ids.size(); ++k) { - _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k], e)); + _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k])); } this->can_share_kernels = kernels_cache.get_kernels_reuse(); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp index 1f616ab80d9b08..f9756be883dc3a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/primitive_ocl_base.hpp @@ -145,10 +145,10 @@ struct PrimitiveImplOCL : public cldnn::primitive_impl { } } - void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector& cached_kernel_ids, const cldnn::engine& e) override { + void init_by_cached_kernels(const cldnn::kernels_cache& kernels_cache, std::vector& cached_kernel_ids) override { OPENVINO_ASSERT(cached_kernel_ids.size() == _order.size()); for (size_t i = 0; i < cached_kernel_ids.size(); ++i) { - _stages[_order[i]]->kernel = kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[i], e); + _stages[_order[i]]->kernel = kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[i]); } this->can_share_kernels = kernels_cache.get_kernels_reuse(); } diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index 249a78ba10d6ef..aeba603982476d 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -79,7 +79,7 @@ struct primitive_impl { virtual bool is_cpu() const { return true; } virtual bool is_onednn() const { return false; } virtual void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) = 0; - virtual void init_by_cached_kernels(const kernels_cache&, std::vector& cached_kernel_ids, const engine& e) {} + virtual void init_by_cached_kernels(const kernels_cache&, std::vector& cached_kernel_ids) {} virtual std::vector get_cached_kernel_ids(const kernels_cache&) { return {}; } virtual std::unique_ptr clone() const = 0; virtual std::vector> get_kernels_source() { return {}; } diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index a9699ac1ad9fd9..f9bf639a645cbc 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -2043,7 +2043,7 @@ void program::load(cldnn::BinaryInputBuffer& ib, std::vector cached_kernel_ids; ib >> cached_kernel_ids; - p_node.selected_impl->init_by_cached_kernels(get_kernels_cache(), cached_kernel_ids, _engine); + p_node.selected_impl->init_by_cached_kernels(get_kernels_cache(), cached_kernel_ids); } } From 674556cf2f28be9655d419a84426749a3dfd1eb4 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 8 Jan 2026 10:15:00 +0000 Subject: [PATCH 60/74] Update L0 OneDNN submodule --- src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu index 5d3bb07d57b485..867c4222ffd8f7 160000 --- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu @@ -1 +1 @@ -Subproject commit 5d3bb07d57b4853d3d9643a11117a4d7c5a380f2 +Subproject commit 867c4222ffd8f7fde75f2e3c9340fbcdca951c54 From 863ea10e74be425aaaa7ff35d5b7e210964ba651 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 8 Jan 2026 12:18:55 +0000 Subject: [PATCH 61/74] Update onednn L0 submodule and fix profiling --- src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 2 +- src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index 98bc5358f8a417..76b3268ef995bb 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -397,7 +397,7 @@ dnnl::stream& ze_stream::get_onednn_stream() { OPENVINO_ASSERT(m_queue_type == QueueTypes::in_order, "[GPU] Can't create onednn stream handle as onednn doesn't support out-of-order queue"); OPENVINO_ASSERT(_engine.get_device_info().vendor_id == INTEL_VENDOR_ID, "[GPU] Can't create onednn stream handle as for non-Intel devices"); if (!_onednn_stream) { - _onednn_stream = std::make_shared(dnnl::l0_interop::make_stream(_engine.get_onednn_engine(), m_command_list)); + _onednn_stream = std::make_shared(dnnl::l0_interop::make_stream(_engine.get_onednn_engine(), m_command_list, m_ev_factory->is_profiling_enabled())); } return *_onednn_stream; diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu index 867c4222ffd8f7..e7663948687f11 160000 --- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu @@ -1 +1 @@ -Subproject commit 867c4222ffd8f7fde75f2e3c9340fbcdca951c54 +Subproject commit e7663948687f1192ad87444fb20e1f5210ccc1a8 From 0cdb6f5415b7ca324a25a2455bca2b9ef7eb8e2d Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 8 Jan 2026 15:07:18 +0000 Subject: [PATCH 62/74] Restore cpu checks in primitive_base --- .../intel_gpu/src/graph/impls/ocl/primitive_base.hpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index d604d5a8a3467a..8ff102848b0dd4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -136,6 +136,9 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { } void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override { + if (is_cpu()) { + return; + } _kernels.clear(); if (!_kernel_data.kernels.empty()) { auto compiled_kernels = kernels_cache.get_kernels(params); @@ -150,6 +153,9 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { } void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids) override { + if (is_cpu()) { + return; + } _kernels.clear(); _kernels.reserve(cached_kernel_ids.size()); @@ -191,7 +197,7 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { } void set_arguments_impl(typed_primitive_inst& instance) override { - if (instance.can_be_optimized()) { + if (instance.can_be_optimized() || is_cpu()) { return; } @@ -295,6 +301,8 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { } void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override { + if (is_cpu()) + return; OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed."); auto& kernel_vec = kernels.begin()->second; _kernels.clear(); From 86c81b442bc1a7d9b2b0e4675d84d1ca29a22b87 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 8 Jan 2026 16:03:43 +0000 Subject: [PATCH 63/74] Reuse check_allocatable for ocl and l0 --- .../include/intel_gpu/runtime/engine.hpp | 2 +- src/plugins/intel_gpu/src/runtime/engine.cpp | 45 +++++++++++++++++++ .../intel_gpu/src/runtime/ocl/ocl_engine.cpp | 45 ------------------- .../intel_gpu/src/runtime/ocl/ocl_engine.hpp | 1 - .../intel_gpu/src/runtime/ze/ze_engine.cpp | 44 ------------------ .../intel_gpu/src/runtime/ze/ze_engine.hpp | 1 - 6 files changed, 46 insertions(+), 92 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp index 3e61a6e428b4cd..a307eedff4a35b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp @@ -84,7 +84,7 @@ class engine { /// Checks whether two memory objects represents the same physical memory virtual bool is_the_same_buffer(const memory& mem1, const memory& mem2) = 0; - virtual bool check_allocatable(const layout& layout, allocation_type type) = 0; + virtual bool check_allocatable(const layout& layout, allocation_type type); /// Returns basic allocation type which will be used as a fallback when allocation type is not specified or device doesn't support some features. virtual allocation_type get_default_allocation_type() const = 0; diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp index f35d4d577230f4..a4165e1f50ef45 100644 --- a/src/plugins/intel_gpu/src/runtime/engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/engine.cpp @@ -294,4 +294,49 @@ std::shared_ptr engine::create(engine_types engine_type, runtime_ return engine::create(engine_type, runtime_type, device); } +bool engine::check_allocatable(const layout& layout, allocation_type type) { + OPENVINO_ASSERT(supports_allocation(type), "[GPU] Unsupported allocation type: ", type); + + if (!get_enable_large_allocations()) { + bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size); + + // When dynamic shape upper bound makes bigger buffer, then return false. + if (exceed_allocatable_mem_size && layout.is_dynamic()) { + OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); + return false; + } + + OPENVINO_ASSERT(!exceed_allocatable_mem_size, + "[GPU] Exceeded max size of memory object allocation: ", + "requested ", layout.bytes_count(), " bytes, " + "but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.", + "Please try to reduce batch size or use lower precision."); + } + + auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host); + auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size()); + + // When dynamic shape upper bound makes bigger buffer, then return false. + if (exceed_available_mem_size && layout.is_dynamic()) { + OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); + return false; + } + +#ifdef __unix__ + // Prevent from being killed by Ooo Killer of Linux + OPENVINO_ASSERT(!exceed_available_mem_size, + "[GPU] Exceeded max size of memory allocation: ", + "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ", + "but available memory size is ", get_max_memory_size(), " bytes"); +#else + if (exceed_available_mem_size) { + GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : " + << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl; + GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl; + } +#endif + + return true; +} + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp index b3a5c2266f165e..6ea05d592a5d7c 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp @@ -101,51 +101,6 @@ allocation_type ocl_engine::detect_usm_allocation_type(const void* memory) const : allocation_type::unknown; } -bool ocl_engine::check_allocatable(const layout& layout, allocation_type type) { - OPENVINO_ASSERT(supports_allocation(type) || type == allocation_type::cl_mem, "[GPU] Unsupported allocation type: ", type); - - if (!get_enable_large_allocations()) { - bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size); - - // When dynamic shape upper bound makes bigger buffer, then return false. - if (exceed_allocatable_mem_size && layout.is_dynamic()) { - OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); - return false; - } - - OPENVINO_ASSERT(!exceed_allocatable_mem_size, - "[GPU] Exceeded max size of memory object allocation: ", - "requested ", layout.bytes_count(), " bytes, " - "but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.", - "Please try to reduce batch size or use lower precision."); - } - - auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host); - auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size()); - - // When dynamic shape upper bound makes bigger buffer, then return false. - if (exceed_available_mem_size && layout.is_dynamic()) { - OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); - return false; - } - -#ifdef __unix__ - // Prevent from being killed by Ooo Killer of Linux - OPENVINO_ASSERT(!exceed_available_mem_size, - "[GPU] Exceeded max size of memory allocation: ", - "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ", - "but available memory size is ", get_max_memory_size(), " bytes"); -#else - if (exceed_available_mem_size) { - GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : " - << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl; - GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl; - } -#endif - - return true; -} - memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) { OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout"); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp index 84908c7addc969..e6f1b9010e2d51 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp @@ -29,7 +29,6 @@ class ocl_engine : public engine { memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) override; memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override; bool is_the_same_buffer(const memory& mem1, const memory& mem2) override; - bool check_allocatable(const layout& layout, allocation_type type) override; void* get_user_context() const override; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index 6b64c617560ce1..6e3b879091fad6 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -74,50 +74,6 @@ allocation_type ze_engine::detect_usm_allocation_type(const void* memory) const return ze::gpu_usm::detect_allocation_type(this, memory); } -bool ze_engine::check_allocatable(const layout& layout, allocation_type type) { - OPENVINO_ASSERT(supports_allocation(type), "[GPU] Unsupported allocation type: ", type); - - bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size); - - // When dynamic shape upper bound makes bigger buffer, then return false. - if (exceed_allocatable_mem_size && layout.is_dynamic()) { - OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); - return false; - } - - OPENVINO_ASSERT(!exceed_allocatable_mem_size, - "[GPU] Exceeded max size of memory object allocation: ", - "requested ", layout.bytes_count(), " bytes, " - "but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.", - "Please try to reduce batch size or use lower precision."); - - auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host); - auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size()); - - // When dynamic shape upper bound makes bigger buffer, then return false. - if (exceed_available_mem_size && layout.is_dynamic()) { - OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); - return false; - } - -#ifdef __unix__ - // Prevent from being killed by Ooo Killer of Linux - OPENVINO_ASSERT(!exceed_available_mem_size, - "[GPU] Exceeded max size of memory allocation: ", - "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ", - "but available memory size is ", get_max_memory_size(), " bytes"); -#else - if (exceed_available_mem_size) { - GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : " - << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl; - GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl; - return false; - } -#endif - - return true; -} - memory::ptr ze_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) { OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout"); diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp index 586b830a2de93e..3c2cd4ca041f6c 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp @@ -26,7 +26,6 @@ class ze_engine : public engine { memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) override; memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override; bool is_the_same_buffer(const memory& mem1, const memory& mem2) override; - bool check_allocatable(const layout& layout, allocation_type type) override; void* get_user_context() const override; From 8621d68a57dba5490b1ab2498f4c7b1e858677b7 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Thu, 8 Jan 2026 16:34:48 +0000 Subject: [PATCH 64/74] Use asserts instead of throw --- .../intel_gpu/src/runtime/ze/ze_engine.cpp | 21 ++++++++----------- .../intel_gpu/src/runtime/ze/ze_event.hpp | 4 +--- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index 6e3b879091fad6..25ff4b9972719e 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -132,18 +132,15 @@ memory_ptr ze_engine::create_subbuffer(const memory& memory, const layout& new_l if (new_layout.format.is_image_2d()) { OPENVINO_NOT_IMPLEMENTED; } - if (memory_capabilities::is_usm_type(memory.get_allocation_type())) { - auto& new_buf = reinterpret_cast(memory); - auto ptr = new_buf.get_buffer().get(); - auto sub_buffer = ze::UsmMemory(get_context(), get_device(), ptr, byte_offset); - return std::make_shared(this, - new_layout, - sub_buffer, - memory.get_allocation_type(), - memory.get_mem_tracker()); - } else { - OPENVINO_THROW("[GPU] Trying to create subbuffer for non usm memory"); - } + OPENVINO_ASSERT(!memory_capabilities::is_usm_type(memory.get_allocation_type()), "[GPU] Trying to create subbuffer for non usm memory"); + auto& new_buf = reinterpret_cast(memory); + auto ptr = new_buf.get_buffer().get(); + auto sub_buffer = ze::UsmMemory(get_context(), get_device(), ptr, byte_offset); + return std::make_shared(this, + new_layout, + sub_buffer, + memory.get_allocation_type(), + memory.get_mem_tracker()); } bool ze_engine::is_the_same_buffer(const memory& mem1, const memory& mem2) { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp index 4c7e02ed32ee08..676208490f92d1 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp @@ -20,9 +20,7 @@ struct ze_event : public ze_base_event { , m_factory(factory) , m_event(ev) { // Ensure event handle is not null - if (ev == nullptr) { - OPENVINO_THROW("[GPU] Trying to create event with null handle"); - } + OPENVINO_ASSERT(ev != nullptr, "[GPU] Trying to create event with null handle"); } ze_event(const ze_event &) = delete; ze_event& operator=(const ze_event &) = delete; From 8d0b5bf2464b023c2636e298c623e9df01d66d56 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 9 Jan 2026 13:47:09 +0000 Subject: [PATCH 65/74] Fix get_user_context --- src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index 25ff4b9972719e..fdeaef71f87fb1 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -162,7 +162,7 @@ std::shared_ptr ze_engine::create_kernel_builder() const { void* ze_engine::get_user_context() const { auto& casted = downcast(*_device); - return static_cast(casted.get_driver()); + return static_cast(casted.get_context()); } stream::ptr ze_engine::create_stream(const ExecutionConfig& config) const { From d64e6b67008e9678ed2ed835f03ddf9aee07cd3a Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 9 Jan 2026 14:39:57 +0000 Subject: [PATCH 66/74] Move common code to engine class --- .../intel_gpu/include/intel_gpu/runtime/engine.hpp | 10 ++++++++-- src/plugins/intel_gpu/src/runtime/engine.cpp | 11 +++++++++++ src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp | 9 --------- src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp | 9 --------- src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp | 9 --------- src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp | 11 ----------- 6 files changed, 19 insertions(+), 40 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp index a307eedff4a35b..fa57dd19af6f59 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp @@ -142,7 +142,7 @@ class engine { virtual stream_ptr create_stream(const ExecutionConfig& config, void *handle) const = 0; /// Returns service stream which can be used during program build and optimizations - virtual stream& get_service_stream() const = 0; + virtual stream& get_service_stream() const; virtual std::shared_ptr create_kernel_builder() const = 0; @@ -157,7 +157,7 @@ class engine { virtual void create_onednn_engine(const ExecutionConfig& config) = 0; /// Returns onednn engine object which shares device and context with current engine - virtual dnnl::engine& get_onednn_engine() const = 0; + virtual dnnl::engine& get_onednn_engine() const; #endif /// Factory method which creates engine object with impl configured by @p engine_type @@ -177,6 +177,12 @@ class engine { engine(const device::ptr device); const device::ptr _device; bool enable_large_allocations = false; + std::unique_ptr _service_stream; + +#ifdef ENABLE_ONEDNN_FOR_GPU + std::mutex onednn_mutex; + std::shared_ptr _onednn_engine; +#endif std::array, static_cast(allocation_type::max_value)> _memory_usage_data{}; std::array, static_cast(allocation_type::max_value)> _peak_memory_usage_data{}; diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp index a4165e1f50ef45..959fb414a51c68 100644 --- a/src/plugins/intel_gpu/src/runtime/engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/engine.cpp @@ -339,4 +339,15 @@ bool engine::check_allocatable(const layout& layout, allocation_type type) { return true; } +#ifdef ENABLE_ONEDNN_FOR_GPU +dnnl::engine& engine::get_onednn_engine() const { + OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. Please check that create_onednn_engine() was called"); + return *_onednn_engine; +} +#endif + +stream& engine::get_service_stream() const { + return *_service_stream; +} + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp index 6ea05d592a5d7c..6e45fdd788a109 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp @@ -71,11 +71,6 @@ void ocl_engine::create_onednn_engine(const ExecutionConfig& config) { #endif } } - -dnnl::engine& ocl_engine::get_onednn_engine() const { - OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. Please check that create_onednn_engine() was called"); - return *_onednn_engine; -} #endif const cl::Context& ocl_engine::get_cl_context() const { @@ -281,10 +276,6 @@ stream::ptr ocl_engine::create_stream(const ExecutionConfig& config, void* handl return std::make_shared(*this, config, handle); } -stream& ocl_engine::get_service_stream() const { - return *_service_stream; -} - std::shared_ptr ocl_engine::create(const device::ptr device, runtime_types runtime_type) { return std::make_shared(device, runtime_type); } diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp index e6f1b9010e2d51..e9d152e20fd88c 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp @@ -43,26 +43,17 @@ class ocl_engine : public engine { stream_ptr create_stream(const ExecutionConfig& config) const override; stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override; - stream& get_service_stream() const override; std::shared_ptr create_kernel_builder() const override; #ifdef ENABLE_ONEDNN_FOR_GPU void create_onednn_engine(const ExecutionConfig& config) override; - // Returns onednn engine object which shares device and context with current engine - dnnl::engine& get_onednn_engine() const override; #endif static std::shared_ptr create(const device::ptr device, runtime_types runtime_type); private: std::string _extensions; - std::unique_ptr _service_stream; - -#ifdef ENABLE_ONEDNN_FOR_GPU - std::mutex onednn_mutex; - std::shared_ptr _onednn_engine; -#endif }; } // namespace ocl diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index fdeaef71f87fb1..d00750696e2c06 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -45,11 +45,6 @@ void ze_engine::create_onednn_engine(const ExecutionConfig& config) { _onednn_engine = std::make_shared(dnnl::l0_interop::make_engine(casted->get_driver(), casted->get_device(), casted->get_context())); } } - -dnnl::engine& ze_engine::get_onednn_engine() const { - OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. Please check that create_onednn_engine() was called"); - return *_onednn_engine; -} #endif const ze_driver_handle_t ze_engine::get_driver() const { @@ -173,10 +168,6 @@ stream::ptr ze_engine::create_stream(const ExecutionConfig& config, void* handle OPENVINO_NOT_IMPLEMENTED; } -stream& ze_engine::get_service_stream() const { - return *_service_stream; -} - std::shared_ptr ze_engine::create(const device::ptr device, runtime_types runtime_type) { return std::make_shared(device, runtime_type); } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp index 3c2cd4ca041f6c..24d0736b3fbbfc 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp @@ -38,25 +38,14 @@ class ze_engine : public engine { stream_ptr create_stream(const ExecutionConfig& config) const override; stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override; - stream& get_service_stream() const override; std::shared_ptr create_kernel_builder() const override; #ifdef ENABLE_ONEDNN_FOR_GPU void create_onednn_engine(const ExecutionConfig& config) override; - // Returns onednn engine object which shares device and context with current engine - dnnl::engine& get_onednn_engine() const override; #endif static std::shared_ptr create(const device::ptr device, runtime_types runtime_type); - -private: - std::unique_ptr _service_stream; - -#ifdef ENABLE_ONEDNN_FOR_GPU - std::mutex onednn_mutex; - std::shared_ptr _onednn_engine; -#endif }; } // namespace ze From 5ccd52c87df872021c14638b74f42576e92b6764 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 9 Jan 2026 15:16:35 +0000 Subject: [PATCH 67/74] Rename L0 timeout --- src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp | 2 +- src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp | 2 +- src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp | 2 +- src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp | 2 +- src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp | 2 +- src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp index 4ff1100a7dd9e9..d7456d74c7459f 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp @@ -32,7 +32,7 @@ namespace cldnn { namespace ze { -static constexpr uint64_t default_timeout = std::numeric_limits::max(); +static constexpr uint64_t endless_wait = std::numeric_limits::max(); static constexpr ze_module_format_t ze_module_format_oclc = (ze_module_format_t) 3U; } // namespace ze diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp index 0ca4b0b243d2e0..03e342d0f571ad 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp @@ -13,7 +13,7 @@ using namespace cldnn; using namespace ze; void ze_counter_based_event::wait_impl() { - OV_ZE_EXPECT(zeEventHostSynchronize(m_event, default_timeout)); + OV_ZE_EXPECT(zeEventHostSynchronize(m_event, endless_wait)); } void ze_counter_based_event::set_impl() { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp index 7d99b06aa619b6..6fb52b9cac2d30 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp @@ -18,7 +18,7 @@ void ze_event::reset() { } void ze_event::wait_impl() { - OV_ZE_EXPECT(zeEventHostSynchronize(m_event, default_timeout)); + OV_ZE_EXPECT(zeEventHostSynchronize(m_event, endless_wait)); } void ze_event::set_impl() { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp index e54348e7b12c25..852fecf893efa8 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp @@ -14,7 +14,7 @@ using namespace ze; void ze_events::wait_impl() { if (m_last_event) { - OV_ZE_EXPECT(zeEventHostSynchronize(m_last_event, default_timeout)); + OV_ZE_EXPECT(zeEventHostSynchronize(m_last_event, endless_wait)); } } diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp index d86d1fe3db8fc2..4a7bdcface6c4b 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -118,7 +118,7 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::re nullptr, 0, nullptr)); - OV_ZE_EXPECT(zeCommandListHostSynchronize(_ze_stream.get_queue(), default_timeout)); + OV_ZE_EXPECT(zeCommandListHostSynchronize(_ze_stream.get_queue(), endless_wait)); _mapped_ptr = _host_buffer.get(); } else { _mapped_ptr = _buffer.get(); diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index 76b3268ef995bb..ab0e9991406ce1 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -344,7 +344,7 @@ void ze_stream::flush() const { } void ze_stream::finish() const { - OV_ZE_EXPECT(zeCommandListHostSynchronize(m_command_list, default_timeout)); + OV_ZE_EXPECT(zeCommandListHostSynchronize(m_command_list, endless_wait)); } void ze_stream::wait_for_events(const std::vector& events) { From 786fbb6e237e9f6f15bf1ddeca56fdd8accdadab Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Mon, 12 Jan 2026 18:18:44 +0000 Subject: [PATCH 68/74] Fix test build errors --- .../include/intel_gpu/runtime/kernel.hpp | 4 ++++ .../intel_gpu/src/runtime/ocl/ocl_kernel.hpp | 7 +++++++ .../intel_gpu/src/runtime/ze/ze_kernel.hpp | 8 ++++++++ .../intel_gpu/tests/functional/CMakeLists.txt | 2 -- .../ze_remote_tensor_tests.cpp | 2 -- .../functional/subgraph_tests/serialize.cpp | 2 +- .../functional/subgraph_tests/vlsdpa.cpp | 1 - .../intel_gpu/tests/unit/CMakeLists.txt | 19 ++++++++++++++++++- .../tests/unit/passes/kernels_cache_test.cpp | 15 ++------------- 9 files changed, 40 insertions(+), 20 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp index 54bece374b99f2..be273cd1d50aa7 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp @@ -16,6 +16,10 @@ class kernel { public: using ptr = std::shared_ptr; virtual std::shared_ptr clone(bool reuse_kernel_handle = false) const = 0; + /// @brief Check if objects share the same handle to the kernel instance + /// @param other kernel object for comparison + /// @return true if underlying kernel handles are the same, false otherwise + virtual bool is_same(const kernel &other) const = 0; virtual ~kernel() = default; virtual std::string get_id() const = 0; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp index d46fda1ee02f15..446fe51a7193c9 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp @@ -33,6 +33,13 @@ class ocl_kernel : public kernel { return std::make_shared(get_handle().clone(), _kernel_id); } + virtual bool is_same(const kernel &other) const { + auto other_ptr = dynamic_cast(&other); + if (other_ptr == nullptr) { + return false; + } + return get_handle().get() == other_ptr->get_handle().get(); + } std::vector get_binary() const override; std::string get_build_log() const override; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp index 3a726e24930d24..9d57a571907a89 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp @@ -67,6 +67,14 @@ class ze_kernel : public kernel { } } + virtual bool is_same(const kernel &other) const override { + auto other_ptr = dynamic_cast(&other); + if (other_ptr == nullptr) { + return false; + } + return get_kernel_handle() == other_ptr->get_kernel_handle(); + } + std::vector get_binary() const override { size_t binary_size = 0; ze_module_handle_t module_handle = get_module_handle(); diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt index 9376bce54ce98e..3dd2da44dc6d41 100644 --- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt @@ -32,8 +32,6 @@ ov_add_test_target( LINK_LIBRARIES openvino::reference funcSharedTests - OpenCL::NewHeaders # should come before OpenCL::OpenCL - OpenCL::OpenCL LABELS OV GPU ) diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp index f1a0b17b85e565..b593552272d691 100644 --- a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp +++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp @@ -7,13 +7,11 @@ #include "openvino/runtime/intel_gpu/remote_properties.hpp" #include "openvino/runtime/remote_tensor.hpp" -#include "remote_tensor_tests/helpers.hpp" #include "shared_test_classes/base/ov_behavior_test_utils.hpp" TEST(ZeRemoteContext, smoke_CorrectContextType) { auto core = ov::Core(); auto remote_context = core.get_default_context(ov::test::utils::DEVICE_GPU); - ASSERT_FALSE(remote_context.is()); ASSERT_EQ(remote_context.get_params().at(ov::intel_gpu::context_type.name()), ov::intel_gpu::ContextType::ZE); } diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp index 09b5052df9dc07..3ca4052b22d01d 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp @@ -46,7 +46,7 @@ class LSTMSequenceTest : virtual public SerializeBaseTest { class GRUSequenceTest : virtual public SerializeBaseTest { public: - void SetUp() { + void SetUp() override { std::string cacheDirName = "cache_gru"; auto init_shape = ov::PartialShape({1, 30, 512}); auto batch_size = static_cast(init_shape[0].get_length()); diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp index 38be79bcb5391e..f2fae883a0f718 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp @@ -11,7 +11,6 @@ #include "shared_test_classes/base/ov_subgraph.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/engine_configuration.hpp" -#include "openvino/runtime/intel_gpu/ocl/ocl.hpp" #include "openvino/opsets/opset13.hpp" #include "ov_ops/vl_sdpa.hpp" diff --git a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt index fd04fa1d989d0d..8b56fedf4d4a06 100644 --- a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt @@ -34,6 +34,24 @@ file(GLOB_RECURSE SOURCES_MAIN "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/simple_math.cpp" ) +# Those tests have dependency on OpenCL runtime +# Need to be excluded from the build with a different runtime +file(GLOB_RECURSE SOURCES_WITH_OCL_RT + "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/device_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/engine_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/events_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/network_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/usm_memory_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/test_cases/convert_color_gpu_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/test_cases/cl_mem_input_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/test_cases/mem_perf_test.cpp" + ) +if (NOT GPU_RT_TYPE STREQUAL "OCL") + foreach (SOURCE_FILE IN LISTS SOURCES_WITH_OCL_RT) + list (REMOVE_ITEM SOURCES_MAIN ${SOURCE_FILE}) + endforeach() +endif() + if (NOT ENABLE_ONEDNN_FOR_GPU) set(EXCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/onednn/") foreach (SOURCE_FILE IN LISTS SOURCES_MAIN) @@ -70,7 +88,6 @@ endif() set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) target_link_libraries(${TARGET_NAME} PRIVATE openvino_intel_gpu_graph - OpenCL::OpenCL gtest gtest_main gflags diff --git a/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp index 30225132c35488..871038a6c17f31 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp @@ -4,7 +4,6 @@ #include "test_utils.h" -#include "runtime/ocl/ocl_kernel.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/graph/program.hpp" #include "intel_gpu/graph/network.hpp" @@ -181,12 +180,7 @@ TEST(kernels_cache, reuse_kernels_property) { auto conv1_kern = cache.get_cached_kernel_id(conv1_kernels[idx]); auto conv2_kern = cache.get_cached_kernel_id(conv2_kernels[idx]); ASSERT_EQ(conv1_kern, conv2_kern); - - auto conv1_ocl_kernel = std::dynamic_pointer_cast(conv1_kernels[idx]); - auto conv2_ocl_kernel = std::dynamic_pointer_cast(conv2_kernels[idx]); - if (conv1_ocl_kernel && conv2_ocl_kernel) { - ASSERT_EQ(conv1_ocl_kernel->get_handle().get(), conv2_ocl_kernel->get_handle().get()); - } + ASSERT_TRUE(conv1_kernels[idx]->is_same(*conv2_kernels[idx].get())); } auto& concat1_node = prog->get_node("concat1"); @@ -200,11 +194,6 @@ TEST(kernels_cache, reuse_kernels_property) { auto concat1_kern = cache.get_cached_kernel_id(concat1_kernels[idx]); auto concat2_kern = cache.get_cached_kernel_id(concat2_kernels[idx]); ASSERT_EQ(concat1_kern, concat2_kern); - - auto concat1_ocl_kernel = std::dynamic_pointer_cast(concat1_kernels[idx]); - auto concat2_ocl_kernel = std::dynamic_pointer_cast(concat2_kernels[idx]); - if (concat1_ocl_kernel && concat2_ocl_kernel) { - ASSERT_EQ(concat1_ocl_kernel->get_handle().get(), concat2_ocl_kernel->get_handle().get()); - } + ASSERT_TRUE(concat1_kernels[idx]->is_same(*concat2_kernels[idx].get())); } } From 46905789b039e9d64a53f2960fa29591d589dffa Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Tue, 13 Jan 2026 11:06:33 +0000 Subject: [PATCH 69/74] Update L0 OneDNN submodule --- src/plugins/intel_gpu/thirdparty/l0_onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu index e7663948687f11..d28c5b4d0dd906 160000 --- a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu @@ -1 +1 @@ -Subproject commit e7663948687f1192ad87444fb20e1f5210ccc1a8 +Subproject commit d28c5b4d0dd90669cd63ff325871d7c87e4a5cfb From 206a580cc908a8f99013558bdc31c5da9a93f011 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Tue, 13 Jan 2026 16:00:19 +0000 Subject: [PATCH 70/74] Add mutex to l0 event factory --- .../src/runtime/ze/ze_counter_based_event_factory.cpp | 2 ++ .../src/runtime/ze/ze_counter_based_event_factory.hpp | 4 ++++ src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp | 2 ++ src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp | 3 +++ 4 files changed, 11 insertions(+) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp index 9e50d3bcc762b9..09ad6800869af5 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp @@ -27,6 +27,8 @@ ze_counter_based_event_factory::ze_counter_based_event_factory(const ze_engine & } event::ptr ze_counter_based_event_factory::create_event(uint64_t queue_stamp) { + std::lock_guard lock(_mutex); + ze_event_handle_t event; auto desc = defaultIntelCounterBasedEventDesc; if (is_profiling_enabled()) { diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp index 90e7af9562628a..6a1c93728d93dc 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp @@ -6,6 +6,8 @@ #include "ze_base_event_factory.hpp" +#include + namespace cldnn { namespace ze { @@ -15,6 +17,8 @@ struct ze_counter_based_event_factory : public ze_base_event_factory { public: ze_counter_based_event_factory(const ze_engine &engine, bool enable_profiling); event::ptr create_event(uint64_t queue_stamp) override; +protected: + std::mutex _mutex; }; } // namespace ze } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp index 797b8ef9679651..28bfd4a5b980d6 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp @@ -18,6 +18,8 @@ ze_event_factory::ze_event_factory(const ze_engine &engine, bool enable_profilin , m_num_used(0) { } event::ptr ze_event_factory::create_event(uint64_t queue_stamp) { + std::lock_guard lock(_mutex); + if (m_num_used >= m_capacity || !m_current_pool) { m_num_used = 0; ze_event_pool_flags_t flags = is_profiling_enabled() ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp index 9ed0244446a6ee..d54aff8ecce344 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp @@ -7,6 +7,8 @@ #include "ze_base_event_factory.hpp" #include "ze_event_pool.hpp" +#include "mutex" + namespace cldnn { namespace ze { @@ -16,6 +18,7 @@ struct ze_event_factory : public ze_base_event_factory { ze_event_factory(const ze_engine &engine, bool enable_profiling, uint32_t capacity = 255); event::ptr create_event(uint64_t queue_stamp) override; protected: + std::mutex _mutex; std::shared_ptr m_current_pool; const uint32_t m_capacity; uint32_t m_num_used; From d71ae1251b54efa752234024b7431168d0da4c25 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Tue, 13 Jan 2026 16:09:58 +0000 Subject: [PATCH 71/74] Adjust ze_events naming --- .../intel_gpu/src/runtime/ze/ze_events.cpp | 16 ++++++------- .../intel_gpu/src/runtime/ze/ze_events.hpp | 24 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp index 852fecf893efa8..f71741c19f6c6c 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp @@ -13,8 +13,8 @@ using namespace cldnn; using namespace ze; void ze_events::wait_impl() { - if (m_last_event) { - OV_ZE_EXPECT(zeEventHostSynchronize(m_last_event, endless_wait)); + if (_last_ze_event) { + OV_ZE_EXPECT(zeEventHostSynchronize(_last_ze_event, endless_wait)); } } @@ -24,11 +24,11 @@ void ze_events::set_impl() { } bool ze_events::is_set_impl() { - if (!m_last_event) { + if (!_last_ze_event) { return true; } - auto ret = zeEventQueryStatus(m_last_event); + auto ret = zeEventQueryStatus(_last_ze_event); switch (ret) { case ZE_RESULT_SUCCESS: return true; @@ -89,10 +89,10 @@ bool ze_events::get_profiling_info_impl(std::list& all_timestamps) { std::chrono::nanoseconds total_time{0}; @@ -130,8 +130,8 @@ bool ze_events::get_profiling_info_impl(std::list(m_events[i].get()); + for (size_t i = 0; i < _events.size(); i++) { + auto be = downcast(_events[i].get()); auto opt_timestamp = be->query_timestamp(); if (!opt_timestamp.has_value()) { continue; diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp index 9fadd2c6f1a06b..b38a38303e8b59 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp @@ -16,17 +16,17 @@ struct ze_events : public ze_base_event { public: ze_events(std::vector const& ev, const ze_engine &engine) : ze_base_event(0) - , m_engine(engine) { + , _engine(engine) { process_events(ev); } void reset() override { event::reset(); - m_events.clear(); + _events.clear(); } std::optional query_timestamp() override { return std::nullopt; } - ze_event_handle_t get_handle() const override { return m_last_event; } + ze_event_handle_t get_handle() const override { return _last_ze_event; } bool get_profiling_info_impl(std::list& info) override; protected: @@ -38,32 +38,32 @@ struct ze_events : public ze_base_event { for (size_t i = 0; i < ev.size(); i++) { auto multiple_events = dynamic_cast(ev[i].get()); if (multiple_events) { - for (size_t j = 0; j < multiple_events->m_events.size(); j++) { - if (auto base_ev = dynamic_cast(multiple_events->m_events[j].get())) { + for (size_t j = 0; j < multiple_events->_events.size(); j++) { + if (auto base_ev = dynamic_cast(multiple_events->_events[j].get())) { auto current_ev_queue_stamp = base_ev->get_queue_stamp(); if ((m_queue_stamp == 0) || (current_ev_queue_stamp > m_queue_stamp)) { m_queue_stamp = current_ev_queue_stamp; - m_last_event = base_ev->get_handle(); + _last_ze_event = base_ev->get_handle(); } } - m_events.push_back(multiple_events->m_events[j]); + _events.push_back(multiple_events->_events[j]); } } else { if (auto base_ev = dynamic_cast(ev[i].get())) { auto current_ev_queue_stamp = base_ev->get_queue_stamp(); if ((m_queue_stamp == 0) || (current_ev_queue_stamp > m_queue_stamp)) { m_queue_stamp = current_ev_queue_stamp; - m_last_event = base_ev->get_handle(); + _last_ze_event = base_ev->get_handle(); } } - m_events.push_back(ev[i]); + _events.push_back(ev[i]); } } } - ze_event_handle_t m_last_event = nullptr; - std::vector m_events; - const ze_engine &m_engine; + ze_event_handle_t _last_ze_event = nullptr; + std::vector _events; + const ze_engine &_engine; }; } // namespace ze From 0a4884601a8e9a7730e1de6cb01bf7b14f427547 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 14 Jan 2026 12:45:43 +0000 Subject: [PATCH 72/74] Remove unnecessary comment --- src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp index 4a7bdcface6c4b..e3d0e924fa2589 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -152,7 +152,6 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vecto ev_ze, ze_dep_events.size(), ze_dep_events.data())); - // FIXME: when not blocking pattern goes out of scope if (blocking) { ev->wait(); } From ae5afd8612d9bee7bfbcfe17c2ecfe112355d934 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Wed, 14 Jan 2026 15:17:17 +0000 Subject: [PATCH 73/74] Improve engine and runtime type selection --- .../intel_gpu/plugin/remote_context.hpp | 20 ---------- .../intel_gpu/runtime/device_query.hpp | 13 +++++++ src/plugins/intel_gpu/src/plugin/plugin.cpp | 3 +- .../intel_gpu/src/plugin/remote_context.cpp | 9 ++--- .../intel_gpu/src/runtime/device_query.cpp | 38 +++++++++++++++++++ .../intel_gpu/src/runtime/ze/ze_stream.cpp | 12 ++++-- .../tests/unit/test_utils/test_utils.cpp | 7 +--- 7 files changed, 65 insertions(+), 37 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp index 3606e95e5d9521..e210d332cc6296 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp @@ -23,26 +23,6 @@ namespace ov::intel_gpu { -inline std::pair get_device_query_params() { - #ifdef OV_GPU_WITH_ZE_RT - auto runtime_type = cldnn::runtime_types::ze; - #ifdef OV_GPU_WITH_SYCL - auto engine_type = cldnn::engine_types::sycl; - #else - auto engine_type = cldnn::engine_types::ze; - #endif - #else - auto runtime_type = cldnn::runtime_types::ocl; - #ifdef OV_GPU_WITH_SYCL - auto engine_type = cldnn::engine_types::sycl; - #else - auto engine_type = cldnn::engine_types::ocl; - #endif - #endif - - return {engine_type, runtime_type}; -} - class RemoteContextImpl : public ov::IRemoteContext { public: using Ptr = std::shared_ptr; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp index ecb82795c0d920..d5291f50486152 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp @@ -17,6 +17,12 @@ namespace cldnn { struct device_query { public: static int device_id; + /// @brief Get default engine type + static engine_types get_default_engine_type(); + + /// @brief Get default runtime type + static runtime_types get_default_runtime_type(); + explicit device_query(engine_types engine_type, runtime_types runtime_type, void* user_context = nullptr, @@ -25,6 +31,13 @@ struct device_query { int target_tile_id = -1, bool initialize_devices = false); + /// @brief Create device query with default values for engine type and runtime type + explicit device_query(void* user_context = nullptr, + void* user_device = nullptr, + int ctx_device_id = 0, + int target_tile_id = -1, + bool initialize_devices = false); + std::map get_available_devices() const { return _available_devices; } diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 34ac490f3b2a09..e8c18242695624 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -217,8 +217,7 @@ Plugin::Plugin() { set_device_name("GPU"); register_primitives(); - auto rt_params = get_device_query_params(); - cldnn::device_query device_query(rt_params.first, rt_params.second); + cldnn::device_query device_query; m_device_map = device_query.get_available_devices(); // Set default configs for each device diff --git a/src/plugins/intel_gpu/src/plugin/remote_context.cpp b/src/plugins/intel_gpu/src/plugin/remote_context.cpp index ac68496feb2d97..63c8bd043be974 100644 --- a/src/plugins/intel_gpu/src/plugin/remote_context.cpp +++ b/src/plugins/intel_gpu/src/plugin/remote_context.cpp @@ -65,11 +65,9 @@ RemoteContextImpl::RemoteContextImpl(const std::mapget_info().dev_name << ")" << std::endl; - auto rt_params = get_device_query_params(); - m_device->initialize(); // Initialize associated device before use - m_engine = cldnn::engine::create(rt_params.first, rt_params.second, m_device); + m_engine = cldnn::engine::create( + cldnn::device_query::get_default_engine_type(), cldnn::device_query::get_default_runtime_type(), m_device); init_properties(); diff --git a/src/plugins/intel_gpu/src/runtime/device_query.cpp b/src/plugins/intel_gpu/src/runtime/device_query.cpp index 7ec5d6d90edabe..5e6ba1576a17c3 100644 --- a/src/plugins/intel_gpu/src/runtime/device_query.cpp +++ b/src/plugins/intel_gpu/src/runtime/device_query.cpp @@ -10,6 +10,44 @@ namespace cldnn { int device_query::device_id = -1; + +engine_types device_query::get_default_engine_type() { + auto engine_type = engine_types::ocl; +#ifdef OV_GPU_WITH_ZE_RT + engine_type = engine_types::ze; +#endif +#ifdef OV_GPU_WITH_OCL_RT + engine_type = engine_types::ocl; +#endif +#ifdef OV_GPU_WITH_SYCL + engine_type = engine_types::sycl; +#endif + return engine_type; +} +runtime_types device_query::get_default_runtime_type() { + auto rt_type = runtime_types::ocl; +#ifdef OV_GPU_WITH_ZE_RT + rt_type = runtime_types::ze; +#endif +#ifdef OV_GPU_WITH_OCL_RT + rt_type = runtime_types::ocl; +#endif + return rt_type; +} + +device_query::device_query(void* user_context, + void* user_device, + int ctx_device_id, + int target_tile_id, + bool initialize_devices) + : device_query(get_default_engine_type(), + get_default_runtime_type(), + user_context, + user_device, + ctx_device_id, + target_tile_id, + initialize_devices) {} + device_query::device_query(engine_types engine_type, runtime_types runtime_type, void* user_context, diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp index ab0e9991406ce1..1050c9b753fa76 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -206,18 +206,22 @@ ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES; cp_offload_desc.copyOffloadEnabled = true; cp_offload_desc.pNext = nullptr; - if (info.supports_cp_offload) { + bool use_cp_offload = info.supports_cp_offload; + if (use_cp_offload) { command_queue_desc.pNext = &cp_offload_desc; - } else { - GPU_DEBUG_INFO << "Copy offload hint is not supported" << std::endl; } OV_ZE_EXPECT(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); - if (m_queue_type == QueueTypes::in_order && info.supports_counter_based_events) { + bool use_counter_based_events = m_queue_type == QueueTypes::in_order && info.supports_counter_based_events; + if (use_counter_based_events) { m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); } else { m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); } + GPU_DEBUG_INFO << "[GPU] Created L0 stream (" + << "use_cp_offload=" << use_cp_offload + << ", use_counter_based_events=" << use_counter_based_events + << ")" << std::endl; } ze_stream::~ze_stream() { diff --git a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp index fe012bf8da99f9..be8e390749b801 100644 --- a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp @@ -306,11 +306,8 @@ cldnn::ExecutionConfig get_test_default_config(const cldnn::engine& engine, } std::shared_ptr create_test_engine() { -#ifdef OV_GPU_WITH_ZE_RT - auto ret = cldnn::engine::create(engine_types::ze, runtime_types::ze); -#elif OV_GPU_WITH_OCL_RT - auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl); -#endif + auto ret = cldnn::engine::create( + cldnn::device_query::get_default_engine_type(), cldnn::device_query::get_default_runtime_type()); #ifdef ENABLE_ONEDNN_FOR_GPU if (ret->get_device_info().supports_immad) ret->create_onednn_engine({}); From d964b511aa09324e85ac35a223df297975355c63 Mon Sep 17 00:00:00 2001 From: "Kasprzak, Jakub" Date: Fri, 16 Jan 2026 12:42:03 +0000 Subject: [PATCH 74/74] Fix typo in assert --- src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index d00750696e2c06..5168e0364c4bd8 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -127,7 +127,7 @@ memory_ptr ze_engine::create_subbuffer(const memory& memory, const layout& new_l if (new_layout.format.is_image_2d()) { OPENVINO_NOT_IMPLEMENTED; } - OPENVINO_ASSERT(!memory_capabilities::is_usm_type(memory.get_allocation_type()), "[GPU] Trying to create subbuffer for non usm memory"); + OPENVINO_ASSERT(memory_capabilities::is_usm_type(memory.get_allocation_type()), "[GPU] Trying to create subbuffer for non usm memory"); auto& new_buf = reinterpret_cast(memory); auto ptr = new_buf.get_buffer().get(); auto sub_buffer = ze::UsmMemory(get_context(), get_device(), ptr, byte_offset);