diff --git a/.github/workflows/linux_riscv.yml b/.github/workflows/linux_riscv.yml index 83b03a2fe56290..1e886d80cd2598 100644 --- a/.github/workflows/linux_riscv.yml +++ b/.github/workflows/linux_riscv.yml @@ -156,6 +156,7 @@ jobs: git submodule update --init -- ${OPENVINO_REPO}/thirdparty/telemetry git submodule update --init -- ${OPENVINO_REPO}/src/plugins/intel_cpu git submodule update --init -- ${OPENVINO_REPO}/thirdparty/flatbuffers/flatbuffers + git submodule update --init -- ${OPENVINO_REPO}/thirdparty/level_zero popd # diff --git a/.gitmodules b/.gitmodules index d9733bc0d844c1..fdd61a6915652a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -93,3 +93,6 @@ [submodule "src/plugins/intel_cpu/thirdparty/xbyak_riscv"] path = src/plugins/intel_cpu/thirdparty/xbyak_riscv url = https://github.com/herumi/xbyak_riscv.git +[submodule "src/plugins/intel_gpu/thirdparty/l0_onednn_gpu"] + path = src/plugins/intel_gpu/thirdparty/l0_onednn_gpu + url = https://github.com/jkasprza/oneDNN.git diff --git a/cmake/features.cmake b/cmake/features.cmake index c9a3e898719296..40560ea61826f8 100644 --- a/cmake/features.cmake +++ b/cmake/features.cmake @@ -41,6 +41,12 @@ else() set(ENABLE_ONEDNN_FOR_GPU_DEFAULT ON) endif() +# Set default GPU runtime to OCL +set(OV_GPU_DEFAULT_RT "OCL") +if (ENABLE_INTEL_GPU) + ov_option_enum (GPU_RT_TYPE "Type of GPU runtime. 
Supported value: OCL and L0" ${OV_GPU_DEFAULT_RT} ALLOWED_VALUES L0 OCL) +endif() + ov_dependent_option (ENABLE_ONEDNN_FOR_GPU "Enable oneDNN with GPU support" ${ENABLE_ONEDNN_FOR_GPU_DEFAULT} "ENABLE_INTEL_GPU" OFF) ov_dependent_option (ENABLE_INTEL_NPU "NPU plugin for OpenVINO runtime" ON "X86_64;WIN32 OR LINUX" OFF) diff --git a/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp b/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp index 53c8de921a747d..cad287b56059bb 100644 --- a/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp +++ b/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp @@ -24,6 +24,7 @@ using gpu_handle_param = void*; enum class ContextType { OCL = 0, //!< Pure OpenCL context VA_SHARED = 1, //!< Context shared with a video decoding device + ZE = 2, //!< Pure Level0 context }; /** @cond INTERNAL */ @@ -33,6 +34,8 @@ inline std::ostream& operator<<(std::ostream& os, const ContextType& context_typ return os << "OCL"; case ContextType::VA_SHARED: return os << "VA_SHARED"; + case ContextType::ZE: + return os << "ZE"; default: OPENVINO_THROW("Unsupported context type"); } @@ -43,6 +46,8 @@ inline std::istream& operator>>(std::istream& is, ContextType& context_type) { is >> str; if (str == "OCL") { context_type = ContextType::OCL; + } else if (str == "ZE") { + context_type = ContextType::ZE; } else if (str == "VA_SHARED") { context_type = ContextType::VA_SHARED; } else { diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt index 9556ab5873c616..38e3dafbc3305f 100644 --- a/src/plugins/intel_gpu/CMakeLists.txt +++ b/src/plugins/intel_gpu/CMakeLists.txt @@ -8,6 +8,8 @@ endif() set (TARGET_NAME "openvino_intel_gpu_plugin") +include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) + if(OV_COMPILER_IS_INTEL_LLVM) # For windows we need to disable warning as error option to make FindSYCL.cmake work if (WIN32) @@ -36,6 +38,10 @@ 
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus") endif() +if(WIN32) + add_definitions(-DNOMINMAX) +endif() + if(ENABLE_GPU_DEBUG_CAPS) add_definitions(-DGPU_DEBUG_CONFIG=1) add_definitions(-DENABLE_DEBUG_CAPS=1) @@ -77,6 +83,7 @@ target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include/) ov_set_threading_interface_for(${TARGET_NAME}) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_gpu/cmake/utils.cmake b/src/plugins/intel_gpu/cmake/utils.cmake new file mode 100644 index 00000000000000..f346f50f8792cf --- /dev/null +++ b/src/plugins/intel_gpu/cmake/utils.cmake @@ -0,0 +1,15 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +function(ov_gpu_set_runtime_interface_for TARGET_NAME) + if(GPU_RT_TYPE STREQUAL "L0") + target_compile_definitions(${TARGET_NAME} PRIVATE OV_GPU_WITH_ZE_RT=1) + target_link_libraries(${TARGET_NAME} PRIVATE LevelZero::LevelZero) + elseif(GPU_RT_TYPE STREQUAL "OCL") + target_compile_definitions(${TARGET_NAME} PRIVATE OV_GPU_WITH_OCL_RT=1) + target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL) + else() + message(FATAL_ERROR "Invalid GPU runtime type: `${GPU_RT_TYPE}` Only `L0` and `OCL` are supported") + endif() +endfunction() diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp index 7425f701f88710..e210d332cc6296 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp @@ -8,6 +8,7 @@ # define NOMINMAX #endif +#include "intel_gpu/runtime/engine_configuration.hpp" #include "openvino/runtime/intel_gpu/remote_properties.hpp" #include "openvino/runtime/iremote_context.hpp" @@ -93,7 +94,11 @@ class 
RemoteContextImpl : public ov::IRemoteContext { ov::intel_gpu::gpu_handle_param m_va_display = nullptr; ov::intel_gpu::gpu_handle_param m_external_queue = nullptr; +#ifdef OV_GPU_WITH_ZE_RT + ContextType m_type = ContextType::ZE; +#else ContextType m_type = ContextType::OCL; +#endif std::string m_device_name = ""; static const size_t cache_capacity = 100; cldnn::LruCache m_memory_cache = cldnn::LruCache(cache_capacity); diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp index 66b16345984bc9..9d8bd8aa410e00 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_tensor.hpp @@ -8,11 +8,15 @@ # define NOMINMAX #endif + +// Do not include DirectX / VA wrappers when running with L0 runtime as they depend on OCL +#ifndef OV_GPU_WITH_ZE_RT #ifdef _WIN32 # include #else # include #endif +#endif #include "openvino/runtime/iremote_tensor.hpp" #include "intel_gpu/runtime/memory_caps.hpp" diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp index 7c567e877d7552..ef885414c6f1c0 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp @@ -33,4 +33,28 @@ struct device { virtual ~device() = default; }; +// The priority return by this function impacts the order of devices reported by GPU plugin and devices enumeration +// Lower priority value means lower device ID +// Current behavior is: Intel iGPU < Intel dGPU < any other GPU +// Order of Intel dGPUs is undefined and depends on the OCL impl +// Order of other vendor GPUs is undefined and depends on the OCL impl +inline size_t get_device_priority(const cldnn::device_info& info) { + if (info.vendor_id == cldnn::INTEL_VENDOR_ID && info.dev_type == cldnn::device_type::integrated_gpu) { + return 0; + } 
else if (info.vendor_id == cldnn::INTEL_VENDOR_ID) { + return 1; + } else { + return std::numeric_limits::max(); + } +} + +inline std::vector sort_devices(const std::vector& devices_list) { + std::vector sorted_list = devices_list; + std::stable_sort(sorted_list.begin(), sorted_list.end(), [](device::ptr d1, device::ptr d2) { + return get_device_priority(d1->get_info()) < get_device_priority(d2->get_info()); + }); + + return sorted_list; +} + } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index 416a72c1a527aa..94e6ff2605a100 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -56,13 +56,13 @@ struct gfx_version { <= std::tie(r.major, r.minor, r.revision); // same order } - bool operator==(const gfx_version& other) { + bool operator==(const gfx_version& other) const { return major == other.major && minor == other.minor && revision == other.revision; } - bool operator!=(const gfx_version& other) { + bool operator!=(const gfx_version& other) const { return !(*this == other); } }; @@ -73,14 +73,14 @@ struct pci_bus_info { uint32_t pci_device = 0; uint32_t pci_function = 0; - bool operator==(const pci_bus_info& other) { + bool operator==(const pci_bus_info& other) const { return pci_domain == other.pci_domain && pci_bus == other.pci_bus && pci_device == other.pci_device && pci_function == other.pci_function; } - bool operator!=(const pci_bus_info& other) { + bool operator!=(const pci_bus_info& other) const { return !(*this == other); } }; @@ -116,9 +116,14 @@ struct device_info { bool supports_imad; ///< Does engine support int8 mad. bool supports_immad; ///< Does engine support int8 multi mad. 
+ bool supports_mutable_command_list; ///< [L0] Does the target runtime/device support mutable command list feature + bool supports_usm; ///< Does engine support unified shared memory. bool has_separate_cache; ///< Does the target hardware has separate cache for usm_device and usm_host + bool supports_cp_offload; ///< [L0] Does the command queue support copy offload + bool supports_counter_based_events; ///< [L0] Does the target runtime support counter based events + std::vector supported_simd_sizes; ///< List of SIMD sizes supported by current device and compiler uint32_t vendor_id; ///< Vendor ID @@ -140,8 +145,45 @@ struct device_info { pci_bus_info pci_info; ///< PCI bus information for the device + uint64_t timer_resolution; ///< [L0] Resolution of device timer used for profiling in cycles/sec + uint32_t kernel_timestamp_valid_bits; ///< [L0] Number of valid bits in the kernel timestamp values + uint32_t compute_queue_group_ordinal; ///< [L0] Ordinal of the command queue group to use for compute + uint32_t device_memory_ordinal; ///< [L0] Ordinal of the selected global device memory + ov::device::UUID uuid; ///< UUID of the gpu device ov::device::LUID luid; ///< LUID of the gpu device + + inline bool is_same_device(const device_info &other) const { + // Relying solely on the UUID is not reliable in all the cases (particularly on legacy platforms), + // where the UUID may be missing or incorrectly generated + // Therefore, we also validate other attributes + if (uuid.uuid != other.uuid.uuid) + return false; + + if (pci_info != other.pci_info) + return false; + + if (sub_device_idx != other.sub_device_idx) + return false; + + if (vendor_id != other.vendor_id || + dev_name != other.dev_name || + driver_version != other.driver_version) + return false; + + if (dev_type != other.dev_type || + gfx_ver != other.gfx_ver || + arch != other.arch) + return false; + + if (ip_version != other.ip_version || device_id != other.device_id) + return false; + + if 
(execution_units_count != other.execution_units_count || max_global_mem_size != other.max_global_mem_size) + return false; + + return true; + } }; /// @} diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp index ecb82795c0d920..d5291f50486152 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_query.hpp @@ -17,6 +17,12 @@ namespace cldnn { struct device_query { public: static int device_id; + /// @brief Get default engine type + static engine_types get_default_engine_type(); + + /// @brief Get default runtime type + static runtime_types get_default_runtime_type(); + explicit device_query(engine_types engine_type, runtime_types runtime_type, void* user_context = nullptr, @@ -25,6 +31,13 @@ struct device_query { int target_tile_id = -1, bool initialize_devices = false); + /// @brief Create device query with default values for engine type and runtime type + explicit device_query(void* user_context = nullptr, + void* user_device = nullptr, + int ctx_device_id = 0, + int target_tile_id = -1, + bool initialize_devices = false); + std::map get_available_devices() const { return _available_devices; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp index 357508292d28ec..fa57dd19af6f59 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp @@ -12,6 +12,7 @@ #include "layout.hpp" #include "execution_config.hpp" #include "engine_configuration.hpp" +#include "kernel_builder.hpp" #include #include @@ -83,7 +84,7 @@ class engine { /// Checks whether two memory objects represents the same physical memory virtual bool is_the_same_buffer(const memory& mem1, const memory& mem2) = 0; - virtual bool check_allocatable(const layout& layout, 
allocation_type type) = 0; + virtual bool check_allocatable(const layout& layout, allocation_type type); /// Returns basic allocation type which will be used as a fallback when allocation type is not specified or device doesn't support some features. virtual allocation_type get_default_allocation_type() const = 0; @@ -141,7 +142,9 @@ class engine { virtual stream_ptr create_stream(const ExecutionConfig& config, void *handle) const = 0; /// Returns service stream which can be used during program build and optimizations - virtual stream& get_service_stream() const = 0; + virtual stream& get_service_stream() const; + + virtual std::shared_ptr create_kernel_builder() const = 0; virtual allocation_type detect_usm_allocation_type(const void* memory) const = 0; @@ -154,13 +157,9 @@ class engine { virtual void create_onednn_engine(const ExecutionConfig& config) = 0; /// Returns onednn engine object which shares device and context with current engine - virtual dnnl::engine& get_onednn_engine() const = 0; + virtual dnnl::engine& get_onednn_engine() const; #endif - /// This method is intended to create kernel handle for current engine from handle from arbitrary engine - /// For instance, source kernel can be compiled using ocl engine, and then we can build L0 kernel object based on that - virtual kernel::ptr prepare_kernel(const kernel::ptr kernel) const = 0; - /// Factory method which creates engine object with impl configured by @p engine_type /// @param engine_type requested engine type /// @param runtime_type requested execution runtime for the engine. 
@note some runtime/engine types configurations might be unsupported @@ -178,6 +177,12 @@ class engine { engine(const device::ptr device); const device::ptr _device; bool enable_large_allocations = false; + std::unique_ptr _service_stream; + +#ifdef ENABLE_ONEDNN_FOR_GPU + std::mutex onednn_mutex; + std::shared_ptr _onednn_engine; +#endif std::array, static_cast(allocation_type::max_value)> _memory_usage_data{}; std::array, static_cast(allocation_type::max_value)> _peak_memory_usage_data{}; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp index abe01b0cc5da22..8eea9df0169ab2 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine_configuration.hpp @@ -4,20 +4,22 @@ #pragma once -#include +#include namespace cldnn { /// @brief Defines available engine types enum class engine_types : int32_t { ocl, - sycl + sycl, + ze }; inline std::ostream& operator<<(std::ostream& os, const engine_types& type) { switch (type) { case engine_types::ocl: os << "ocl"; break; case engine_types::sycl: os << "sycl"; break; + case engine_types::ze: os << "ze"; break; default: os << "unknown"; break; } @@ -27,11 +29,13 @@ inline std::ostream& operator<<(std::ostream& os, const engine_types& type) { /// @brief Defines available runtime types enum class runtime_types : int32_t { ocl, + ze, }; inline std::ostream& operator<<(std::ostream& os, const runtime_types& type) { switch (type) { case runtime_types::ocl: os << "ocl"; break; + case runtime_types::ze: os << "ze"; break; default: os << "unknown"; break; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp index dffa64173fe72c..24465c4f2d0569 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp +++ 
b/src/plugins/intel_gpu/include/intel_gpu/runtime/event.hpp @@ -32,6 +32,16 @@ struct event { _profiling_captured = false; _profiling_info.clear(); } + /// @brief Set event profiling data instead of retrieving it from event object + /// @param duration_nsec duration in nanoseconds + void set_profiling_duration(uint64_t duration_nsec) { + auto stage = instrumentation::profiling_stage::executing; + auto duration = std::chrono::nanoseconds(duration_nsec); + auto period = std::make_shared(duration); + + _profiling_info.push_back({ stage, period }); + _profiling_captured = true; + } // returns true if handler has been successfully added bool add_event_handler(event_handler handler, void* data); diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp index 8dcd9d07d0f246..be273cd1d50aa7 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel.hpp @@ -4,11 +4,9 @@ #pragma once -#include "kernel_args.hpp" -#include "event.hpp" - #include #include +#include namespace cldnn { @@ -18,8 +16,15 @@ class kernel { public: using ptr = std::shared_ptr; virtual std::shared_ptr clone(bool reuse_kernel_handle = false) const = 0; + /// @brief Check if objects share the same handle to the kernel instance + /// @param other kernel object for comparison + /// @return true if underlying kernel handles are the same, false otherwise + virtual bool is_same(const kernel &other) const = 0; virtual ~kernel() = default; - virtual std::string get_id() const { return ""; } + + virtual std::string get_id() const = 0; + virtual std::vector get_binary() const = 0; + virtual std::string get_build_log() const = 0; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp new file mode 100644 index 00000000000000..90c17a77be03cf --- 
/dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_builder.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel.hpp" + +#include +#include +#include + +namespace cldnn { + +enum class KernelFormat { + SOURCE, + NATIVE_BIN, +}; + +class kernel_builder { +public: + virtual ~kernel_builder() = default; + virtual void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector &out) const = 0; +}; + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp index cdb15dbace42d7..9cd00b1c3065e5 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp @@ -224,7 +224,6 @@ struct surfaces_lock { surfaces_lock(const surfaces_lock& other) = delete; surfaces_lock& operator=(const surfaces_lock& other) = delete; - static std::unique_ptr create(engine_types engine_type, std::vector mem, const stream& stream); static bool is_lock_needed(const shared_mem_type& mem_type); }; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp index 78d4b99b32f99a..4d04792bae0bb0 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp @@ -81,6 +81,19 @@ enum class shared_mem_type { shared_mem_usm }; +inline std::ostream& operator<<(std::ostream& out, const shared_mem_type& mem_type) { + switch (mem_type) { + case shared_mem_type::shared_mem_empty: out << "shared_mem_empty"; break; + case shared_mem_type::shared_mem_buffer: out << "shared_mem_buffer"; break; + case shared_mem_type::shared_mem_image: out << "shared_mem_image"; break; + case shared_mem_type::shared_mem_vasurface: out << "shared_mem_vasurface"; break; + case 
shared_mem_type::shared_mem_dxbuffer: out << "shared_mem_dxbuffer"; break; + case shared_mem_type::shared_mem_usm: out << "shared_mem_usm"; break; + default: out << "unknown"; break; + } + return out; +} + using shared_handle = void*; using shared_surface = uint32_t; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp index 1e8300f92135e7..98f6d87066ac34 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp @@ -17,6 +17,7 @@ #endif namespace cldnn { +struct surfaces_lock; // Possible sync methods for kernels in stream enum class SyncMethods { @@ -66,12 +67,12 @@ class stream { virtual void wait_for_events(const std::vector& events) = 0; virtual event::ptr create_user_event(bool set) = 0; virtual event::ptr create_base_event() = 0; + virtual std::unique_ptr create_surfaces_lock(const std::vector &mem) const = 0; virtual event::ptr aggregate_events(const std::vector& events, bool group = false, bool is_output = false); QueueTypes get_queue_type() const { return m_queue_type; } SyncMethods get_sync_method() const { return m_sync_method; } - static QueueTypes detect_queue_type(engine_types engine_type, void* queue_handle); static SyncMethods get_expected_sync_method(const ExecutionConfig& config); #ifdef ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/graph/CMakeLists.txt b/src/plugins/intel_gpu/src/graph/CMakeLists.txt index df2a3257d6ea7f..80bf20a8273274 100644 --- a/src/plugins/intel_gpu/src/graph/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/graph/CMakeLists.txt @@ -84,6 +84,10 @@ macro(ov_gpu_add_backend_target) target_include_directories(${ARG_NAME} SYSTEM BEFORE PRIVATE $) add_dependencies(openvino_intel_gpu_${IMPL_TYPE}_obj onednn_gpu_tgt) endif() + # Onednn headers use OCL/L0 headers + ov_gpu_set_runtime_interface_for(openvino_intel_gpu_${IMPL_TYPE}_obj) + # Onednn needs OCL headers 
even when running L0 + target_link_libraries(openvino_intel_gpu_${IMPL_TYPE}_obj PRIVATE OpenCL::OpenCL) endmacro() set(CODEGEN_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/common_utils/kernels_db_gen.py") @@ -135,6 +139,9 @@ ov_build_target_faster(${TARGET_NAME} PCH) target_compile_options(${TARGET_NAME} PRIVATE ${COMMON_COMPILE_OPTIONS}) ov_set_threading_interface_for(${TARGET_NAME}) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) +# Onednn needs OCL headers even when running L0 +target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp index 4b05604f2da89f..fa00107fd4e773 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp @@ -9,6 +9,7 @@ #include "fully_connected_inst.h" #include "assign_inst.h" #include "mvn_inst.h" +#include "reorder_inst.h" #include #include diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp index e2b53fe9eafcbc..9e06c71e124da5 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp @@ -5,6 +5,7 @@ #include "pass_manager.h" #include "program_helpers.h" #include "reshape_inst.h" +#include "reorder_inst.h" #include #include diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp index 5a28094d5d86c9..b96acc4d66a1a2 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp +++ 
b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp @@ -12,7 +12,7 @@ #include "shape_of_inst.h" #include "broadcast_inst.h" #include "non_zero_inst.h" -#include "non_max_suppression_inst.h" +#include "reorder_inst.h" #include "unique_inst.hpp" #include "scatter_elements_update_inst.h" #include "scatter_update_inst.h" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp index 1566d0ad20a7f8..e521139b62f34c 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp @@ -9,6 +9,7 @@ #include "convolution_inst.h" #include "deconvolution_inst.h" #include "fully_connected_inst.h" +#include "reorder_inst.h" #include "lstm_seq_inst.h" #include "gru_seq_inst.h" #include "intel_gpu/runtime/format.hpp" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index da21f1d2220c16..6e3537f8eccd4f 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -11,6 +11,7 @@ #include "crop_inst.h" #include "eltwise_inst.h" #include "gemm_inst.h" +#include "reorder_inst.h" #include "assign_inst.h" #include "read_value_inst.h" #include "reshape_inst.h" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 0b7a105c1d7c99..7ae646ec69510f 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -12,6 +12,7 @@ #include "activation_inst.h" #include "batch_to_space_inst.h" #include "crop_inst.h" 
+#include "reorder_inst.h" #include "eltwise_inst.h" #include "gemm_inst.h" #include "lrn_inst.h" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp index 21801247ebbbe5..7737c50e585477 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing_through.cpp @@ -10,6 +10,7 @@ #include "data_inst.h" #include "eltwise_inst.h" #include "mutable_data_inst.h" +#include "reorder_inst.h" #include #include diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index f873edf9a306ec..fc2bfe638e5a75 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -22,6 +22,7 @@ #include "fully_connected_inst.h" #include "group_normalization_inst.h" #include "mvn_inst.h" +#include "reorder_inst.h" #include "rms_inst.h" #include diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp index 92cfe47926fcf0..a1f5c0f1ac1b63 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp @@ -20,10 +20,6 @@ #include "intel_gpu/runtime/itt.hpp" #include "intel_gpu/runtime/file_util.hpp" -#include "ocl/ocl_kernel.hpp" -#include "ocl/ocl_common.hpp" -#include "ocl/ocl_device.hpp" - #ifdef WIN32 #include #ifdef NTDDI_WIN10_RS5 @@ -54,41 +50,18 @@ namespace { std::mutex cacheAccessMutex; -static const cldnn::device::ptr get_target_device(const cldnn::engine& engine) { - using namespace cldnn; - if (engine.runtime_type() == runtime_types::ocl) { - return 
engine.get_device(); - } else { - ocl::ocl_device_detector detector; - auto device_map = detector.get_available_devices(nullptr, nullptr); - auto original_device = engine.get_device(); - - for (auto& d : device_map) { - const auto& target_uuid = d.second->get_info().uuid; - const auto& original_uuid = original_device->get_info().uuid; - if (target_uuid.uuid == original_uuid.uuid) - return d.second; - } +std::string join_strings(const std::vector<std::string>& strings) { + size_t total_size = 0; + for (auto &str : strings) { + total_size += str.size(); } - - OPENVINO_THROW("[GPU] Couldn't find target device for kernels cache"); -} - -#ifdef ENABLE_ONEDNN_FOR_GPU -cl::Program fuse_microkernels(const cl::Context& context, const cl::Device& device, cl::Program& program, const std::string& code) { - using namespace dnnl::impl::gpu::intel; - std::vector> binaries = program.getInfo(); - OPENVINO_ASSERT(binaries.size() == 1); - std::vector binary = binaries[0]; - micro::fuseMicrokernels(binary, code.c_str()); - - cl::Program::Binaries fused_binary = { binary }; - cl::Program fused_program(context, {device}, fused_binary); - fused_program.build({device}); - - return fused_program; + std::string acc_str; + acc_str.reserve(total_size); + for (auto &str : strings) { + acc_str.append(str); + } + return acc_str; } -#endif // ENABLE_ONEDNN_FOR_GPU std::string reorder_options(const std::string& org_options) { std::stringstream ss(org_options); @@ -307,42 +280,21 @@ kernels_cache::kernels_cache(engine& engine, uint32_t prog_id, std::shared_ptr task_executor, const std::map& batch_headers) - : _device(get_target_device(engine)) + : _device(engine.get_device()) + , _builder(engine.create_kernel_builder()) , _task_executor(task_executor) , _config(config) , _prog_id(prog_id) , batch_headers(std::move(batch_headers)) { } -static std::vector getProgramBinaries(cl::Program program) { - // Get the size of the program binary in bytes. 
- std::vector binary_sizes = program.getInfo(); - - if (binary_sizes.size() != 1) - throw std::runtime_error("Invalid binaries count"); - - size_t binary_size = binary_sizes.front(); - // Binary is not available for the device. - if (binary_size == 0) - throw std::runtime_error("Binary is not avaliable after program build"); - - // Get program binary. - return program.getInfo().front(); -} - -// TODO: This build_batch method should be backend specific void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& compiled_kernels) { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::build_batch"); - - auto& cl_build_device = dynamic_cast(*_device); - bool dump_sources = batch.dump_custom_program; std::string dump_sources_dir = GPU_DEBUG_VALUE_OR(_config.get_dump_sources_path(), ""); GPU_DEBUG_IF(!dump_sources_dir.empty()) { dump_sources = true; } - std::string err_log; // accumulated build log from all program's parts (only contains messages from parts which - std::string current_dump_file_name = ""; if (dump_sources) { current_dump_file_name = std::move(dump_sources_dir); @@ -361,128 +313,71 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co dump_file << s; } } - std::string cached_bin_name = get_cache_path() + std::to_string(batch.hash_value) + ".cl_cache"; - cl::Program::Binaries precompiled_kernels = {}; - + /////////////////////////////////////////////////////////////////////////////////// + std::vector precompiled; if (is_cache_enabled()) { - // Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket - // If read is successful, then remove kernels from compilation bucket - std::vector bin; - { - std::lock_guard lock(cacheAccessMutex); - bin = ov::util::load_binary(cached_bin_name); - } - if (!bin.empty()) { - precompiled_kernels.push_back(bin); - } + std::lock_guard lock(cacheAccessMutex); + precompiled = 
ov::util::load_binary(cached_bin_name); } - try { - cl::vector kernels; - - // Run compilation - if (precompiled_kernels.empty()) { - cl::Program program(cl_build_device.get_context(), batch.source); - { - OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildProgram::RunCompilation"); - if (program.build({cl_build_device.get_device()}, batch.options.c_str()) != CL_SUCCESS) - throw std::runtime_error("Failed in building program."); - } - - if (dump_sources && dump_file.good()) { - dump_file << "\n/* Build Log:\n"; - for (auto& p : program.getBuildInfo()) - dump_file << p.second << "\n"; - - dump_file << "*/\n"; + std::vector kernels; + if (!precompiled.empty()) { + _builder->build_kernels(precompiled.data(), precompiled.size(), KernelFormat::NATIVE_BIN, "", kernels); + } else { + auto combined_source = join_strings(batch.source); + _builder->build_kernels(combined_source.data(), combined_source.size(), KernelFormat::SOURCE, batch.options, kernels); + if (dump_sources && dump_file.good()) { + dump_file << "\n/* Build Log:\n"; + // Retrieve build log from the first kernel only + // It should be the same for all kernels in batch + if (kernels.size() > 1) { + dump_file << kernels[0]->get_build_log(); } - - if (batch.has_microkernels) { + dump_file << "\n*/\n"; + } + if (batch.has_microkernels) { #ifdef ENABLE_ONEDNN_FOR_GPU - OPENVINO_ASSERT(batch.kernels_counter == 1); - // Do we need full source code here (with batch headers)? 
- program = fuse_microkernels(cl_build_device.get_context(), cl_build_device.get_device(), program, batch.source.back()); + using namespace dnnl::impl::gpu::intel; + OPENVINO_ASSERT(batch.kernels_counter == 1 && kernels.size() == 1); + std::vector binary = kernels[0]->get_binary(); + kernels.clear(); + // Update binary and rebuild kernel + micro::fuseMicrokernels(binary, combined_source.c_str()); + _builder->build_kernels(binary.data(), binary.size(), KernelFormat::NATIVE_BIN, "", kernels); #else // ENABLE_ONEDNN_FOR_GPU - OPENVINO_THROW("[GPU] Can't compile kernel w/ microkernels as onednn is not available"); + OPENVINO_THROW("[GPU] Can't compile kernel w/ microkernels as onednn is not available"); #endif // ENABLE_ONEDNN_FOR_GPU - } - - - program.createKernels(&kernels); - - if (is_cache_enabled()) { + } + if (is_cache_enabled()) { // If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache // Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited // Bucket size can be changed by max_kernels_per_batch config option, but forcing it to 1 will lead to much longer // compile time. 
+ std::vector binary = kernels[0]->get_binary(); std::lock_guard lock(cacheAccessMutex); - ov::intel_gpu::save_binary(cached_bin_name, getProgramBinaries(std::move(program))); - } - } else { - cl::Program program(cl_build_device.get_context(), {cl_build_device.get_device()}, precompiled_kernels); - if (program.build({cl_build_device.get_device()}, batch.options.c_str()) != CL_SUCCESS) - throw std::runtime_error("Failed in building program with a precompiled kernel."); - - program.createKernels(&kernels); + ov::intel_gpu::save_binary(cached_bin_name, binary); } - - { - std::lock_guard lock(_mutex); - for (auto& k : kernels) { - const auto& entry_point = k.getInfo(); - const auto& iter = batch.entry_point_to_id.find(entry_point); - if (iter != batch.entry_point_to_id.end()) { - kernel::ptr kernel = std::make_shared(ocl::ocl_kernel_type(k, cl_build_device.get_usm_helper()), entry_point); - - auto& params = iter->second.first; - auto kernel_part_idx = iter->second.second; - if (compiled_kernels.find(params) != compiled_kernels.end()) { - compiled_kernels[params].push_back(std::make_pair(kernel, kernel_part_idx)); - } else { - compiled_kernels[params] = { std::make_pair(kernel, kernel_part_idx) }; - } - if (_kernel_batch_hash.find(params) == _kernel_batch_hash.end()) { - _kernel_batch_hash[params] = batch.hash_value; - } + } + { + std::lock_guard lock(_mutex); + for (auto& k : kernels) { + auto entry_point = k->get_id(); + const auto& iter = batch.entry_point_to_id.find(entry_point); + if (iter != batch.entry_point_to_id.end()) { + auto& params = iter->second.first; + auto kernel_part_idx = iter->second.second; + if (compiled_kernels.find(params) != compiled_kernels.end()) { + compiled_kernels[params].push_back(std::make_pair(k, kernel_part_idx)); } else { - throw std::runtime_error("Could not find entry point"); + compiled_kernels[params] = { std::make_pair(k, kernel_part_idx) }; } + if (_kernel_batch_hash.find(params) == _kernel_batch_hash.end()) { + 
_kernel_batch_hash[params] = batch.hash_value; + } + } else { + throw std::runtime_error("Could not find entry point"); } } - } catch (const cl::BuildError& err) { - if (dump_sources && dump_file.good()) - dump_file << "\n/* Build Log:\n"; - - for (auto& p : err.getBuildLog()) { - if (dump_sources && dump_file.good()) - dump_file << p.second << "\n"; - err_log += p.second + '\n'; - } - if (dump_sources && dump_file.good()) - dump_file << "*/\n"; - } - if (!err_log.empty()) { - GPU_DEBUG_INFO << "-------- OpenCL build error" << std::endl; - GPU_DEBUG_INFO << err_log << std::endl; - GPU_DEBUG_INFO << "-------- End of OpenCL build error" << std::endl; - std::stringstream err_ss(err_log); - std::string line; - std::stringstream err; - int cnt = 0; - - while (std::getline(err_ss, line, '\n')) { - if (line.find("error") != std::string::npos) - cnt = 5; - cnt--; - if (cnt > 0) - err << line << std::endl; - else if (cnt == 0) - err << "...." << std::endl; - } - - throw std::runtime_error("Program build failed(" + std::to_string(batch.bucket_id) + + "_part_" - + std::to_string(batch.batch_id) - + "):\n" + err.str()); } } @@ -504,53 +399,15 @@ std::vector kernels_cache::get_kernels(const kernel_impl_params& pa OPENVINO_ASSERT(_kernels.end() != res, "Kernel for {" + current_node_id + "} is not found in the kernel cache!"); OPENVINO_ASSERT(res->second.size() != 0, "Number of kernels should not be zero for " + current_node_id); - auto& engine = params.get_program().get_engine(); - std::vector kernels(res->second.size()); for (auto& k : res->second) { auto& kernel_ptr = k.first; auto kernel_part_idx = k.second; - kernels[kernel_part_idx] = engine.prepare_kernel(kernel_ptr->clone(_reuse_kernels)); + kernels[kernel_part_idx] = kernel_ptr->clone(_reuse_kernels); } return kernels; } -bool kernels_cache::validate_simple_kernel_execution(kernel::ptr krl) { - auto casted = downcast(krl.get()); - auto kernel = casted->get_handle(); - try { - auto casted_dev = 
dynamic_cast(_device.get()); - OPENVINO_ASSERT(casted_dev != nullptr, "device is nullptr"); - - auto device = casted_dev->get_device(); - cl::Context ctx(device); - - cl::Buffer buffer(ctx, CL_MEM_READ_WRITE, sizeof(uint8_t) * 8); - if (kernel.setArg(0, buffer) != CL_SUCCESS) - return false; - - cl::Event ev; - cl::CommandQueue queue(ctx, device); - if (queue.enqueueNDRangeKernel(kernel, cl::NDRange(), cl::NDRange(8), cl::NDRange(8), nullptr, &ev) != CL_SUCCESS) - return false; - - uint8_t result[8]; - uint8_t expected[8] = { 1, 3, 5, 7, 9, 11, 13, 15 }; - if (queue.enqueueReadBuffer(buffer, CL_TRUE, 0, sizeof(uint8_t) * 8, &result) != CL_SUCCESS) - return false; - - for (int i = 0; i < 8; ++i) { - if (result[i] != expected[i]) - return false; - } - - ev.wait(); - return true; - } catch (...) { - return false; - } -} - void kernels_cache::build_all() { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildAll"); if (!_pending_compilation) @@ -635,15 +492,12 @@ void kernels_cache::add_kernels_source(const kernel_impl_params& params, } std::string kernels_cache::get_cached_kernel_id(kernel::ptr kernel) const { - auto ocl_kernel = std::static_pointer_cast(kernel); - const auto& entry_point = ocl_kernel->get_handle().getInfo(); - auto program = ocl_kernel->get_handle().getInfo(); - cl::vector program_binaries = getProgramBinaries(std::move(program)); + auto program_binaries = kernel->get_binary(); auto iter = _cached_binaries.find(program_binaries); OPENVINO_ASSERT(iter != _cached_binaries.end(), "[GPU] Not found cached kernel binaries"); - return entry_point + "@" + std::to_string(iter->second); + return kernel->get_id() + "@" + std::to_string(iter->second); } std::vector kernels_cache::get_cached_kernel_ids(const std::vector& kernels) const { @@ -661,9 +515,7 @@ void kernels_cache::add_to_cached_kernels(const std::vector& kernel static std::atomic id_gen{0}; for (auto& kernel : kernels) { - auto ocl_kernel = 
std::static_pointer_cast(kernel); - auto program = ocl_kernel->get_handle().getInfo(); - cl::vector program_binaries = getProgramBinaries(std::move(program)); + auto program_binaries = kernel->get_binary(); std::lock_guard lock(_mutex); auto iter = _cached_binaries.find(program_binaries); @@ -698,7 +550,7 @@ void kernels_cache::save(BinaryOutputBuffer& ob) const { ob << cached_binary.first; ob << is_zebin_binary; if (!is_zebin_binary) { - auto driver_version = downcast(*_device).get_info().driver_version; + auto driver_version = _device->get_info().driver_version; ob << driver_version; } } @@ -707,8 +559,6 @@ void kernels_cache::save(BinaryOutputBuffer& ob) const { void kernels_cache::load(BinaryInputBuffer& ib) { std::unordered_map> precompiled_kernels; - const auto& build_device = downcast(*_device); - size_t num_cached_binaries; ib >> num_cached_binaries; for (size_t i = 0; i < num_cached_binaries; ++i) { @@ -722,7 +572,7 @@ void kernels_cache::load(BinaryInputBuffer& ib) { // Legacy patchtoken path std::string driver_version, current_driver_version; ib >> driver_version; - current_driver_version = build_device.get_info().driver_version; + current_driver_version = _device->get_info().driver_version; if (driver_version != current_driver_version) { OPENVINO_THROW("Driver version mismatch in cached patchtoken kernels"); @@ -730,31 +580,22 @@ void kernels_cache::load(BinaryInputBuffer& ib) { } } - try { + { std::lock_guard lock(_mutex); _cached_kernels.clear(); for (auto& precompiled_kernel : precompiled_kernels) { - cl::vector kernels; - cl::Program program(build_device.get_context(), {build_device.get_device()}, {precompiled_kernel.second}); - program.build({build_device.get_device()}); - program.createKernels(&kernels); - + std::vector kernels; + _builder->build_kernels(precompiled_kernel.second.data(), precompiled_kernel.second.size(), KernelFormat::NATIVE_BIN, "", kernels); for (auto& k : kernels) { - const auto& entry_point = k.getInfo(); + const auto& 
entry_point = k->get_id(); std::string cached_kernel_id = entry_point + "@" + std::to_string(precompiled_kernel.first); const auto& iter = _cached_kernels.find(cached_kernel_id); if (iter == _cached_kernels.end()) { - _cached_kernels[cached_kernel_id] = std::make_shared(ocl::ocl_kernel_type(k, build_device.get_usm_helper()), entry_point); + _cached_kernels[cached_kernel_id] = k; } } } - } catch (const cl::BuildError& err) { - std::string err_log = ""; - for (auto& p : err.getBuildLog()) { - err_log += p.second + '\n'; - } - OPENVINO_THROW(err_log); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp index dc77442fedef47..3b6a5cf78032dd 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.hpp @@ -7,6 +7,7 @@ #include "intel_gpu/graph/serialization/binary_buffer.hpp" #include "intel_gpu/runtime/device.hpp" #include "intel_gpu/runtime/kernel.hpp" +#include "intel_gpu/runtime/kernel_builder.hpp" #include "intel_gpu/runtime/execution_config.hpp" #include "intel_gpu/graph/kernel_impl_params.hpp" @@ -97,6 +98,7 @@ class kernels_cache { private: static std::mutex _mutex; const device::ptr _device; + std::shared_ptr _builder; std::shared_ptr _task_executor; ExecutionConfig _config; uint32_t _prog_id = 0; @@ -127,8 +129,6 @@ class kernels_cache { void set_kernels_reuse(bool reuse_kernels) { _reuse_kernels = reuse_kernels; } bool get_kernels_reuse() const { return _reuse_kernels; } - bool validate_simple_kernel_execution(kernel::ptr kernel); - // forces compilation of all pending kernels/programs void build_all(); void reset(); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index 46af62be2069f8..8ff102848b0dd4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ 
b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -139,7 +139,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { if (is_cpu()) { return; } - _kernels.clear(); if (!_kernel_data.kernels.empty()) { auto compiled_kernels = kernels_cache.get_kernels(params); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp index ce5701f2739cf2..08d254b65d7d62 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.hpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // + +#include "reorder_inst.h" #include "registry/implementation_manager.hpp" #include "intel_gpu/primitives/reorder.hpp" #include "program_node.h" diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index 7094ddcf1d68c5..f5b0158c747844 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -12,7 +12,6 @@ #ifdef ENABLE_ONEDNN_FOR_GPU # include # include -# include # include # include # include diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h index 5ac39d3c13968a..33d87ea1c394db 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h @@ -12,7 +12,6 @@ #include "intel_gpu/runtime/file_util.hpp" #include "to_string_utils.h" #include "utils.hpp" -#include "runtime/ocl/ocl_event.hpp" #include "intel_gpu/primitives/reorder.hpp" @@ -543,8 +542,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl { try { _prim.execute(stream.get_onednn_stream(), _args[net_id]); } catch (dnnl::error& err) { - auto err_code = 
err.status == dnnl_status_t::dnnl_out_of_memory ? CL_OUT_OF_RESOURCES : CL_INVALID_OPERATION; - ocl::rethrow(err.what(), err_code, _engine->get_device_info()); + OPENVINO_THROW(err.what()); } if (_enable_profiling) { @@ -553,12 +551,11 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl { stream.wait(); std::vector duration = dnnl::get_profiling_data(stream.get_onednn_stream(), dnnl::profiling_data_kind::time); - if (duration.empty()) { - event = std::make_shared(0); - } else { + event = stream.create_user_event(true); + if (!duration.empty()) { OPENVINO_ASSERT(duration.size() == 1, "[GPU] oneDNN profiling data is expected to have info only for single primitive ", "actual number is ", duration.size()); - event = std::make_shared(duration[0]); + event->set_profiling_duration(duration[0]); } } else { diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index dff786a9853098..b095d68c2d108a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -5,7 +5,6 @@ #include "utils.hpp" #include #include -#include namespace cldnn { namespace onednn { diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 469d63e331cd56..e89a463530aea1 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -715,13 +715,13 @@ std::map network::execute(const std::vector) diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 14a2a08f63d0d6..e8c18242695624 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -217,12 +217,7 @@ Plugin::Plugin() { set_device_name("GPU"); register_primitives(); - // Set OCL runtime which should be always available -#ifdef OV_GPU_WITH_SYCL - cldnn::device_query 
device_query(cldnn::engine_types::sycl, cldnn::runtime_types::ocl); -#else - cldnn::device_query device_query(cldnn::engine_types::ocl, cldnn::runtime_types::ocl); -#endif + cldnn::device_query device_query; m_device_map = device_query.get_available_devices(); // Set default configs for each device diff --git a/src/plugins/intel_gpu/src/plugin/remote_context.cpp b/src/plugins/intel_gpu/src/plugin/remote_context.cpp index 885ceb9bda4720..63c8bd043be974 100644 --- a/src/plugins/intel_gpu/src/plugin/remote_context.cpp +++ b/src/plugins/intel_gpu/src/plugin/remote_context.cpp @@ -65,12 +65,9 @@ RemoteContextImpl::RemoteContextImpl(const std::mapget_info().dev_name << ")" << std::endl; -#ifdef OV_GPU_WITH_SYCL - const auto engine_type = cldnn::engine_types::sycl; -#else - const auto engine_type = cldnn::engine_types::ocl; -#endif - const auto runtime_type = cldnn::runtime_types::ocl; - m_device->initialize(); // Initialize associated device before use - m_engine = cldnn::engine::create(engine_type, runtime_type, m_device); + m_engine = cldnn::engine::create( + cldnn::device_query::get_default_engine_type(), cldnn::device_query::get_default_runtime_type(), m_device); init_properties(); diff --git a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp index c2524c8ab3d1a9..9faecbbb6842e4 100644 --- a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp +++ b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp @@ -317,7 +317,15 @@ void RemoteTensorImpl::allocate() { switch (m_mem_type) { case TensorType::BT_BUF_INTERNAL: { - m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem, reset); + // BT_BUF_INTERNAL should map to cl_mem however L0 engine can not allocate cl_mem + if (engine.supports_allocation(cldnn::allocation_type::cl_mem)) { + m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem, reset); + } else { + // Fall back to usm_host and override memory type + 
GPU_DEBUG_COUT << "[Warning] [GPU] Could not allocate cl_mem, using usm_host allocation instead\n"; + m_mem_type = TensorType::BT_USM_HOST_INTERNAL; + m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_host, reset); + } break; } case TensorType::BT_USM_HOST_INTERNAL: { diff --git a/src/plugins/intel_gpu/src/runtime/CMakeLists.txt b/src/plugins/intel_gpu/src/runtime/CMakeLists.txt index 8daa9444a97304..e052e427dc90df 100644 --- a/src/plugins/intel_gpu/src/runtime/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/runtime/CMakeLists.txt @@ -16,15 +16,25 @@ file(GLOB LIBRARY_SOURCES_OCL "${CMAKE_CURRENT_SOURCE_DIR}/ocl/*.cpp" ) -set(LIBRARY_SOURCES_ALL - ${LIBRARY_SOURCES_MAIN} - ${LIBRARY_SOURCES_OCL} - ) - +file(GLOB LIBRARY_SOURCES_ZE + "${CMAKE_CURRENT_SOURCE_DIR}/ze/*.h" + "${CMAKE_CURRENT_SOURCE_DIR}/ze/*.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/ze/*.cpp" +) file(GLOB_RECURSE SYCL_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/ocl/sycl_*.cpp") if(NOT OV_COMPILER_IS_INTEL_LLVM) - list(REMOVE_ITEM LIBRARY_SOURCES_ALL ${SYCL_SOURCES}) + list(REMOVE_ITEM LIBRARY_SOURCES_OCL ${SYCL_SOURCES}) +endif() + +set(LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_MAIN}) + +if(GPU_RT_TYPE STREQUAL "OCL") + list(APPEND LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_OCL}) +endif() + +if(GPU_RT_TYPE STREQUAL "L0") + list(APPEND LIBRARY_SOURCES_ALL ${LIBRARY_SOURCES_ZE}) endif() add_library(${TARGET_NAME} STATIC ${LIBRARY_SOURCES_ALL}) @@ -58,6 +68,7 @@ if(ENABLE_ONEDNN_FOR_GPU) endif() ov_set_threading_interface_for(${TARGET_NAME}) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL diff --git a/src/plugins/intel_gpu/src/runtime/device_query.cpp b/src/plugins/intel_gpu/src/runtime/device_query.cpp index 6b6ce3787cf73b..5e6ba1576a17c3 100644 --- a/src/plugins/intel_gpu/src/runtime/device_query.cpp +++ b/src/plugins/intel_gpu/src/runtime/device_query.cpp @@ -4,12 +4,50 @@ #include "intel_gpu/runtime/device_query.hpp" #include 
"ocl/ocl_device_detector.hpp" +#include "ze/ze_device_detector.hpp" #include -#include namespace cldnn { int device_query::device_id = -1; + +engine_types device_query::get_default_engine_type() { + auto engine_type = engine_types::ocl; +#ifdef OV_GPU_WITH_ZE_RT + engine_type = engine_types::ze; +#endif +#ifdef OV_GPU_WITH_OCL_RT + engine_type = engine_types::ocl; +#endif +#ifdef OV_GPU_WITH_SYCL + engine_type = engine_types::sycl; +#endif + return engine_type; +} +runtime_types device_query::get_default_runtime_type() { + auto rt_type = runtime_types::ocl; +#ifdef OV_GPU_WITH_ZE_RT + rt_type = runtime_types::ze; +#endif +#ifdef OV_GPU_WITH_OCL_RT + rt_type = runtime_types::ocl; +#endif + return rt_type; +} + +device_query::device_query(void* user_context, + void* user_device, + int ctx_device_id, + int target_tile_id, + bool initialize_devices) + : device_query(get_default_engine_type(), + get_default_runtime_type(), + user_context, + user_device, + ctx_device_id, + target_tile_id, + initialize_devices) {} + device_query::device_query(engine_types engine_type, runtime_types runtime_type, void* user_context, @@ -17,17 +55,24 @@ device_query::device_query(engine_types engine_type, int ctx_device_id, int target_tile_id, bool initialize_devices) { - switch (engine_type) { - case engine_types::sycl: - case engine_types::ocl: { - if (runtime_type != runtime_types::ocl) - throw std::runtime_error("Unsupported runtime type for ocl engine"); - + switch (runtime_type) { +#ifdef OV_GPU_WITH_OCL_RT + case runtime_types::ocl: { + OPENVINO_ASSERT(engine_type == engine_types::ocl || engine_type == engine_types::sycl); ocl::ocl_device_detector ocl_detector; _available_devices = ocl_detector.get_available_devices(user_context, user_device, ctx_device_id, target_tile_id, initialize_devices); break; } - default: throw std::runtime_error("Unsupported engine type in device_query"); +#endif +#ifdef OV_GPU_WITH_ZE_RT + case runtime_types::ze: { + OPENVINO_ASSERT(engine_type == 
engine_types::ze); + ze::ze_device_detector ze_detector; + _available_devices = ze_detector.get_available_devices(user_context, user_device, ctx_device_id, target_tile_id, initialize_devices); + break; + } +#endif + default: OPENVINO_THROW("[GPU] Unsupported engine/runtime types in device_query"); } } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp index 7cbf0ed7e77020..959fb414a51c68 100644 --- a/src/plugins/intel_gpu/src/runtime/engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/engine.cpp @@ -10,6 +10,7 @@ #include "intel_gpu/runtime/debug_configuration.hpp" #include "ocl/ocl_engine_factory.hpp" +#include "ze/ze_engine_factory.hpp" #include #include @@ -262,9 +263,16 @@ std::shared_ptr engine::create(engine_types engine_type, runtime_ ret = ocl::create_sycl_engine(device, runtime_type); break; #endif // OV_GPU_WITH_SYCL +#ifdef OV_GPU_WITH_OCL_RT case engine_types::ocl: ret = ocl::create_ocl_engine(device, runtime_type); break; +#endif +#ifdef OV_GPU_WITH_ZE_RT + case engine_types::ze: + ret = ze::create_ze_engine(device, runtime_type); + break; +#endif default: throw std::runtime_error("Invalid engine type"); } @@ -286,4 +294,60 @@ std::shared_ptr engine::create(engine_types engine_type, runtime_ return engine::create(engine_type, runtime_type, device); } +bool engine::check_allocatable(const layout& layout, allocation_type type) { + OPENVINO_ASSERT(supports_allocation(type), "[GPU] Unsupported allocation type: ", type); + + if (!get_enable_large_allocations()) { + bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size); + + // When dynamic shape upper bound makes bigger buffer, then return false. 
+ if (exceed_allocatable_mem_size && layout.is_dynamic()) { + OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); + return false; + } + + OPENVINO_ASSERT(!exceed_allocatable_mem_size, + "[GPU] Exceeded max size of memory object allocation: ", + "requested ", layout.bytes_count(), " bytes, " + "but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.", + "Please try to reduce batch size or use lower precision."); + } + + auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host); + auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size()); + + // When dynamic shape upper bound makes bigger buffer, then return false. + if (exceed_available_mem_size && layout.is_dynamic()) { + OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); + return false; + } + +#ifdef __unix__ + // Prevent the process from being killed by the Linux OOM killer + OPENVINO_ASSERT(!exceed_available_mem_size, + "[GPU] Exceeded max size of memory allocation: ", + "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ", + "but available memory size is ", get_max_memory_size(), " bytes"); +#else + if (exceed_available_mem_size) { + GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : " + << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl; + GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl; + } +#endif + + return true; +} + +#ifdef ENABLE_ONEDNN_FOR_GPU +dnnl::engine& engine::get_onednn_engine() const { + OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. 
Please check that create_onednn_engine() was called"); + return *_onednn_engine; +} +#endif + +stream& engine::get_service_stream() const { + return *_service_stream; +} + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/memory.cpp b/src/plugins/intel_gpu/src/runtime/memory.cpp index 117d934d0d7d0c..46904030b6ee52 100644 --- a/src/plugins/intel_gpu/src/runtime/memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/memory.cpp @@ -7,8 +7,6 @@ #include "intel_gpu/runtime/stream.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" -#include "ocl/ocl_memory.hpp" - #include #include #include @@ -45,15 +43,6 @@ memory::memory(engine* engine, const layout& layout, allocation_type type, std:: : _engine(engine), _layout(layout), _bytes_count(_layout.bytes_count()), m_mem_tracker(mem_tracker), _type(type) { } -std::unique_ptr surfaces_lock::create(engine_types engine_type, std::vector mem, const stream& stream) { - switch (engine_type) { - case engine_types::sycl: - case engine_types::ocl: - return std::unique_ptr(new ocl::ocl_surfaces_lock(mem, stream)); - default: throw std::runtime_error("Unsupported engine type in surfaces_lock::create"); - } -} - bool surfaces_lock::is_lock_needed(const shared_mem_type& mem_type) { return mem_type == shared_mem_type::shared_mem_vasurface || mem_type == shared_mem_type::shared_mem_dxbuffer || diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index ca0193f85e1d3c..d935b6ad900e54 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -2,12 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#ifdef ENABLE_ONEDNN_FOR_GPU +#if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT) #ifndef NOMINMAX # define NOMINMAX #endif #include "gpu/intel/jit/generator.hpp" -#endif // ENABLE_ONEDNN_FOR_GPU +#endif #include "ocl_device.hpp" #include "ocl_common.hpp" @@ -51,7 +51,7 @@ 
namespace ocl { namespace { -#ifdef ENABLE_ONEDNN_FOR_GPU +#if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT) gpu_arch convert_ngen_arch(ngen::HW gpu_arch) { switch (gpu_arch) { case ngen::HW::Gen9: return gpu_arch::gen9; @@ -344,8 +344,17 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex info.num_ccs = std::max(num_queues, info.num_ccs); } + info.supports_mutable_command_list = false; + + // Not supported + info.timer_resolution = 0; + info.kernel_timestamp_valid_bits = 0; + info.compute_queue_group_ordinal = 0; + info.device_memory_ordinal = 0; + info.supports_cp_offload = false; + info.supports_counter_based_events = false; -#ifdef ENABLE_ONEDNN_FOR_GPU +#if defined(ENABLE_ONEDNN_FOR_GPU) && defined(OV_GPU_WITH_OCL_RT) using namespace dnnl::impl::gpu::intel::jit; if (context.get() != nullptr) { ngen::Product product = ngen::OpenCLCodeGenerator::detectHWInfo(context.get(), device.get()); @@ -377,6 +386,7 @@ bool does_device_support(int32_t param, const cl::Device& device) { memory_capabilities init_memory_caps(const cl::Device& device, const device_info& info) { std::vector memory_caps; + memory_caps.push_back(allocation_type::cl_mem); if (info.supports_usm) { if (does_device_support(CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL, device)) { memory_caps.push_back(allocation_type::usm_host); @@ -428,36 +438,7 @@ bool ocl_device::is_same(const device::ptr other) { // Short path if cl_device is the same if (_platform == casted->_platform && _device.get() && casted->_device.get() && _device == casted->_device) return true; - - // Relying solely on the UUID is not reliable in all the cases (particularly on legacy platforms), - // where the UUID may be missing or incorrectly generated - // Therefore, we also validate other attributes - if (_info.uuid.uuid != casted->_info.uuid.uuid) - return false; - - if (_info.pci_info != casted->_info.pci_info) - return false; - - if (_info.sub_device_idx != casted->_info.sub_device_idx) - 
return false; - - if (_info.vendor_id != casted->_info.vendor_id || - _info.dev_name != casted->_info.dev_name || - _info.driver_version != casted->_info.driver_version) - return false; - - if (_info.dev_type != casted->_info.dev_type || - _info.gfx_ver != casted->_info.gfx_ver || - _info.arch != casted->_info.arch) - return false; - - if (_info.ip_version != casted->_info.ip_version || _info.device_id != casted->_info.device_id) - return false; - - if (_info.execution_units_count != casted->_info.execution_units_count || _info.max_global_mem_size != casted->_info.max_global_mem_size) - return false; - - return true; + return _info.is_same_device(casted->_info); } void ocl_device::set_mem_caps(const memory_capabilities& memory_capabilities) { diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp index 11dd1ea71d0a14..6f3443fe2163ea 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp @@ -69,20 +69,6 @@ bool does_device_match_config(const cl::Device& device) { return true; } -// The priority return by this function impacts the order of devices reported by GPU plugin and devices enumeration -// Lower priority value means lower device ID -// Current behavior is: Intel iGPU < Intel dGPU < any other GPU -// Order of Intel dGPUs is undefined and depends on the OCL impl -// Order of other vendor GPUs is undefined and depends on the OCL impl -size_t get_device_priority(const cldnn::device_info& info) { - if (info.vendor_id == cldnn::INTEL_VENDOR_ID && info.dev_type == cldnn::device_type::integrated_gpu) { - return 0; - } else if (info.vendor_id == cldnn::INTEL_VENDOR_ID) { - return 1; - } else { - return std::numeric_limits::max(); - } -} } // namespace namespace cldnn { @@ -133,15 +119,6 @@ static std::vector getSubDevices(cl::Device& rootDevice) { return subDevices; } -std::vector 
ocl_device_detector::sort_devices(const std::vector& devices_list) { - std::vector sorted_list = devices_list; - std::stable_sort(sorted_list.begin(), sorted_list.end(), [](device::ptr d1, device::ptr d2) { - return get_device_priority(d1->get_info()) < get_device_priority(d2->get_info()); - }); - - return sorted_list; -} - std::map ocl_device_detector::get_available_devices(void* user_context, void* user_device, int ctx_device_id, diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp index 121fc025098b08..0992d734bf512b 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp @@ -25,8 +25,6 @@ class ocl_device_detector { int target_tile_id = -1, bool initialize_devices = false) const; - static std::vector sort_devices(const std::vector& devices_list); - private: std::vector create_device_list() const; std::vector create_device_list_from_user_context(void* user_context, int ctx_device_id = 0) const; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp index 35cef4bafb88a0..6e45fdd788a109 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp @@ -4,7 +4,8 @@ #include "ocl_engine.hpp" #include "intel_gpu/runtime/utils.hpp" -#include "ocl/ocl_kernel.hpp" +#include "ocl_kernel.hpp" +#include "ocl_kernel_builder.hpp" #include "ocl_common.hpp" #include "ocl_memory.hpp" #include "ocl_stream.hpp" @@ -63,15 +64,13 @@ void ocl_engine::create_onednn_engine(const ExecutionConfig& config) { if (!_onednn_engine) { auto casted = std::dynamic_pointer_cast(_device); OPENVINO_ASSERT(casted, "[GPU] Invalid device type stored in ocl_engine"); - +#ifdef OV_GPU_WITH_ZE_RT + OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime"); +#else _onednn_engine = 
std::make_shared(dnnl::ocl_interop::make_engine(casted->get_device().get(), casted->get_context().get())); +#endif } } - -dnnl::engine& ocl_engine::get_onednn_engine() const { - OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. Please check that create_onednn_engine() was called"); - return *_onednn_engine; -} #endif const cl::Context& ocl_engine::get_cl_context() const { @@ -97,51 +96,6 @@ allocation_type ocl_engine::detect_usm_allocation_type(const void* memory) const : allocation_type::unknown; } -bool ocl_engine::check_allocatable(const layout& layout, allocation_type type) { - OPENVINO_ASSERT(supports_allocation(type) || type == allocation_type::cl_mem, "[GPU] Unsupported allocation type: ", type); - - if (!get_enable_large_allocations()) { - bool exceed_allocatable_mem_size = (layout.bytes_count() > get_device_info().max_alloc_mem_size); - - // When dynamic shape upper bound makes bigger buffer, then return false. - if (exceed_allocatable_mem_size && layout.is_dynamic()) { - OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); - return false; - } - - OPENVINO_ASSERT(!exceed_allocatable_mem_size, - "[GPU] Exceeded max size of memory object allocation: ", - "requested ", layout.bytes_count(), " bytes, " - "but max alloc size supported by device is ", get_device_info().max_alloc_mem_size, " bytes.", - "Please try to reduce batch size or use lower precision."); - } - - auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host); - auto exceed_available_mem_size = (layout.bytes_count() + used_mem > get_max_memory_size()); - - // When dynamic shape upper bound makes bigger buffer, then return false. 
- if (exceed_available_mem_size && layout.is_dynamic()) { - OPENVINO_ASSERT(layout.has_upper_bound(), "[GPU] Dynamic shape without upper bound tries to allocate"); - return false; - } - -#ifdef __unix__ - // Prevent from being killed by Ooo Killer of Linux - OPENVINO_ASSERT(!exceed_available_mem_size, - "[GPU] Exceeded max size of memory allocation: ", - "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ", - "but available memory size is ", get_max_memory_size(), " bytes"); -#else - if (exceed_available_mem_size) { - GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : " - << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl; - GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl; - } -#endif - - return true; -} - memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) { OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout"); @@ -304,9 +258,10 @@ void* ocl_engine::get_user_context() const { return static_cast(cl_device.get_context().get()); } -kernel::ptr ocl_engine::prepare_kernel(const kernel::ptr kernel) const { - OPENVINO_ASSERT(downcast(kernel.get()) != nullptr); - return kernel; +std::shared_ptr ocl_engine::create_kernel_builder() const { + auto cl_device = std::dynamic_pointer_cast(_device); + OPENVINO_ASSERT(cl_device, "[GPU] Invalid device type for ocl_engine"); + return std::make_shared(*cl_device); } bool ocl_engine::extension_supported(std::string extension) const { @@ -321,10 +276,6 @@ stream::ptr ocl_engine::create_stream(const ExecutionConfig& config, void* handl return std::make_shared(*this, config, handle); } -stream& ocl_engine::get_service_stream() const { - return *_service_stream; -} - std::shared_ptr ocl_engine::create(const 
device::ptr device, runtime_types runtime_type) { return std::make_shared(device, runtime_type); } diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp index 0fa77922f03442..e9d152e20fd88c 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp @@ -29,7 +29,6 @@ class ocl_engine : public engine { memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) override; memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override; bool is_the_same_buffer(const memory& mem1, const memory& mem2) override; - bool check_allocatable(const layout& layout, allocation_type type) override; void* get_user_context() const override; @@ -44,26 +43,17 @@ class ocl_engine : public engine { stream_ptr create_stream(const ExecutionConfig& config) const override; stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override; - stream& get_service_stream() const override; - kernel::ptr prepare_kernel(const kernel::ptr kernel) const override; + std::shared_ptr create_kernel_builder() const override; #ifdef ENABLE_ONEDNN_FOR_GPU void create_onednn_engine(const ExecutionConfig& config) override; - // Returns onednn engine object which shares device and context with current engine - dnnl::engine& get_onednn_engine() const override; #endif static std::shared_ptr create(const device::ptr device, runtime_types runtime_type); private: std::string _extensions; - std::unique_ptr _service_stream; - -#ifdef ENABLE_ONEDNN_FOR_GPU - std::mutex onednn_mutex; - std::shared_ptr _onednn_engine; -#endif }; } // namespace ocl diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp index 29a27e5ea6acee..26fbe6fd272487 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp +++ 
b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp @@ -100,16 +100,6 @@ static const std::vector profiling_periods{ }; bool ocl_event::get_profiling_info_impl(std::list& info) { - if (duration_nsec.has_value()) { - auto stage = instrumentation::profiling_stage::executing; - auto duration = std::chrono::nanoseconds(duration_nsec.value()); - auto period = std::make_shared(duration); - - info.push_back({ stage, period }); - - return true; - } - if (!is_event_profiled(_event)) return true; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp index d51b7de50167b1..7efb87d8775405 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp @@ -24,10 +24,6 @@ struct ocl_event : public ocl_base_event { : ocl_base_event(queue_stamp) , _event(ev) {} - ocl_event(uint64_t duration_nsec, uint64_t queue_stamp = 0) - : ocl_base_event(queue_stamp) - , duration_nsec(duration_nsec) {} - cl::Event& get() override { return _event; } private: @@ -45,7 +41,6 @@ struct ocl_event : public ocl_base_event { protected: cl::Event _event; - std::optional duration_nsec; }; struct ocl_events : public ocl_base_event { diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp new file mode 100644 index 00000000000000..7a59b3c4169b3b --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ocl_kernel.hpp" +#include + +namespace cldnn { +namespace ocl { + +std::vector ocl_kernel::get_binary() const { + // Get the corresponding program object for the kernel + cl_program program; + cl_int error = clGetKernelInfo(_compiled_kernel.get(), CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr); + if (error) { + throw std::runtime_error("Failed to retrieve CL_KERNEL_PROGRAM: " + 
std::to_string(error)); + } + + // Get the size of the program binary in bytes. + size_t binary_size = 0; + error = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, nullptr); + if (error) { + throw std::runtime_error("Failed to retrieve CL_PROGRAM_BINARY_SIZES: " + std::to_string(error)); + } + + // Binary is not available for the device. + if (binary_size == 0) + throw std::runtime_error("get_binary: Binary size is zero"); + + // Get program binary. + std::vector binary(binary_size); + uint8_t* binary_buffer = binary.data(); + error = clGetProgramInfo(program, CL_PROGRAM_BINARIES, binary_size, &binary_buffer, nullptr); + if (error) { + throw std::runtime_error("Failed to retrieve CL_PROGRAM_BINARIES: " + std::to_string(error)); + } + + return binary; +} + +std::string ocl_kernel::get_build_log() const { + auto program = _compiled_kernel.getInfo(); + auto log = program.getBuildInfo(); + // Assume program was build for only 1 device + // Return first log + if (log.size() > 0) { + return log[0].second; + } + OPENVINO_THROW("[GPU] Failed to retrieve kernel build log"); +} + +} // namespace ocl +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp index 206db55057cf5e..446fe51a7193c9 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel.hpp @@ -33,6 +33,16 @@ class ocl_kernel : public kernel { return std::make_shared(get_handle().clone(), _kernel_id); } + virtual bool is_same(const kernel &other) const { + auto other_ptr = dynamic_cast(&other); + if (other_ptr == nullptr) { + return false; + } + return get_handle().get() == other_ptr->get_handle().get(); + } + + std::vector get_binary() const override; + std::string get_build_log() const override; }; } // namespace ocl diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp 
b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp new file mode 100644 index 00000000000000..97b624e4915b47 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_kernel_builder.hpp @@ -0,0 +1,78 @@ +// Copyright (C) 2016-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/kernel_builder.hpp" +#include "intel_gpu/runtime/device.hpp" + +#include "ocl_device.hpp" +#include "ocl_kernel.hpp" + + +namespace cldnn { +namespace ocl { + +class ocl_kernel_builder : public kernel_builder{ + public: + ocl_kernel_builder(const ocl_device &device) : m_device(device) {} + + void build_kernels(const void *src, + size_t src_bytes, + KernelFormat src_format, + const std::string &options, + std::vector &out) const override { + auto context = m_device.get_context().get(); + + cl_program program_handle; + cl_int err = CL_INVALID_VALUE; + switch (src_format) { + case KernelFormat::SOURCE: { + const char **strings = reinterpret_cast(&src); + const size_t *lenghts = &src_bytes; + const cl_uint count = 1; + program_handle = clCreateProgramWithSource(context, count, strings, lenghts, &err); + break; + } + case KernelFormat::NATIVE_BIN: { + const unsigned char **binaries = reinterpret_cast(&src); + const size_t *lenghts = &src_bytes; + const cl_device_id device_id = m_device.get_device().get(); + const cl_uint count = 1; + program_handle = clCreateProgramWithBinary(context, count, &device_id, lenghts, binaries, nullptr, &err); + break; + } + default: + OPENVINO_THROW("[GPU] Trying to build kernel from unexpected format"); + break; + } + if (err != CL_SUCCESS) { + OPENVINO_THROW("[GPU] Failed to create program during kernel build process"); + } + cl::Program program(program_handle); + if (program.build({m_device.get_device()}, options.c_str()) != CL_SUCCESS) { + GPU_DEBUG_INFO << "-------- Kernel build error" << std::endl; + auto log = program.getBuildInfo(); + for (auto &e : log) { + GPU_DEBUG_INFO << 
e.second; + } + GPU_DEBUG_INFO << "-------- End of Kernel build error" << std::endl; + OPENVINO_THROW("[GPU] Failed to build program"); + } + cl::vector kernels; + if (program.createKernels(&kernels) != CL_SUCCESS) { + OPENVINO_THROW("[GPU] Failed to create kernels"); + } + for (auto& k : kernels) { + const auto &entry_point = k.getInfo(); + out.push_back(std::make_shared(ocl::ocl_kernel_type(k, m_device.get_usm_helper()), entry_point)); + } + } + + private: + const ocl_device &m_device; +}; +} // namespace ocl +} // namespace cldnn + diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp index fc238724f6e30f..cd41fa46f3bcec 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp @@ -229,7 +229,11 @@ event::ptr gpu_buffer::copy_to(stream& stream, void* data_ptr, size_t src_offset dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const { auto onednn_engine = _engine->get_onednn_engine(); dnnl::memory dnnl_mem(desc, onednn_engine, DNNL_MEMORY_NONE); +#ifdef OV_GPU_WITH_ZE_RT + OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime"); +#else dnnl::ocl_interop::set_mem_object(dnnl_mem, _buffer.get()); +#endif return dnnl_mem; } #endif @@ -661,9 +665,13 @@ event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, s #ifdef ENABLE_ONEDNN_FOR_GPU dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const { auto onednn_engine = _engine->get_onednn_engine(); +#ifdef OV_GPU_WITH_ZE_RT + OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime"); +#else dnnl::memory dnnl_mem = dnnl::ocl_interop::make_memory(desc, onednn_engine, dnnl::ocl_interop::memory_kind::usm, reinterpret_cast(_buffer.get()) + offset); return dnnl_mem; +#endif } #endif diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp 
b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp index f5e9b74a5e681b..a7a02d69f205f7 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp @@ -11,6 +11,7 @@ #include "intel_gpu/runtime/debug_configuration.hpp" #include "ocl_kernel.hpp" #include "ocl_common.hpp" +#include "ocl_memory.hpp" #include #include @@ -239,7 +240,11 @@ dnnl::stream& ocl_stream::get_onednn_stream() { OPENVINO_ASSERT(m_queue_type == QueueTypes::in_order, "[GPU] Can't create onednn stream handle as onednn doesn't support out-of-order queue"); OPENVINO_ASSERT(_engine.get_device_info().vendor_id == INTEL_VENDOR_ID, "[GPU] Can't create onednn stream handle as for non-Intel devices"); if (!_onednn_stream) { +#ifdef OV_GPU_WITH_ZE_RT + OPENVINO_THROW("[GPU] Using OCL OneDNN API with L0 runtime"); +#else _onednn_stream = std::make_shared(dnnl::ocl_interop::make_stream(_engine.get_onednn_engine(), _command_queue.get())); +#endif } return *_onednn_stream; @@ -362,6 +367,10 @@ event::ptr ocl_stream::create_base_event() { return std::make_shared(ret_ev, ++_queue_counter); } +std::unique_ptr ocl_stream::create_surfaces_lock(const std::vector &mem) const { + return std::unique_ptr(new ocl::ocl_surfaces_lock(mem, *this)); +} + void ocl_stream::flush() const { try { get_cl_queue().flush(); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp index b9c51ccb046508..c86089bce08b46 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp @@ -47,6 +47,7 @@ class ocl_stream : public stream { void enqueue_barrier() override; event::ptr create_user_event(bool set) override; event::ptr create_base_event() override; + std::unique_ptr create_surfaces_lock(const std::vector &mem) const override; const cl::UsmHelper& get_usm_helper() const { return _engine.get_usm_helper(); } diff --git 
a/src/plugins/intel_gpu/src/runtime/stream.cpp b/src/plugins/intel_gpu/src/runtime/stream.cpp index 913d84d8f476f5..2bd8a74f857a4d 100644 --- a/src/plugins/intel_gpu/src/runtime/stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/stream.cpp @@ -4,21 +4,10 @@ #include "intel_gpu/runtime/stream.hpp" -#include "ocl/ocl_stream.hpp" - #include namespace cldnn { -QueueTypes stream::detect_queue_type(engine_types engine_type, void* queue_handle) { - switch (engine_type) { - case engine_types::sycl: - case engine_types::ocl: - return ocl::ocl_stream::detect_queue_type(queue_handle); - default: throw std::runtime_error("Invalid engine type"); - } -} - SyncMethods stream::get_expected_sync_method(const ExecutionConfig& config) { auto profiling = config.get_enable_profiling(); auto queue_type = config.get_queue_type(); diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp new file mode 100644 index 00000000000000..0f121e0b8e45b6 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/event.hpp" +#include "ze_base_event_factory.hpp" + +#include +#include +#include + +namespace cldnn { +namespace ze { + +// Base interface for Level Zero events +struct ze_base_event : public event { +public: + explicit ze_base_event(uint64_t queue_stamp) + : event() + , m_queue_stamp(queue_stamp) { } + uint64_t get_queue_stamp() const { return m_queue_stamp; } + void set_queue_stamp(uint64_t val) { m_queue_stamp = val; } + + virtual ze_event_handle_t get_handle() const = 0; + virtual std::optional query_timestamp() = 0; + +protected: + uint64_t m_queue_stamp = 0; + + static std::chrono::nanoseconds timestamp_to_duration(const device_info &info, const ze_kernel_timestamp_data_t& timestamp) { + constexpr double NS_IN_SEC = 1000000000.0; + const double 
timestamp_freq = NS_IN_SEC / info.timer_resolution; + const uint64_t timestamp_max_value = ~(-1L << info.kernel_timestamp_valid_bits); + + auto d = (timestamp.kernelEnd >= timestamp.kernelStart) ? + (timestamp.kernelEnd - timestamp.kernelStart) * timestamp_freq + : ((timestamp_max_value - timestamp.kernelStart) + timestamp.kernelEnd + 1) * timestamp_freq; + return std::chrono::nanoseconds(static_cast(d)); + } +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp new file mode 100644 index 00000000000000..8a3febc28f0393 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_base_event_factory.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_engine.hpp" +#include "intel_gpu/runtime/event.hpp" + +namespace cldnn { +namespace ze { + +// Interface for creating Level Zero events +struct ze_base_event_factory { +public: + ze_base_event_factory(const ze_engine &engine, bool enable_profiling) + : m_engine(engine), m_profiling_enabled(enable_profiling) {} + const ze_engine& get_engine() const { return m_engine; } + bool is_profiling_enabled() const { return m_profiling_enabled; } + + virtual ~ze_base_event_factory() {} + virtual event::ptr create_event(uint64_t queue_stamp) = 0; +protected: + const ze_engine& m_engine; + const bool m_profiling_enabled; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp new file mode 100644 index 00000000000000..d7456d74c7459f --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_common.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include "intel_gpu/runtime/debug_configuration.hpp" + +#include + +#include +#include + + +// 
Expect success of level zero command, throw runtime error otherwise +#define OV_ZE_EXPECT(f) \ + do { \ + ze_result_t res_ = (f); \ + if (res_ != ZE_RESULT_SUCCESS) { \ + throw std::runtime_error(#f " command failed with code " + std::to_string(res_)); \ + } \ + } while (false) + +// Prints warning if level zero command does not return success result +#define OV_ZE_WARN(f) \ + do { \ + ze_result_t res_ = (f); \ + if (res_ != ZE_RESULT_SUCCESS) { \ + GPU_DEBUG_COUT << ("[Warning] [GPU] " #f " command failed with code " + std::to_string(res_)); \ + } \ + } while (false) + +namespace cldnn { +namespace ze { + +static constexpr uint64_t endless_wait = std::numeric_limits::max(); +static constexpr ze_module_format_t ze_module_format_oclc = (ze_module_format_t) 3U; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp new file mode 100644 index 00000000000000..03e342d0f571ad --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_counter_based_event.hpp" +#include "ze/ze_common.hpp" + +#include +#include +#include + +using namespace cldnn; +using namespace ze; + +void ze_counter_based_event::wait_impl() { + OV_ZE_EXPECT(zeEventHostSynchronize(m_event, endless_wait)); +} + +void ze_counter_based_event::set_impl() { + // Counter based events start in signaled state and can not be signaled from host +} + +bool ze_counter_based_event::is_set_impl() { + auto ret = zeEventQueryStatus(m_event); + switch (ret) { + case ZE_RESULT_SUCCESS: + return true; + break; + case ZE_RESULT_NOT_READY: + return false; + break; + default: + OPENVINO_THROW("[GPU] Query event returned unexpected value: ", std::to_string(ret)); + break; + } +} + +ze_event_handle_t ze_counter_based_event::get_handle() const { + return 
m_event; +} + +std::optional ze_counter_based_event::query_timestamp() { + if (!m_factory.is_profiling_enabled()) { + return std::nullopt; + } + ze_kernel_timestamp_result_t timestamp{}; + OV_ZE_EXPECT(zeEventQueryKernelTimestamp(m_event, ×tamp)); + return timestamp; +} + +bool ze_counter_based_event::get_profiling_info_impl(std::list& info) { + auto opt_timestamp = query_timestamp(); + if (!opt_timestamp.has_value()) { + return true; + } + ze_kernel_timestamp_result_t timestamp = opt_timestamp.value(); + auto &dev_info = m_factory.get_engine().get_device_info(); + auto wallclock_time = timestamp_to_duration(dev_info, timestamp.global); + auto exec_time = timestamp_to_duration(dev_info, timestamp.context); + + auto period_exec = std::make_shared(timestamp_to_duration(dev_info, timestamp.context)); + auto period_submit = std::make_shared(wallclock_time - exec_time); + + info.push_back({ instrumentation::profiling_stage::executing, period_exec }); + info.push_back({ instrumentation::profiling_stage::submission, period_submit }); + + return true; +} + +ze_counter_based_event::~ze_counter_based_event() { + OV_ZE_WARN(zeEventDestroy(m_event)); +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.hpp new file mode 100644 index 00000000000000..8c8131853c0b69 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event.hpp" + +namespace cldnn { +namespace ze { + + +// L0 counter based event. +// Signaled state is inferred from the number of tasks completed on device. +// Resetting counter based event is not allowed. +// Start in signaled state and signaling from host is not allowed. +// Can only be used with in-order command lists. 
+struct ze_counter_based_event : public ze_base_event { +public: + // Take ownership of counter based event handle + ze_counter_based_event(uint64_t queue_stamp, const ze_base_event_factory& factory, ze_event_handle_t ev) + : ze_base_event(queue_stamp) + , m_factory(factory) + , m_event(ev) { + // Ensure event handle is not null + if (ev == nullptr) { + OPENVINO_THROW("[GPU] Trying to create event with null handle"); + } + } + ze_counter_based_event(const ze_counter_based_event&) = delete; + ze_counter_based_event& operator=(const ze_counter_based_event&) = delete; + ~ze_counter_based_event(); + void wait_impl() override; + void set_impl() override; + bool is_set_impl() override; + ze_event_handle_t get_handle() const override; + std::optional query_timestamp() override; + bool get_profiling_info_impl(std::list& info) override; + +protected: + const ze_base_event_factory& m_factory; + ze_event_handle_t m_event; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp new file mode 100644 index 00000000000000..09ad6800869af5 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.cpp @@ -0,0 +1,40 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_counter_based_event_factory.hpp" +#include "ze_common.hpp" +#include "ze_counter_based_event.hpp" + +#include "zex_event.h" + +using namespace cldnn; +using namespace ze; +namespace { + decltype(zexCounterBasedEventCreate2) *func_zexCounterBasedEventCreate2 = nullptr; + void find_function_address(ze_driver_handle_t driver) { + OV_ZE_EXPECT(zeDriverGetExtensionFunctionAddress(driver, + "zexCounterBasedEventCreate2", + reinterpret_cast(&func_zexCounterBasedEventCreate2))); + } +} + +ze_counter_based_event_factory::ze_counter_based_event_factory(const ze_engine &engine, bool enable_profiling) + : 
ze_base_event_factory(engine, enable_profiling) { + if (func_zexCounterBasedEventCreate2 == nullptr) { + find_function_address(engine.get_driver()); + } +} + +event::ptr ze_counter_based_event_factory::create_event(uint64_t queue_stamp) { + std::lock_guard lock(_mutex); + + ze_event_handle_t event; + auto desc = defaultIntelCounterBasedEventDesc; + if (is_profiling_enabled()) { + desc.flags |= ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP; + } + OV_ZE_EXPECT(func_zexCounterBasedEventCreate2(m_engine.get_context(), m_engine.get_device(), &desc, &event)); + auto cb_event = std::make_shared(queue_stamp, *this, event); + return cb_event; +} diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp new file mode 100644 index 00000000000000..6a1c93728d93dc --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_counter_based_event_factory.hpp @@ -0,0 +1,24 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event_factory.hpp" + +#include + +namespace cldnn { +namespace ze { + +// Interface for creating l0 counter based events +// Should only be used with in-order queue +struct ze_counter_based_event_factory : public ze_base_event_factory { +public: + ze_counter_based_event_factory(const ze_engine &engine, bool enable_profiling); + event::ptr create_event(uint64_t queue_stamp) override; +protected: + std::mutex _mutex; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp new file mode 100644 index 00000000000000..e6f965d6416929 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.cpp @@ -0,0 +1,373 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_device.hpp" +#include "ze_common.hpp" +#include "zex_common.h" + +#include 
+#include +#include +#include +#include + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +#ifdef ENABLE_ONEDNN_FOR_GPU +#include "gpu/intel/jit/generator.hpp" +#endif + +namespace cldnn { +namespace ze { + +namespace { +#ifdef ENABLE_ONEDNN_FOR_GPU +//TODO merge this with ocl_device +gpu_arch convert_ngen_arch(ngen::HW gpu_arch) { + switch (gpu_arch) { + case ngen::HW::Gen9: return gpu_arch::gen9; + case ngen::HW::Gen11: return gpu_arch::gen11; + case ngen::HW::XeLP: return gpu_arch::xe_lp; + case ngen::HW::XeHP: return gpu_arch::xe_hp; + case ngen::HW::XeHPG: return gpu_arch::xe_hpg; + case ngen::HW::XeHPC: return gpu_arch::xe_hpc; + case ngen::HW::Xe2: return gpu_arch::xe2; + case ngen::HW::Xe3: return gpu_arch::xe3; + case ngen::HW::Gen10: + case ngen::HW::Unknown: return gpu_arch::unknown; + } + return gpu_arch::unknown; +} +#endif + +gfx_version parse_version(uint32_t gmdid) { + union GMDID { + uint32_t value; + struct { + uint32_t revision : 6; + uint32_t reserved : 8; + uint32_t release : 8; + uint32_t architecture : 10; + }; + }; + + GMDID gmd_id = {gmdid}; + if (gmd_id.architecture > 0 && gmd_id.architecture < 100) { + // New format + return { static_cast(gmd_id.architecture), static_cast(gmd_id.release), static_cast(gmd_id.revision)}; + } else { + // Old format + uint32_t ver = gmdid; + uint16_t major = ver >> 16; + uint8_t minor = (ver >> 8) & 0xFF; + uint8_t revision = ver & 0xFF; + + return {major, minor, revision}; + } +} + +bool supports_extension(const std::vector& extensions, const std::string& ext_name, uint32_t ext_ver) { + return std::find_if(extensions.begin(), extensions.end(), [&ext_name, &ext_ver](const ze_driver_extension_properties_t& ep) { + return std::string(ep.name) == ext_name && ep.version == ext_ver; + }) != extensions.end(); +} + +device_info init_device_info(ze_driver_handle_t driver, 
ze_device_handle_t device) { + device_info info; + + uint32_t num_ext = 0; + OV_ZE_EXPECT(zeDriverGetExtensionProperties(driver, &num_ext, nullptr)); + + std::vector extensions(num_ext); + OV_ZE_EXPECT(zeDriverGetExtensionProperties(driver, &num_ext, &extensions[0])); + + ze_driver_properties_t driver_properties{ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES}; + OV_ZE_EXPECT(zeDriverGetProperties(driver, &driver_properties)); + + bool supports_luid = supports_extension(extensions, ZE_DEVICE_LUID_EXT_NAME, ZE_DEVICE_LUID_EXT_VERSION_1_0); + bool supports_ip_version = supports_extension(extensions, ZE_DEVICE_IP_VERSION_EXT_NAME, ZE_DEVICE_IP_VERSION_VERSION_1_0); + bool supports_mutable_list = supports_extension(extensions, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, ZE_MUTABLE_COMMAND_LIST_EXP_VERSION_1_0); + bool supports_pci_properties = supports_extension(extensions, ZE_PCI_PROPERTIES_EXT_NAME, ZE_PCI_PROPERTIES_EXT_VERSION_1_0); + bool supports_counter_based_events = supports_extension(extensions, ZEX_COUNTER_BASED_EVENT_EXT_NAME, ZEX_COUNTER_BASED_EVENT_VERSION_1_0); + bool supports_cp_offload = + supports_extension(extensions, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME, ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_1_0); + bool supports_dp_properties = + supports_extension(extensions, ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME, ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_1_0); + + void *device_properties_next = nullptr; + ze_device_ip_version_ext_t ip_version_properties = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, device_properties_next, 0}; + if (supports_ip_version) { + device_properties_next = &ip_version_properties; + } + ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2, device_properties_next}; + OV_ZE_EXPECT(zeDeviceGetProperties(device, &device_properties)); + + ze_device_compute_properties_t device_compute_properties{ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES}; + OV_ZE_EXPECT(zeDeviceGetComputeProperties(device, 
&device_compute_properties)); + + uint32_t queue_properties_count = 0; + OV_ZE_EXPECT(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, nullptr)); + + std::vector queue_properties(queue_properties_count); + for (auto& mp : queue_properties) { + mp.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES; + } + + OV_ZE_EXPECT(zeDeviceGetCommandQueueGroupProperties(device, &queue_properties_count, &queue_properties[0])); + + auto compute_queue_props = std::find_if(queue_properties.begin(), queue_properties.end(), [](const ze_command_queue_group_properties_t& qp) { + return (qp.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0; + }); + + OPENVINO_ASSERT(compute_queue_props != queue_properties.end()); + + uint32_t memory_properties_count = 0; + OV_ZE_EXPECT(zeDeviceGetMemoryProperties(device, &memory_properties_count, nullptr)); + + std::vector device_memory_properties(memory_properties_count); + for (auto& mp : device_memory_properties) { + mp.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES; + } + OV_ZE_EXPECT(zeDeviceGetMemoryProperties(device, &memory_properties_count, &device_memory_properties[0])); + + ze_device_memory_access_properties_t device_memory_access_properties{ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES}; + OV_ZE_EXPECT(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties)); + + auto mem_properties = std::find_if(device_memory_properties.begin(), device_memory_properties.end(), [](const ze_device_memory_properties_t& p) { + auto name = std::string(p.name); + return name == "DDR" || name == "HBM"; + }); + + ze_device_module_properties_t device_module_properties{ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES}; + ze_intel_device_module_dp_exp_properties_t dp_properties{ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES, nullptr}; + if (supports_dp_properties) { + device_module_properties.pNext = &dp_properties; + } + OV_ZE_EXPECT(zeDeviceGetModuleProperties(device, &device_module_properties)); + + 
ze_device_image_properties_t device_image_properties{ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES}; + OV_ZE_EXPECT(zeDeviceGetImageProperties(device, &device_image_properties)); + + info.vendor_id = device_properties.vendorId; + info.dev_name = device_properties.name; + // L0 returns drivers version in different format than OCL + info.driver_version = std::to_string(driver_properties.driverVersion); + info.dev_type = (device_properties.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) ? device_type::integrated_gpu : device_type::discrete_gpu; + + info.execution_units_count = device_properties.numEUsPerSubslice * device_properties.numSubslicesPerSlice * device_properties.numSlices; + + info.gpu_frequency = device_properties.coreClockRate; + + info.supported_simd_sizes.resize(device_compute_properties.numSubGroupSizes); + std::copy_n(device_compute_properties.subGroupSizes, device_compute_properties.numSubGroupSizes, info.supported_simd_sizes.begin()); + info.has_separate_cache = true; + + info.max_work_group_size = device_compute_properties.maxTotalGroupSize; + info.max_local_mem_size = device_compute_properties.maxSharedLocalMemory; + uint32_t cache_properties_count = 0; + OV_ZE_EXPECT(zeDeviceGetCacheProperties(device, &cache_properties_count, nullptr)); + info.max_global_cache_size = 0; + if (cache_properties_count > 0) { + std::vector cache_properties(cache_properties_count); + OV_ZE_EXPECT(zeDeviceGetCacheProperties(device, &cache_properties_count, cache_properties.data())); + // Assume first property is L3 cache + info.max_global_cache_size = cache_properties[0].cacheSize; + } + + if (mem_properties != device_memory_properties.end()) { + info.max_global_mem_size = mem_properties->totalSize; + info.device_memory_ordinal = std::distance(device_memory_properties.begin(), mem_properties); + } else { + info.max_global_mem_size = 0; + info.device_memory_ordinal = 0; + } + + info.max_alloc_mem_size = device_properties.maxMemAllocSize; + + info.supports_image = 
device_image_properties.maxSamplers > 0; + info.max_image2d_width = device_image_properties.maxImageDims2D; + info.max_image2d_height = device_image_properties.maxImageDims2D; + + info.supports_fp16 = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_FP16) != 0; + info.supports_fp64 = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_FP64) != 0; + info.supports_fp16_denorms = info.supports_fp16 && (device_module_properties.fp16flags & ZE_DEVICE_FP_FLAG_DENORM) != 0; + + info.supports_cp_offload = supports_cp_offload; + info.supports_counter_based_events = supports_counter_based_events; + + info.supports_imad = (device_module_properties.flags & ZE_DEVICE_MODULE_FLAG_DP4A) != 0; + info.supports_immad = supports_dp_properties && (dp_properties.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS) != 0; + + info.supports_usm = device_memory_access_properties.hostAllocCapabilities && device_memory_access_properties.deviceAllocCapabilities; + + // FIXME: Could not find how to retrieve those from L0 + info.supports_work_group_collective_functions = false; + info.supports_intel_planar_yuv = false; + info.supports_khr_subgroups = true; + info.supports_intel_subgroups = true; + info.supports_intel_subgroups_short = true; + info.supports_intel_subgroups_char = true; + info.supports_intel_required_subgroup_size = true; + info.supports_queue_families = true; + + if (supports_ip_version) { + info.ip_version = ip_version_properties.ipVersion; + info.gfx_ver = parse_version(ip_version_properties.ipVersion); + } + info.sub_device_idx = (std::numeric_limits::max)(); + + info.device_id = device_properties.deviceId; + info.num_slices = device_properties.numSlices; + info.num_sub_slices_per_slice = device_properties.numSubslicesPerSlice; + info.num_eus_per_sub_slice = device_properties.numEUsPerSubslice; + info.num_threads_per_eu = device_properties.numThreadsPerEU; + + info.num_ccs = compute_queue_props->numQueues; + + info.kernel_timestamp_valid_bits = 
device_properties.kernelTimestampValidBits; + info.timer_resolution = device_properties.timerResolution; + info.compute_queue_group_ordinal = std::distance(queue_properties.begin(), compute_queue_props); + + static_assert(ZE_MAX_DEVICE_UUID_SIZE == ov::device::UUID::MAX_UUID_SIZE, ""); + static_assert(ZE_MAX_DEVICE_LUID_SIZE_EXT == ov::device::LUID::MAX_LUID_SIZE, ""); + std::copy_n(&device_properties.uuid.id[0], ZE_MAX_DEVICE_UUID_SIZE, info.uuid.uuid.begin()); + + if (supports_luid) { + ze_device_luid_ext_properties_t luid_props{ZE_STRUCTURE_TYPE_DEVICE_LUID_EXT_PROPERTIES, nullptr}; + ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, &luid_props}; + if (zeDeviceGetProperties(device, &device_properties) == ZE_RESULT_SUCCESS) + std::copy_n(&luid_props.luid.id[0], ZE_MAX_DEVICE_LUID_SIZE_EXT, info.luid.luid.begin()); + } + + info.supports_mutable_command_list = false; + if (supports_mutable_list) { + ze_mutable_command_list_exp_properties_t mutable_list_props = { ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_LIST_EXP_PROPERTIES, nullptr, 0, 0 }; + ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, &mutable_list_props}; + if (zeDeviceGetProperties(device, &device_properties) == ZE_RESULT_SUCCESS) { + ze_mutable_command_exp_flags_t required_features = ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION | + ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS | + ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT | + ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE | + ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET | + ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT | + ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS; + + info.supports_mutable_command_list = (mutable_list_props.mutableCommandFlags & required_features) == required_features; + } + } + if (supports_pci_properties) { + ze_pci_ext_properties_t pci_properties{ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES, nullptr}; + if (zeDevicePciGetPropertiesExt(device, &pci_properties) == ZE_RESULT_SUCCESS) { + info.pci_info.pci_bus = 
pci_properties.address.bus; + info.pci_info.pci_device = pci_properties.address.device; + info.pci_info.pci_domain = pci_properties.address.domain; + info.pci_info.pci_function = pci_properties.address.function; + } + } + +#ifdef ENABLE_ONEDNN_FOR_GPU + using namespace dnnl::impl::gpu::intel::jit; + // Create temporary context just for OneDNN HW detection + ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 }; + ze_context_handle_t context; + OV_ZE_EXPECT(zeContextCreate(driver, &context_desc, &context)); + ngen::Product product = ngen::LevelZeroCodeGenerator::detectHWInfo(context, device); + zeContextDestroy(context); + info.arch = convert_ngen_arch(ngen::getCore(product.family)); + + if (product.family == ngen::ProductFamily::Unknown) { + info.supports_immad = false; + } +#else // ENABLE_ONEDNN_FOR_GPU + info.arch = gpu_arch::unknown; +#endif // ENABLE_ONEDNN_FOR_GPU + + return info; +} + +memory_capabilities init_memory_caps(ze_device_handle_t device, const device_info& info) { + std::vector memory_caps; + + ze_device_memory_access_properties_t device_memory_access_properties{ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES}; + OV_ZE_EXPECT(zeDeviceGetMemoryAccessProperties(device, &device_memory_access_properties)); + + if (info.supports_usm) { + if (device_memory_access_properties.hostAllocCapabilities) { + memory_caps.push_back(allocation_type::usm_host); + } + if (device_memory_access_properties.sharedSingleDeviceAllocCapabilities) { + memory_caps.push_back(allocation_type::usm_shared); + } + if (device_memory_access_properties.deviceAllocCapabilities) { + memory_caps.push_back(allocation_type::usm_device); + } + } + + return memory_capabilities(memory_caps); +} + +} // namespace + + +ze_device::ze_device(ze_driver_handle_t driver, ze_device_handle_t device, bool initialize) +: _driver(driver) +, _device(device) +, _info(init_device_info(driver, device)) +, _mem_caps(init_memory_caps(device, _info)) { + if (initialize) { + 
this->initialize(); + } +} + +void ze_device::initialize() { + if (_is_initialized) + return; + + ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 }; + OV_ZE_EXPECT(zeContextCreate(_driver, &context_desc, &_context)); + _is_initialized = true; +} + +bool ze_device::is_initialized() const { + return _is_initialized; +} + +bool ze_device::is_same(const device::ptr other) { + auto casted = downcast(other.get()); + if (!casted) + return false; + + if (is_initialized() && casted->is_initialized()) { + // Do not compare contexts as one driver can have many different contexts + return _device == casted->get_device() && _driver == casted->get_driver(); + } + return _info.is_same_device(casted->_info); +} + +void ze_device::set_mem_caps(const memory_capabilities& memory_capabilities) { + _mem_caps = memory_capabilities; +} + +ze_device::~ze_device() { + //FIXME segfault + //if (_is_initialized) + // zeContextDestroy(_context); +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp new file mode 100644 index 00000000000000..1a20685ed2cc77 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/device.hpp" +#include + +namespace cldnn { +namespace ze { + +struct ze_device : public device { +public: + ze_device(ze_driver_handle_t driver, ze_device_handle_t device, bool initialize = true); + + const device_info& get_info() const override { return _info; } + memory_capabilities get_mem_caps() const override { return _mem_caps; } + + void initialize() override; + bool is_initialized() const override; + + const ze_driver_handle_t get_driver() const { return _driver; } + const ze_device_handle_t get_device() const { return _device; } + const ze_context_handle_t get_context() const { 
return _context; } + + bool is_same(const device::ptr other) override; + void set_mem_caps(const memory_capabilities& memory_capabilities) override; + + ~ze_device(); + +private: + ze_driver_handle_t _driver = nullptr; + ze_device_handle_t _device = nullptr; + ze_context_handle_t _context = nullptr; + bool _is_initialized = false; + + device_info _info; + memory_capabilities _mem_caps; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp new file mode 100644 index 00000000000000..a40f4a7b690185 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.cpp @@ -0,0 +1,118 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_device_detector.hpp" +#include "ze_device.hpp" +#include "ze_common.hpp" +#include +#include "intel_gpu/runtime/debug_configuration.hpp" +#include "openvino/core/except.hpp" + +#include + +namespace cldnn { +namespace ze { + +static std::vector get_sub_devices(ze_device_handle_t root_device) { + uint32_t n_subdevices = 0; + OV_ZE_EXPECT(zeDeviceGetSubDevices(root_device, &n_subdevices, nullptr)); + if (n_subdevices == 0) + return {}; + + std::vector subdevices(n_subdevices); + + OV_ZE_EXPECT(zeDeviceGetSubDevices(root_device, &n_subdevices, &subdevices[0])); + + return subdevices; +} + +std::map ze_device_detector::get_available_devices(void* user_context, + void* user_device, + int ctx_device_id, + int target_tile_id, + bool initialize_devices) const { + std::vector devices_list; + if (user_context != nullptr) { + devices_list = create_device_list_from_user_context(user_context, ctx_device_id); + } else if (user_device != nullptr) { + devices_list = create_device_list_from_user_device(user_device); + } else { + devices_list = create_device_list(initialize_devices); + } + + devices_list = sort_devices(devices_list); + + std::map ret; + uint32_t idx = 0; 
+ for (auto& dptr : devices_list) { + auto map_id = std::to_string(idx++); + ret[map_id] = dptr; + + auto root_device = std::dynamic_pointer_cast(dptr); + OPENVINO_ASSERT(root_device != nullptr, "[GPU] Invalid device type created in ocl_device_detector"); + + auto sub_devices = get_sub_devices(root_device->get_device()); + if (!sub_devices.empty()) { + uint32_t sub_idx = 0; + for (auto& sub_device : sub_devices) { + if (target_tile_id != -1 && static_cast(sub_idx) != target_tile_id) { + sub_idx++; + continue; + } + auto sub_device_ptr = std::make_shared(root_device->get_driver(), sub_device, initialize_devices); + ret[map_id + "." + std::to_string(sub_idx++)] = sub_device_ptr; + } + } + } + + return ret; +} + +std::vector ze_device_detector::create_device_list(bool initialize_devices) const { + std::vector ret; + + OV_ZE_EXPECT(zeInit(ZE_INIT_FLAG_GPU_ONLY)); + + uint32_t driver_count = 0; + OV_ZE_EXPECT(zeDriverGet(&driver_count, nullptr)); + + std::vector all_drivers(driver_count); + OV_ZE_EXPECT(zeDriverGet(&driver_count, &all_drivers[0])); + + for (uint32_t i = 0; i < driver_count; ++i) { + uint32_t device_count = 0; + OV_ZE_EXPECT(zeDeviceGet(all_drivers[i], &device_count, nullptr)); + + std::vector all_devices(device_count); + OV_ZE_EXPECT(zeDeviceGet(all_drivers[i], &device_count, &all_devices[0])); + + for (uint32_t d = 0; d < device_count; ++d) { + try { + ze_device_properties_t device_properties{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; + OV_ZE_EXPECT(zeDeviceGetProperties(all_devices[d], &device_properties)); + + if (ZE_DEVICE_TYPE_GPU == device_properties.type) { + ret.emplace_back(std::make_shared(all_drivers[i], all_devices[d], initialize_devices)); + } + } catch (std::exception& ex) { + GPU_DEBUG_LOG << "Devices query/creation failed for driver " << i << ex.what() << std::endl; + GPU_DEBUG_LOG << "Platform is skipped" << std::endl; + continue; + } + } + } + + return ret; +} + +std::vector ze_device_detector::create_device_list_from_user_context(void* 
user_context, int ctx_device_id) const { + OPENVINO_NOT_IMPLEMENTED; +} + +std::vector ze_device_detector::create_device_list_from_user_device(void* user_device) const { + OPENVINO_NOT_IMPLEMENTED; +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp new file mode 100644 index 00000000000000..deeefb36234a59 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_device_detector.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/device.hpp" + +#include +#include +#include + +namespace cldnn { +namespace ze { + +class ze_device_detector { +public: + ze_device_detector() = default; + + std::map get_available_devices(void* user_context, + void* user_device, + int ctx_device_id, + int target_tile_id, + bool initialize_devices = false) const; +private: + std::vector create_device_list(bool initialize_devices) const; + std::vector create_device_list_from_user_context(void* user_context, int ctx_device_id = 0) const; + std::vector create_device_list_from_user_device(void* user_device) const; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp new file mode 100644 index 00000000000000..0d16bfeb9d2280 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_empty_event.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event.hpp" + +namespace cldnn { +namespace ze { + + +// Event that does not have underlying Level Zero event object. +// It is always in signaled state. 
+struct ze_empty_event : public ze_base_event { +public: + ze_empty_event(uint64_t queue_stamp) + : ze_base_event(queue_stamp) { } + + void wait_impl() override { } + void set_impl() override { } + bool is_set_impl() override { return true; } + ze_event_handle_t get_handle() const override { return nullptr; } + std::optional query_timestamp() override { return std::nullopt; } + bool get_profiling_info_impl(std::list& info) override { + return true; + } +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp new file mode 100644 index 00000000000000..5168e0364c4bd8 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -0,0 +1,180 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_engine.hpp" +#include "intel_gpu/runtime/utils.hpp" +#include "openvino/core/except.hpp" +#include "ze_kernel_builder.hpp" +#include "ze_api.h" +#include "ze_engine_factory.hpp" +#include "ze_common.hpp" +#include "ze_memory.hpp" +#include "ze_stream.hpp" +#include "ze_device.hpp" +#include "ze_kernel.hpp" +#include "ze_module_holder.hpp" +#include "ze_kernel_holder.hpp" +#include +#include +#include +#include + +#ifdef ENABLE_ONEDNN_FOR_GPU +#include +#endif +namespace cldnn { +namespace ze { + +ze_engine::ze_engine(const device::ptr dev, runtime_types runtime_type) + : engine(dev) { + OPENVINO_ASSERT(runtime_type == runtime_types::ze, "[GPU] Invalid runtime type specified for ZE engine. 
Only ZE runtime is supported"); + + auto casted = dynamic_cast(dev.get()); + OPENVINO_ASSERT(casted, "[GPU] Invalid device type passed to ze engine"); + + _service_stream.reset(new ze_stream(*this, ExecutionConfig())); +} + +#ifdef ENABLE_ONEDNN_FOR_GPU +void ze_engine::create_onednn_engine(const ExecutionConfig& config) { + const std::lock_guard lock(onednn_mutex); + OPENVINO_ASSERT(_device->get_info().vendor_id == INTEL_VENDOR_ID, "[GPU] OneDNN engine can be used for Intel GPUs only"); + if (!_onednn_engine) { + auto casted = std::dynamic_pointer_cast(_device); + _onednn_engine = std::make_shared(dnnl::l0_interop::make_engine(casted->get_driver(), casted->get_device(), casted->get_context())); + } +} +#endif + +const ze_driver_handle_t ze_engine::get_driver() const { + auto casted = std::dynamic_pointer_cast(_device); + OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine"); + return casted->get_driver(); +} + +const ze_context_handle_t ze_engine::get_context() const { + auto casted = std::dynamic_pointer_cast(_device); + OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine"); + return casted->get_context(); +} + +const ze_device_handle_t ze_engine::get_device() const { + auto casted = std::dynamic_pointer_cast(_device); + OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine"); + return casted->get_device(); +} + +allocation_type ze_engine::detect_usm_allocation_type(const void* memory) const { + return ze::gpu_usm::detect_allocation_type(this, memory); +} + +memory::ptr ze_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) { + OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout"); + + check_allocatable(layout, type); + + try { + memory::ptr res = std::make_shared(this, layout, type); + + if (reset || res->is_memory_reset_needed(layout)) { + auto ev = res->fill(get_service_stream()); + if (ev) { + 
get_service_stream().wait_for_events({ev}); + } + } + + return res; + } catch (const std::exception& e) { + OPENVINO_THROW("[GPU] Failed to allocate memory: ", e.what()); + } +} + +memory::ptr ze_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) { + OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine"); + OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(), + "[GPU] trying to reinterpret between image and non-image layouts. Current: ", + memory.get_layout().format.to_string(), " Target: ", new_layout.format.to_string()); + + if (memory_capabilities::is_usm_type(memory.get_allocation_type())) { + return std::make_shared(this, + new_layout, + reinterpret_cast(memory).get_buffer(), + memory.get_allocation_type(), + memory.get_mem_tracker()); + } + + OPENVINO_THROW("[GPU] Trying to reinterpret non usm buffer"); +} + +memory::ptr ze_engine::reinterpret_handle(const layout& new_layout, shared_mem_params params) { + if (params.mem_type == shared_mem_type::shared_mem_usm) { + ze::UsmMemory usm_buffer(get_context(), get_device(), params.mem); + size_t actual_mem_size = 0; + zeMemGetAddressRange(get_context(), params.mem, nullptr, &actual_mem_size); + auto requested_mem_size = new_layout.bytes_count(); + OPENVINO_ASSERT(actual_mem_size >= requested_mem_size, + "[GPU] shared USM buffer has smaller size (", actual_mem_size, + ") than specified layout (", requested_mem_size, ")"); + return std::make_shared(this, new_layout, usm_buffer, nullptr); + } else { + OPENVINO_THROW("[GPU] Unsupported shared memory type: ", params.mem_type); + } +} + +memory_ptr ze_engine::create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) { + OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] Trying to create a subbuffer from a buffer allocated by a different engine"); + if (new_layout.format.is_image_2d()) { + OPENVINO_NOT_IMPLEMENTED; + } + 
OPENVINO_ASSERT(memory_capabilities::is_usm_type(memory.get_allocation_type()), "[GPU] Trying to create subbuffer for non usm memory"); + auto& new_buf = reinterpret_cast(memory); + auto ptr = new_buf.get_buffer().get(); + auto sub_buffer = ze::UsmMemory(get_context(), get_device(), ptr, byte_offset); + return std::make_shared(this, + new_layout, + sub_buffer, + memory.get_allocation_type(), + memory.get_mem_tracker()); +} + +bool ze_engine::is_the_same_buffer(const memory& mem1, const memory& mem2) { + if (mem1.get_engine() != this || mem2.get_engine() != this) + return false; + if (mem1.get_allocation_type() != mem2.get_allocation_type()) + return false; + if (&mem1 == &mem2) + return true; + + return (reinterpret_cast(mem1).get_buffer().get() == reinterpret_cast(mem2).get_buffer().get()); +} + +std::shared_ptr ze_engine::create_kernel_builder() const { + auto casted = std::dynamic_pointer_cast(_device); + OPENVINO_ASSERT(casted, "[GPU] Invalid device type for ze_engine"); + return std::make_shared(*casted); +} + +void* ze_engine::get_user_context() const { + auto& casted = downcast(*_device); + return static_cast(casted.get_context()); +} + +stream::ptr ze_engine::create_stream(const ExecutionConfig& config) const { + return std::make_shared(*this, config); +} + +stream::ptr ze_engine::create_stream(const ExecutionConfig& config, void* handle) const { + OPENVINO_NOT_IMPLEMENTED; +} + +std::shared_ptr ze_engine::create(const device::ptr device, runtime_types runtime_type) { + return std::make_shared(device, runtime_type); +} + +std::shared_ptr create_ze_engine(const device::ptr device, runtime_types runtime_type) { + return ze_engine::create(device, runtime_type); +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp new file mode 100644 index 00000000000000..24d0736b3fbbfc --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp @@ -0,0 
+1,52 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "intel_gpu/runtime/memory.hpp" +#include "intel_gpu/runtime/engine.hpp" +#include "intel_gpu/runtime/stream.hpp" +#include "intel_gpu/runtime/device.hpp" + +#include + +namespace cldnn { +namespace ze { + +class ze_engine : public engine { +public: + ze_engine(const device::ptr dev, runtime_types runtime_type); + engine_types type() const override { return engine_types::ze; }; + runtime_types runtime_type() const override { return runtime_types::ze; }; + + memory_ptr allocate_memory(const layout& layout, allocation_type type, bool reset = true) override; + memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override; + memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) override; + memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override; + bool is_the_same_buffer(const memory& mem1, const memory& mem2) override; + + void* get_user_context() const override; + + allocation_type get_default_allocation_type() const override { return allocation_type::usm_device; } + allocation_type detect_usm_allocation_type(const void* memory) const override; + + const ze_context_handle_t get_context() const; + const ze_driver_handle_t get_driver() const; + const ze_device_handle_t get_device() const; + + stream_ptr create_stream(const ExecutionConfig& config) const override; + stream_ptr create_stream(const ExecutionConfig& config, void *handle) const override; + + std::shared_ptr create_kernel_builder() const override; + +#ifdef ENABLE_ONEDNN_FOR_GPU + void create_onednn_engine(const ExecutionConfig& config) override; +#endif + + static std::shared_ptr create(const device::ptr device, runtime_types runtime_type); +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp 
b/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp new file mode 100644 index 00000000000000..abd2946a2d8e56 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine_factory.hpp @@ -0,0 +1,18 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/device.hpp" +#include "intel_gpu/runtime/engine.hpp" + +namespace cldnn { +namespace ze { + +// Factory for ze_engine creation. It's moved outside of ze_engine class to avoid possible L0 includes conflict +// between different engines in engine.cpp file +std::shared_ptr create_ze_engine(const device::ptr device, runtime_types runtime_type); + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp new file mode 100644 index 00000000000000..6fb52b9cac2d30 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_event.hpp" +#include "ze/ze_common.hpp" + +#include +#include +#include + +using namespace cldnn; +using namespace ze; + +void ze_event::reset() { + event::reset(); + OV_ZE_EXPECT(zeEventHostReset(m_event)); +} + +void ze_event::wait_impl() { + OV_ZE_EXPECT(zeEventHostSynchronize(m_event, endless_wait)); +} + +void ze_event::set_impl() { + OV_ZE_EXPECT(zeEventHostSignal(m_event)); +} + +bool ze_event::is_set_impl() { + auto ret = zeEventQueryStatus(m_event); + switch (ret) { + case ZE_RESULT_SUCCESS: + return true; + break; + case ZE_RESULT_NOT_READY: + return false; + break; + default: + OPENVINO_THROW("[GPU] Query event returned unexpected value: ", std::to_string(ret)); + break; + } +} + +std::optional ze_event::query_timestamp() { + if (!m_factory.is_profiling_enabled()) { + return std::nullopt; + } + ze_kernel_timestamp_result_t timestamp{}; + 
OV_ZE_EXPECT(zeEventQueryKernelTimestamp(m_event, &timestamp)); + return timestamp; +} + +ze_event_handle_t ze_event::get_handle() const { + return m_event; +} + +bool ze_event::get_profiling_info_impl(std::list& info) { + auto opt_timestamp = query_timestamp(); + if (!opt_timestamp.has_value()) { + return true; + } + ze_kernel_timestamp_result_t timestamp = opt_timestamp.value(); + auto &dev_info = m_factory.get_engine().get_device_info(); + auto wallclock_time = timestamp_to_duration(dev_info, timestamp.global); + auto exec_time = timestamp_to_duration(dev_info, timestamp.context); + + auto period_exec = std::make_shared(timestamp_to_duration(dev_info, timestamp.context)); + auto period_submit = std::make_shared(wallclock_time - exec_time); + + info.push_back({ instrumentation::profiling_stage::executing, period_exec }); + info.push_back({ instrumentation::profiling_stage::submission, period_submit }); + + return true; +} + +ze_event::~ze_event() { + OV_ZE_WARN(zeEventDestroy(m_event)); +} diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp new file mode 100644 index 00000000000000..676208490f92d1 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event.hpp" +#include "ze_event_pool.hpp" + +namespace cldnn { +namespace ze { + +// L0 event. Can be either in signaled state or not signaled state. 
+struct ze_event : public ze_base_event { +public: + // Take ownership of event handle + ze_event(uint64_t queue_stamp, const ze_base_event_factory& factory, ze_event_handle_t ev, std::shared_ptr event_pool) + : ze_base_event(queue_stamp) + , m_event_pool(event_pool) + , m_factory(factory) + , m_event(ev) { + // Ensure event handle is not null + OPENVINO_ASSERT(ev != nullptr, "[GPU] Trying to create event with null handle"); + } + ze_event(const ze_event &) = delete; + ze_event& operator=(const ze_event &) = delete; + ~ze_event(); + void reset() override; + + std::optional query_timestamp() override; + ze_event_handle_t get_handle() const override; + bool get_profiling_info_impl(std::list& info) override; + +protected: + void wait_impl() override; + void set_impl() override; + bool is_set_impl() override; + // TODO: Implement add_event_handler_impl + // bool add_event_handler_impl(event_handler, void*) override; + + std::shared_ptr m_event_pool; + const ze_base_event_factory& m_factory; + ze_event_handle_t m_event; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp new file mode 100644 index 00000000000000..28bfd4a5b980d6 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_event_factory.hpp" +#include "ze_common.hpp" +#include "ze_event.hpp" + +#include "zex_event.h" + +using namespace cldnn; +using namespace ze; + +ze_event_factory::ze_event_factory(const ze_engine &engine, bool enable_profiling, uint32_t capacity) +: ze_base_event_factory(engine, enable_profiling) +, m_current_pool(nullptr) +, m_capacity(capacity) +, m_num_used(0) { } + +event::ptr ze_event_factory::create_event(uint64_t queue_stamp) { + std::lock_guard lock(_mutex); + + if (m_num_used >= m_capacity || !m_current_pool) { + 
m_num_used = 0; + ze_event_pool_flags_t flags = is_profiling_enabled() ? ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP : 0; + flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + m_current_pool = std::make_shared(m_engine, m_capacity, flags); + } + + ze_event_handle_t event; + ze_event_desc_t event_desc = { + ZE_STRUCTURE_TYPE_EVENT_DESC, + nullptr, + m_num_used++, + ZE_EVENT_SCOPE_FLAG_HOST, + 0 + }; + OV_ZE_EXPECT(zeEventCreate(m_current_pool->m_handle, &event_desc, &event)); + + return std::make_shared(queue_stamp, *this, event, m_current_pool); +} diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp new file mode 100644 index 00000000000000..d54aff8ecce344 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_factory.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event_factory.hpp" +#include "ze_event_pool.hpp" + +#include <mutex> + +namespace cldnn { +namespace ze { + +// Interface for creating l0 events using event pools +struct ze_event_factory : public ze_base_event_factory { +public: + ze_event_factory(const ze_engine &engine, bool enable_profiling, uint32_t capacity = 255); + event::ptr create_event(uint64_t queue_stamp) override; +protected: + std::mutex _mutex; + std::shared_ptr m_current_pool; + const uint32_t m_capacity; + uint32_t m_num_used; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp new file mode 100644 index 00000000000000..74e040c6831dda --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_event_pool.hpp" +#include "ze_event.hpp" +#include "ze_common.hpp" + +namespace cldnn { +namespace ze { + 
+ze_event_pool::ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_event_pool_flags_t flags) + : m_engine(engine) { + ze_event_pool_desc_t event_pool_desc = { + ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + nullptr, + flags, + capacity + }; + auto device = engine.get_device(); + OV_ZE_EXPECT(zeEventPoolCreate(engine.get_context(), &event_pool_desc, 1, &device, &m_handle)); +} + +ze_event_pool::~ze_event_pool() { + OV_ZE_WARN(zeEventPoolDestroy(m_handle)); +} +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp new file mode 100644 index 00000000000000..98d5f485b645cb --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_event_pool.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_engine.hpp" + +namespace cldnn { +namespace ze { + +// RAII wrapper for Level Zero event pool +struct ze_event_pool { + ze_event_pool(const ze_engine& engine, uint32_t capacity, ze_event_pool_flags_t flags); + ~ze_event_pool(); + ze_event_pool(const ze_event_pool&) = delete; + ze_event_pool& operator=(const ze_event_pool&) = delete; + + using ptr = std::shared_ptr; + + ze_event_pool_handle_t m_handle; + const ze_engine& m_engine; +}; + +} // namespace ze +} // namespace cldnn + diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp new file mode 100644 index 00000000000000..f71741c19f6c6c --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.cpp @@ -0,0 +1,155 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_events.hpp" +#include "ze_common.hpp" + +#include +#include +#include + +using namespace cldnn; +using namespace ze; + +void ze_events::wait_impl() { + if (_last_ze_event) { + OV_ZE_EXPECT(zeEventHostSynchronize(_last_ze_event, endless_wait)); + } +} + +void 
ze_events::set_impl() { + // Call wait_impl to be in line with ocl_events + wait_impl(); +} + +bool ze_events::is_set_impl() { + if (!_last_ze_event) { + return true; + } + + auto ret = zeEventQueryStatus(_last_ze_event); + switch (ret) { + case ZE_RESULT_SUCCESS: + return true; + break; + case ZE_RESULT_NOT_READY: + return false; + break; + default: + OPENVINO_THROW("[GPU] Query event returned unexpected value: ", std::to_string(ret)); + break; + } +} + +bool ze_events::get_profiling_info_impl(std::list& info) { + // The goal is to sum up all disjoint durations of its projection on the time axis + std::vector all_global_timestamps; + std::vector all_context_timestamps; + + auto add_or_merge = [](std::vector& all_timestamps, const ze_kernel_timestamp_data_t& ts) { + auto it = all_timestamps.begin(); + bool merged = false; + auto target_timestamp = ts; + while (it != all_timestamps.end()) { + auto& timestamp = *it; + bool disjoint = timestamp.kernelEnd < target_timestamp.kernelStart || timestamp.kernelStart > target_timestamp.kernelEnd; + bool equal = timestamp.kernelEnd == target_timestamp.kernelEnd && timestamp.kernelStart == target_timestamp.kernelStart; + if (!disjoint) { + if (equal) { + if (!merged) { + merged = true; + break; + } else { + it = all_timestamps.erase(it); + } + } else { + if (!merged) { + timestamp.kernelStart = std::min(timestamp.kernelStart, target_timestamp.kernelStart); + timestamp.kernelEnd = std::max(timestamp.kernelEnd, target_timestamp.kernelEnd); + target_timestamp = timestamp; + merged = true; + it++; + } else { + if (timestamp.kernelEnd > target_timestamp.kernelEnd) { + it--; + it->kernelEnd = target_timestamp.kernelEnd; + it++; + } + it = all_timestamps.erase(it); + } + } + } else { + it++; + } + } + + if (!merged) { + all_timestamps.push_back(target_timestamp); + } + }; + + if (_events.empty()) + return false; + + auto device_info = _engine.get_device_info(); + + auto get_total_exec_time = [&device_info](std::vector& all_timestamps) 
{ + std::chrono::nanoseconds total_time{0}; + for (const auto& ts : all_timestamps) { + total_time += timestamp_to_duration(device_info, ts); + } + + return total_time; + }; + + // Submission time is calculated as difference between merged context and wallclock intervals + // May probably be more accurate if we sum all sub-intervals of wallclock timestamps not covered by execution intervals + using intervals_t = std::vector; + auto get_submission_time = [&device_info](const intervals_t& s_timestamps, + const intervals_t& e_timestamps) { + auto get_minmax = [](const intervals_t& timestamps) { + uint64_t min_val = std::min_element(timestamps.begin(), timestamps.end(), + [](const ze_kernel_timestamp_data_t& lhs, const ze_kernel_timestamp_data_t& rhs) { + return lhs.kernelStart < rhs.kernelStart; + })->kernelStart; + uint64_t max_val = std::max_element(timestamps.begin(), timestamps.end(), + [](const ze_kernel_timestamp_data_t& lhs, const ze_kernel_timestamp_data_t& rhs) { + return lhs.kernelEnd < rhs.kernelEnd; + })->kernelEnd; + + return ze_kernel_timestamp_data_t{min_val, max_val}; + }; + + auto submission_interval = get_minmax(s_timestamps); + auto exec_interval = get_minmax(e_timestamps); + + auto wallclock_time = timestamp_to_duration(device_info, submission_interval); + auto exec_time = timestamp_to_duration(device_info, exec_interval); + + return wallclock_time - exec_time; + }; + + for (size_t i = 0; i < _events.size(); i++) { + auto be = downcast(_events[i].get()); + auto opt_timestamp = be->query_timestamp(); + if (!opt_timestamp.has_value()) { + continue; + } + ze_kernel_timestamp_result_t timestamp = opt_timestamp.value(); + + add_or_merge(all_global_timestamps, timestamp.global); + add_or_merge(all_context_timestamps, timestamp.context); + } + + auto submit_time = get_submission_time(all_global_timestamps, all_context_timestamps); + auto exec_time = get_total_exec_time(all_context_timestamps); + + auto period_exec = std::make_shared(exec_time); + auto 
period_submit = std::make_shared(submit_time); + + info.push_back({ instrumentation::profiling_stage::executing, period_exec }); + info.push_back({ instrumentation::profiling_stage::submission, period_submit }); + + return true; +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp new file mode 100644 index 00000000000000..b38a38303e8b59 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_events.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_base_event.hpp" +#include "ze_engine.hpp" + +#include + +namespace cldnn { +namespace ze { + +struct ze_events : public ze_base_event { +public: + ze_events(std::vector const& ev, const ze_engine &engine) + : ze_base_event(0) + , _engine(engine) { + process_events(ev); + } + + void reset() override { + event::reset(); + _events.clear(); + } + + std::optional query_timestamp() override { return std::nullopt; } + ze_event_handle_t get_handle() const override { return _last_ze_event; } + bool get_profiling_info_impl(std::list& info) override; + +protected: + void wait_impl() override; + void set_impl() override; + bool is_set_impl() override; + + void process_events(const std::vector& ev) { + for (size_t i = 0; i < ev.size(); i++) { + auto multiple_events = dynamic_cast(ev[i].get()); + if (multiple_events) { + for (size_t j = 0; j < multiple_events->_events.size(); j++) { + if (auto base_ev = dynamic_cast(multiple_events->_events[j].get())) { + auto current_ev_queue_stamp = base_ev->get_queue_stamp(); + if ((m_queue_stamp == 0) || (current_ev_queue_stamp > m_queue_stamp)) { + m_queue_stamp = current_ev_queue_stamp; + _last_ze_event = base_ev->get_handle(); + } + } + _events.push_back(multiple_events->_events[j]); + } + } else { + if (auto base_ev = dynamic_cast(ev[i].get())) { + auto current_ev_queue_stamp = base_ev->get_queue_stamp(); + if 
((m_queue_stamp == 0) || (current_ev_queue_stamp > m_queue_stamp)) { + m_queue_stamp = current_ev_queue_stamp; + _last_ze_event = base_ev->get_handle(); + } + } + _events.push_back(ev[i]); + } + } + } + + ze_event_handle_t _last_ze_event = nullptr; + std::vector _events; + const ze_engine &_engine; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp new file mode 100644 index 00000000000000..9d57a571907a89 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel.hpp @@ -0,0 +1,105 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/kernel.hpp" +#include "openvino/core/except.hpp" +#include "ze_common.hpp" +#include "ze_kernel_holder.hpp" + +#include + +namespace cldnn { +namespace ze { + +class ze_kernel : public kernel { +public: + static void create_kernels_from_module(std::shared_ptr module, std::vector &out) { + ze_module_handle_t module_handle = module->get_module_handle(); + uint32_t kernel_count = 0; + OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, nullptr)); + std::vector kernel_names(kernel_count); + // Specification does not mention who is responsible for the returned pointers + // Assume Level Zero owns the pointers and they will remain valid as long as the module resource + OV_ZE_EXPECT(zeModuleGetKernelNames(module_handle, &kernel_count, kernel_names.data())); + + ze_kernel_flags_t flags = 0; + ze_kernel_desc_t kernel_desc = { + ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, flags, nullptr}; + for (auto name_cstr : kernel_names) { + auto name = std::string(name_cstr); + // L0 returns Intel_Symbol_Table_Void_Program that does not correspond to actual kernel + if (name == "Intel_Symbol_Table_Void_Program") { + continue; + } + kernel_desc.pKernelName = name_cstr; + ze_kernel_handle_t kernel_handle; + OV_ZE_EXPECT(zeKernelCreate(module_handle, 
&kernel_desc, &kernel_handle)); + auto kernel_holder = std::make_shared(kernel_handle, module); + out.push_back(std::make_shared(kernel_holder, name)); + } + } + + ze_kernel(std::shared_ptr kernel, const std::string& kernel_id) + : m_kernel(kernel) + , m_kernel_id(kernel_id) { } + + ze_kernel_handle_t get_kernel_handle() const { return m_kernel->get_kernel_handle(); } + ze_module_handle_t get_module_handle() const { return m_kernel->get_module()->get_module_handle(); } + std::string get_id() const override { return m_kernel_id; } + + std::shared_ptr clone(bool reuse_kernel_handle = false) const override { + if (reuse_kernel_handle) { + return std::make_shared(m_kernel, m_kernel_id); + } else { + ze_kernel_handle_t cloned_handle; + ze_module_handle_t module_handle = get_module_handle(); + ze_kernel_desc_t descriptor; + descriptor.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC; + descriptor.pNext = nullptr; + descriptor.flags = 0; + descriptor.pKernelName = m_kernel_id.c_str(); + OV_ZE_EXPECT(zeKernelCreate(module_handle, &descriptor, &cloned_handle)); + auto kernel_holder = std::make_shared(cloned_handle, m_kernel->get_module()); + return std::make_shared(kernel_holder, m_kernel_id); + } + } + + virtual bool is_same(const kernel &other) const override { + auto other_ptr = dynamic_cast(&other); + if (other_ptr == nullptr) { + return false; + } + return get_kernel_handle() == other_ptr->get_kernel_handle(); + } + + std::vector get_binary() const override { + size_t binary_size = 0; + ze_module_handle_t module_handle = get_module_handle(); + OV_ZE_EXPECT(zeModuleGetNativeBinary(module_handle, &binary_size, nullptr)); + + std::vector binary(binary_size); + OV_ZE_EXPECT(zeModuleGetNativeBinary(module_handle, &binary_size, binary.data())); + + return binary; + } + + std::string get_build_log() const override { + ze_module_build_log_handle_t build_log_handle = m_kernel->get_module()->get_build_log_handle(); + size_t log_size = 0; + 
OV_ZE_EXPECT(zeModuleBuildLogGetString(build_log_handle, &log_size, nullptr)); + + std::string log(log_size, ' '); + OV_ZE_EXPECT(zeModuleBuildLogGetString(build_log_handle, &log_size, log.data())); + return log; + } + +private: + std::shared_ptr m_kernel; + std::string m_kernel_id; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp new file mode 100644 index 00000000000000..7aae55d6a7f990 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_builder.hpp @@ -0,0 +1,67 @@ +// Copyright (C) 2016-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/kernel_builder.hpp" +#include "intel_gpu/runtime/device.hpp" + +#include "ze_device.hpp" +#include "ze_kernel.hpp" +#include "ze_common.hpp" + +namespace cldnn { +namespace ze { + +class ze_kernel_builder : public kernel_builder{ + public: + ze_kernel_builder(const ze_device &device) : m_device(device) {} + + void build_kernels(const void *src, size_t src_bytes, KernelFormat src_format, const std::string &options, std::vector &out) const override { + ze_module_desc_t module_desc = { + ZE_STRUCTURE_TYPE_MODULE_DESC, + nullptr, + ZE_MODULE_FORMAT_NATIVE, + src_bytes, + reinterpret_cast(src), + options.c_str(), + nullptr // specialization constants + }; + switch (src_format) { + case KernelFormat::SOURCE: { + module_desc.format = ze_module_format_oclc; + break; + } + case KernelFormat::NATIVE_BIN: { + module_desc.format = ZE_MODULE_FORMAT_NATIVE; + break; + } + default: + OPENVINO_THROW("[GPU] Trying to build kernel from unexpected format"); + break; + } + ze_module_handle_t module_handle; + ze_module_build_log_handle_t log_handle; + ze_result_t build_result = zeModuleCreate(m_device.get_context(), m_device.get_device(), &module_desc, &module_handle, &log_handle); + if (build_result != ZE_RESULT_SUCCESS) { + size_t log_size = 
0; + OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, nullptr)); + std::string log(log_size, ' '); + OV_ZE_EXPECT(zeModuleBuildLogGetString(log_handle, &log_size, log.data())); + OV_ZE_EXPECT(zeModuleBuildLogDestroy(log_handle)); + GPU_DEBUG_INFO << "-------- Kernel build error" << std::endl; + GPU_DEBUG_INFO << log << std::endl; + GPU_DEBUG_INFO << "-------- End of Kernel build error" << std::endl; + OPENVINO_THROW("[GPU] Failed to build module"); + } + auto module_holder = std::make_shared(module_handle, log_handle); + ze_kernel::create_kernels_from_module(module_holder, out); + } + + private: + const ze_device &m_device; +}; +} // namespace ze +} // namespace cldnn + diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp new file mode 100644 index 00000000000000..5d7ab17b6b8367 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_kernel_holder.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_common.hpp" +#include "ze_module_holder.hpp" + +#include + +namespace cldnn { +namespace ze { + +// RAII wrapper for Level Zero kernel +class ze_kernel_holder { +public: + // Take ownership of existing kernel handle + explicit ze_kernel_holder(ze_kernel_handle_t kernel, std::shared_ptr module) : m_kernel(kernel), m_module(module) {} + ze_kernel_holder(const ze_kernel_holder& other) = delete; + ze_kernel_holder& operator=(const ze_kernel_holder& other) = delete; + ~ze_kernel_holder() { + OV_ZE_WARN(zeKernelDestroy(m_kernel)); + } + ze_kernel_handle_t get_kernel_handle() { return m_kernel; } + std::shared_ptr get_module() { return m_module; } +private: + ze_kernel_handle_t m_kernel; + std::shared_ptr m_module; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp new file mode 100644 index 
00000000000000..e3d0e924fa2589 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.cpp @@ -0,0 +1,267 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/runtime/utils.hpp" +#include "ze_memory.hpp" +#include "ze/ze_common.hpp" +#include "ze_engine.hpp" +#include "ze_stream.hpp" +#include "ze_event.hpp" +#include +#include + +#ifdef ENABLE_ONEDNN_FOR_GPU +#include +#endif + +namespace cldnn { +namespace ze { +namespace { +static inline cldnn::event::ptr create_event(stream& stream, size_t bytes_count) { + if (bytes_count == 0) { + GPU_DEBUG_TRACE_DETAIL << "Skip memory operation for 0 size tensor" << std::endl; + return stream.create_user_event(true); + } + + return stream.create_base_event(); +} + +std::vector get_ze_events(const std::vector& events) { + std::vector ze_events; + ze_events.reserve(events.size()); + for (const auto& ev : events) { + auto ze_event = downcast(ev.get())->get_handle(); + if (ze_event != nullptr) { + ze_events.push_back(ze_event); + } + } + return ze_events; +} + +} // namespace + +allocation_type gpu_usm::detect_allocation_type(const ze_engine* engine, const void* mem_ptr) { + ze_memory_allocation_properties_t props{ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES}; + ze_device_handle_t device = nullptr; + OV_ZE_EXPECT(zeMemGetAllocProperties(engine->get_context(), mem_ptr, &props, &device)); + + switch (props.type) { + case ZE_MEMORY_TYPE_DEVICE: return allocation_type::usm_device; + case ZE_MEMORY_TYPE_HOST: return allocation_type::usm_host; + case ZE_MEMORY_TYPE_SHARED: return allocation_type::usm_shared; + default: return allocation_type::unknown; + } + + return allocation_type::unknown; +} + +allocation_type gpu_usm::detect_allocation_type(const ze_engine* engine, const ze::UsmMemory& buffer) { + auto alloc_type = detect_allocation_type(engine, buffer.get()); + OPENVINO_ASSERT(alloc_type == allocation_type::usm_device || + alloc_type == 
allocation_type::usm_host || + alloc_type == allocation_type::usm_shared, "[GPU] Unsupported USM alloc type: " + to_string(alloc_type)); + return alloc_type; +} + +gpu_usm::gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& buffer, allocation_type type, std::shared_ptr mem_tracker) + : lockable_gpu_mem() + , memory(engine, new_layout, type, mem_tracker) + , _buffer(buffer) + , _host_buffer(engine->get_context(), engine->get_device()) { +} + +gpu_usm::gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& buffer, std::shared_ptr mem_tracker) + : lockable_gpu_mem() + , memory(engine, new_layout, detect_allocation_type(engine, buffer), mem_tracker) + , _buffer(buffer) + , _host_buffer(engine->get_context(), engine->get_device()) { +} + +gpu_usm::gpu_usm(ze_engine* engine, const layout& layout, allocation_type type) + : lockable_gpu_mem() + , memory(engine, layout, type, nullptr) + , _buffer(engine->get_context(), engine->get_device()) + , _host_buffer(engine->get_context(), engine->get_device()) { + auto mem_ordinal = engine->get_device_info().device_memory_ordinal; + switch (get_allocation_type()) { + case allocation_type::usm_host: + _buffer.allocateHost(_bytes_count); + break; + case allocation_type::usm_shared: + _buffer.allocateShared(_bytes_count, mem_ordinal); + break; + case allocation_type::usm_device: + _buffer.allocateDevice(_bytes_count, mem_ordinal); + break; + default: + OPENVINO_THROW("[GPU] Unknown unified shared memory type!"); + } + + m_mem_tracker = std::make_shared(engine, _buffer.get(), layout.bytes_count(), type); +} + +void* gpu_usm::lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) { + std::lock_guard locker(_mutex); + if (0 == _lock_count) { + auto& _ze_stream = downcast(stream); + if (get_allocation_type() == allocation_type::usm_device) { + if (type != mem_lock_type::read) { + throw std::runtime_error("Unable to lock allocation_type::usm_device with write lock_type."); + } + 
GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl; + _host_buffer.allocateHost(_bytes_count); + OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream.get_queue(), + _host_buffer.get(), + _buffer.get(), + _bytes_count, + nullptr, + 0, + nullptr)); + OV_ZE_EXPECT(zeCommandListHostSynchronize(_ze_stream.get_queue(), endless_wait)); + _mapped_ptr = _host_buffer.get(); + } else { + _mapped_ptr = _buffer.get(); + } + } + _lock_count++; + return _mapped_ptr; +} + +void gpu_usm::unlock(const stream& /* stream */) { + std::lock_guard locker(_mutex); + _lock_count--; + if (0 == _lock_count) { + if (get_allocation_type() == allocation_type::usm_device) { + _host_buffer.freeMem(); + } + _mapped_ptr = nullptr; + } +} + +event::ptr gpu_usm::fill(stream& stream, unsigned char pattern, const std::vector& dep_events, bool blocking) { + auto& _ze_stream = downcast(stream); + auto ev = _ze_stream.create_base_event(); + auto ev_ze = downcast(ev.get())->get_handle(); + auto ze_dep_events = get_ze_events(dep_events); + OV_ZE_EXPECT(zeCommandListAppendMemoryFill(_ze_stream.get_queue(), + _buffer.get(), + &pattern, + sizeof(unsigned char), + _bytes_count, + ev_ze, + ze_dep_events.size(), + ze_dep_events.data())); + if (blocking) { + ev->wait(); + } + return ev; +} + +event::ptr gpu_usm::fill(stream& stream, const std::vector& dep_events, bool blocking) { + return fill(stream, 0, dep_events, blocking); +} + +event::ptr gpu_usm::copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) { + auto result_event = create_event(stream, size); + if (size == 0) + return result_event; + + auto _ze_stream = downcast(&stream); + auto _ze_event = downcast(result_event.get())->get_handle(); + auto src_ptr = reinterpret_cast(data_ptr) + src_offset; + auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; + + OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + dst_ptr, + src_ptr, + size, + 
_ze_event, + 0, + nullptr)); + + if (blocking) { + result_event->wait(); + } + + return result_event; +} + +event::ptr gpu_usm::copy_from(stream& stream, const memory& src_mem, size_t src_offset, size_t dst_offset, size_t size, bool blocking) { + auto result_event = create_event(stream, size); + if (size == 0) + return result_event; + + auto _ze_stream = downcast(&stream); + auto _ze_event = downcast(result_event.get())->get_handle(); + OPENVINO_ASSERT(memory_capabilities::is_usm_type(src_mem.get_allocation_type())); + + auto usm_mem = downcast(&src_mem); + auto src_ptr = reinterpret_cast(usm_mem->buffer_ptr()) + src_offset; + auto dst_ptr = reinterpret_cast(buffer_ptr()) + dst_offset; + + OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + dst_ptr, + src_ptr, + size, + _ze_event, + 0, + nullptr)); + if (blocking) { + result_event->wait(); + } + + return result_event; +} + +event::ptr gpu_usm::copy_to(stream& stream, void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) const { + auto result_event = create_event(stream, size); + if (size == 0) + return result_event; + + auto _ze_stream = downcast(&stream); + auto _ze_event = downcast(result_event.get())->get_handle(); + auto src_ptr = reinterpret_cast(buffer_ptr()) + src_offset; + auto dst_ptr = reinterpret_cast(data_ptr) + dst_offset; + + OV_ZE_EXPECT(zeCommandListAppendMemoryCopy(_ze_stream->get_queue(), + dst_ptr, + src_ptr, + size, + _ze_event, + 0, + nullptr)); + if (blocking) { + result_event->wait(); + } + + return result_event; +} + +#ifdef ENABLE_ONEDNN_FOR_GPU +dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const { + auto onednn_engine = _engine->get_onednn_engine(); + dnnl::memory dnnl_mem = dnnl::l0_interop::make_memory(desc, onednn_engine, + reinterpret_cast(_buffer.get()) + offset); + return dnnl_mem; +} +#endif + +shared_mem_params gpu_usm::get_internal_params() const { + auto casted = downcast(_engine); + 
return { + shared_mem_type::shared_mem_usm, // shared_mem_type + static_cast(casted->get_context()), // context handle + static_cast(casted->get_device()), // user_device handle + static_cast(_buffer.get()), // mem handle +#ifdef _WIN32 + nullptr, // surface handle +#else + 0, // surface handle +#endif + 0 // plane + }; +} + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp new file mode 100644 index 00000000000000..c76d6c46cc31fe --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_memory.hpp @@ -0,0 +1,151 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_common.hpp" +#include "ze_engine.hpp" +#include "intel_gpu/runtime/memory.hpp" + +#include +#include +#include + +namespace cldnn { +namespace ze { +struct lockable_gpu_mem { + lockable_gpu_mem() : + _lock_count(0), + _mapped_ptr(nullptr) {} + + std::mutex _mutex; + unsigned _lock_count; + void* _mapped_ptr; +}; + +class UsmHolder { +public: + UsmHolder(ze_context_handle_t context, void* ptr, bool shared_memory = false) : _context(context), _ptr(ptr), _shared_memory(shared_memory) { + if (ptr == nullptr) + OPENVINO_THROW("[GPU] Can not create UsmHolder with nullptr"); + } + UsmHolder(const UsmHolder&) = delete; + UsmHolder& operator=(const UsmHolder&) = delete; + + void* ptr() { return _ptr; } + void memFree() { + if (!_shared_memory && _ptr != nullptr) { + OV_ZE_WARN(zeMemFree(_context, _ptr)); + _ptr = nullptr; + } + } + + ~UsmHolder() { + memFree(); + } +private: + ze_context_handle_t _context; + void* _ptr; + bool _shared_memory = false; +}; + +class UsmMemory { +public: + explicit UsmMemory(ze_context_handle_t context, ze_device_handle_t device) + : _context(context) + , _device(device) {} + + UsmMemory(ze_context_handle_t context, ze_device_handle_t device, void* usm_ptr, size_t offset = 0) + : _context(context) + , 
_device(device) + , _usm_pointer(std::make_shared(_context, reinterpret_cast(usm_ptr) + offset, true)) {} + + void* get() const { return _usm_pointer->ptr(); } + + void allocateHost(size_t size) { + ze_host_mem_alloc_desc_t host_desc = {}; + host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; + host_desc.flags = 0; + host_desc.pNext = nullptr; + + void* memory = nullptr; + OV_ZE_EXPECT(zeMemAllocHost(_context, &host_desc, size, 1, &memory)); + _usm_pointer = std::make_shared(_context, memory); + } + + void allocateShared(size_t size, uint32_t ordinal) { + ze_device_mem_alloc_desc_t device_desc = {}; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + device_desc.flags = 0; + device_desc.ordinal = ordinal; + device_desc.pNext = nullptr; + + ze_host_mem_alloc_desc_t host_desc = {}; + host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; + host_desc.flags = 0; + host_desc.pNext = nullptr; + + void* memory = nullptr; + OV_ZE_EXPECT(zeMemAllocShared(_context, &device_desc, &host_desc, size, 1, _device, &memory)); + _usm_pointer = std::make_shared(_context, memory); + } + + void allocateDevice(size_t size, uint32_t ordinal) { + ze_device_mem_alloc_desc_t device_desc = {}; + device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + device_desc.flags = 0; + device_desc.ordinal = ordinal; + device_desc.pNext = nullptr; + + void* memory = nullptr; + OV_ZE_EXPECT(zeMemAllocDevice(_context, &device_desc, size, 4096, _device, &memory)); + _usm_pointer = std::make_shared(_context, memory); + } + + void freeMem() { + if (!_usm_pointer) + OPENVINO_THROW("[GPU] Can not free memory of empty UsmHolder"); + _usm_pointer->memFree(); + } + + virtual ~UsmMemory() = default; + +protected: + ze_context_handle_t _context; + ze_device_handle_t _device; + std::shared_ptr _usm_pointer = nullptr; +}; + +struct gpu_usm : public lockable_gpu_mem, public memory { + gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& usm_buffer, allocation_type type, 
std::shared_ptr mem_tracker); + gpu_usm(ze_engine* engine, const layout& new_layout, const ze::UsmMemory& usm_buffer, std::shared_ptr mem_tracker); + gpu_usm(ze_engine* engine, const layout& layout, allocation_type type); + + void* lock(const stream& stream, mem_lock_type type) override; + void unlock(const stream& stream) override; + const ze::UsmMemory& get_buffer() const { return _buffer; } + ze::UsmMemory& get_buffer() { return _buffer; } + + event::ptr fill(stream& stream, unsigned char pattern, const std::vector& dep_events = {}, bool blocking = true) override; + event::ptr fill(stream& stream, const std::vector& dep_events = {}, bool blocking = true) override; + shared_mem_params get_internal_params() const override; + void* buffer_ptr() const override { return _buffer.get(); } + + event::ptr copy_from(stream& stream, const void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override; + event::ptr copy_from(stream& stream, const memory& src_mem, size_t src_offset, size_t dst_offset, size_t size, bool blocking) override; + event::ptr copy_to(stream& stream, void* data_ptr, size_t src_offset, size_t dst_offset, size_t size, bool blocking) const override; +#ifdef ENABLE_ONEDNN_FOR_GPU + dnnl::memory get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const override; +#endif + + static allocation_type detect_allocation_type(const ze_engine* engine, const void* mem_ptr); + static allocation_type detect_allocation_type(const ze_engine* engine, const ze::UsmMemory& buffer); + +protected: + ze::UsmMemory _buffer; + ze::UsmMemory _host_buffer; +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp new file mode 100644 index 00000000000000..fcd5c5840b935f --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_module_holder.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2024 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ze_common.hpp" +#include +#include + +namespace cldnn { +namespace ze { + +// RAII wrapper for Level Zero module +class ze_module_holder { +public: + // Take ownership of existing module and build log handles + explicit ze_module_holder(ze_module_handle_t module, ze_module_build_log_handle_t build_log) : m_module(module), m_build_log(build_log) {} + + ze_module_holder(const ze_module_holder& other) = delete; + ze_module_holder& operator=(const ze_module_holder& other) = delete; + ~ze_module_holder() { + OV_ZE_WARN(zeModuleBuildLogDestroy(m_build_log)); + OV_ZE_WARN(zeModuleDestroy(m_module)); + } + ze_module_handle_t get_module_handle() const { return m_module; } + ze_module_build_log_handle_t get_build_log_handle() const { return m_build_log; } + +private: + ze_module_handle_t m_module; + ze_module_build_log_handle_t m_build_log; +}; +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp new file mode 100644 index 00000000000000..1050c9b753fa76 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.cpp @@ -0,0 +1,412 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ze_stream.hpp" +#include "intel_gpu/runtime/memory_caps.hpp" +#include "intel_gpu/runtime/utils.hpp" +#include "openvino/core/except.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/runtime/properties.hpp" + +#include "ze_counter_based_event_factory.hpp" +#include "ze_event_factory.hpp" +#include "ze_events.hpp" +#include "ze_empty_event.hpp" + +#include "ze_event.hpp" +#include "ze_kernel.hpp" +#include "ze_memory.hpp" +#include "ze_common.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +#ifdef ENABLE_ONEDNN_FOR_GPU +#include +#endif + +namespace cldnn { +namespace ze { + +namespace { +inline ze_group_count_t 
to_group_count(const std::vector& v) { + switch (v.size()) { + case 1: + return {uint32_t(v[0]), uint32_t(1), uint32_t(1)}; + case 2: + return {uint32_t(v[0]), uint32_t(v[1]), uint32_t(1)}; + case 3: + return {uint32_t(v[0]), uint32_t(v[1]), uint32_t(v[2])}; + default: + return {uint32_t(1), uint32_t(1), uint32_t(1)}; + } +} + +template +ze_result_t set_kernel_arg_scalar(ze_kernel_handle_t& kernel, uint32_t idx, const T& val) { + GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel << " set scalar " << idx << " (" << ov::element::from().get_type_name() << ")" << val << "\n"; + return zeKernelSetArgumentValue(kernel, idx, sizeof(T), &val); +} + +ze_result_t set_kernel_arg_local_memory(ze_kernel_handle_t& kernel, uint32_t idx, size_t size) { + if (size == 0) + return ZE_RESULT_ERROR_INVALID_ARGUMENT; + + GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel << " set arg " << idx << " local memory size: " << size << std::endl; + return zeKernelSetArgumentValue(kernel, idx, size, NULL); +} + +ze_result_t set_kernel_arg(ze_kernel_handle_t& kernel, uint32_t idx, cldnn::memory::cptr mem) { + if (!mem) + return ZE_RESULT_ERROR_INVALID_ARGUMENT; + + OPENVINO_ASSERT(memory_capabilities::is_usm_type(mem->get_allocation_type()), "Unsupported alloc type"); + const auto& buf = std::dynamic_pointer_cast(mem)->get_buffer(); + auto mem_type = std::dynamic_pointer_cast(mem)->get_allocation_type(); + GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel << " set arg (" << mem_type << ") " << idx + << " mem: " << buf.get() << " size: " << mem->size() << std::endl; + + auto ptr = buf.get(); + return zeKernelSetArgumentValue(kernel, idx, sizeof(ptr), &ptr); +} + +void set_arguments_impl(ze_kernel_handle_t kernel, + const arguments_desc& args, + const kernel_arguments_data& data) { + using args_t = argument_desc::Types; + using scalar_t = scalar_desc::Types; + + for (uint32_t i = 0; i < static_cast(args.size()); i++) { + ze_result_t status = ZE_RESULT_NOT_READY; + switch (args[i].t) { + case args_t::INPUT: 
+ if (args[i].index < data.inputs.size() && data.inputs[args[i].index]) { + status = set_kernel_arg(kernel, i, data.inputs[args[i].index]); + } + break; + case args_t::INPUT_OF_FUSED_PRIMITIVE: + if (args[i].index < data.fused_op_inputs.size() && data.fused_op_inputs[args[i].index]) { + status = set_kernel_arg(kernel, i, data.fused_op_inputs[args[i].index]); + } + break; + case args_t::INTERNAL_BUFFER: + if (args[i].index < data.intermediates.size() && data.intermediates[args[i].index]) { + status = set_kernel_arg(kernel, i, data.intermediates[args[i].index]); + } + break; + case args_t::OUTPUT: + if (args[i].index < data.outputs.size() && data.outputs[args[i].index]) { + status = set_kernel_arg(kernel, i, data.outputs[args[i].index]); + } + break; + case args_t::WEIGHTS: + status = set_kernel_arg(kernel, i, data.weights); + break; + case args_t::BIAS: + status = set_kernel_arg(kernel, i, data.bias); + break; + case args_t::WEIGHTS_ZERO_POINTS: + status = set_kernel_arg(kernel, i, data.weights_zero_points); + break; + case args_t::ACTIVATIONS_ZERO_POINTS: + status = set_kernel_arg(kernel, i, data.activations_zero_points); + break; + case args_t::COMPENSATION: + status = set_kernel_arg(kernel, i, data.compensation); + break; + case args_t::SCALE_TABLE: + status = set_kernel_arg(kernel, i, data.scale_table); + break; + case args_t::SLOPE: + status = set_kernel_arg(kernel, i, data.slope); + break; + case args_t::SCALAR: + if (data.scalars && args[i].index < data.scalars->size()) { + const auto& scalar = (*data.scalars)[args[i].index]; + switch (scalar.t) { + case scalar_t::UINT8: + status = set_kernel_arg_scalar(kernel, i, scalar.v.u8); + break; + case scalar_t::UINT16: + status = set_kernel_arg_scalar(kernel, i, scalar.v.u16); + break; + case scalar_t::UINT32: + status = set_kernel_arg_scalar(kernel, i, scalar.v.u32); + break; + case scalar_t::UINT64: + status = set_kernel_arg_scalar(kernel, i, scalar.v.u64); + break; + case scalar_t::INT8: + status = 
set_kernel_arg_scalar(kernel, i, scalar.v.s8); + break; + case scalar_t::INT16: + status = set_kernel_arg_scalar(kernel, i, scalar.v.s16); + break; + case scalar_t::INT32: + status = set_kernel_arg_scalar(kernel, i, scalar.v.s32); + break; + case scalar_t::INT64: + status = set_kernel_arg_scalar(kernel, i, scalar.v.s64); + break; + case scalar_t::FLOAT32: + status = set_kernel_arg_scalar(kernel, i, scalar.v.f32); + break; + case scalar_t::FLOAT64: + status = set_kernel_arg_scalar(kernel, i, scalar.v.f64); + break; + default: + break; + } + } + break; + case args_t::CELL: + status = set_kernel_arg(kernel, i, data.cell); + break; + case args_t::SHAPE_INFO: + status = set_kernel_arg(kernel, i, data.shape_info); + break; + case args_t::LOCAL_MEMORY_SIZE: + OPENVINO_ASSERT(args[i].index < data.local_memory_args->size() && data.local_memory_args->at(args[i].index), + "The allocated local memory is necessary to set kernel arguments."); + status = set_kernel_arg_local_memory(kernel, i, data.local_memory_args->at(args[i].index)); + break; + default: + break; + } + if (status != ZE_RESULT_SUCCESS) { + throw std::runtime_error("Error set arg " + std::to_string(i) + ", error code: " + std::to_string(status) + "\n"); + } + } +} + +} // namespace + +ze_stream::ze_stream(const ze_engine &engine, const ExecutionConfig& config) + : stream(config.get_queue_type(), stream::get_expected_sync_method(config)) + , _engine(engine) { + const auto &info = engine.get_device_info(); + + ze_command_queue_desc_t command_queue_desc = {}; + command_queue_desc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC; + command_queue_desc.pNext = nullptr; + command_queue_desc.index = 0; + command_queue_desc.ordinal = info.compute_queue_group_ordinal; + command_queue_desc.flags = m_queue_type == QueueTypes::out_of_order ? 
0 : ZE_COMMAND_QUEUE_FLAG_IN_ORDER; + command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; + command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL; + + zex_intel_queue_copy_operations_offload_hint_exp_desc_t cp_offload_desc = {}; + cp_offload_desc.stype = ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES; + cp_offload_desc.copyOffloadEnabled = true; + cp_offload_desc.pNext = nullptr; + bool use_cp_offload = info.supports_cp_offload; + if (use_cp_offload) { + command_queue_desc.pNext = &cp_offload_desc; + } + + OV_ZE_EXPECT(zeCommandListCreateImmediate(_engine.get_context(), _engine.get_device(), &command_queue_desc, &m_command_list)); + bool use_counter_based_events = m_queue_type == QueueTypes::in_order && info.supports_counter_based_events; + if (use_counter_based_events) { + m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); + } else { + m_ev_factory = std::make_unique(engine, config.get_enable_profiling()); + } + GPU_DEBUG_INFO << "[GPU] Created L0 stream (" + << "use_cp_offload=" << use_cp_offload + << ", use_counter_based_events=" << use_counter_based_events + << ")" << std::endl; +} + +ze_stream::~ze_stream() { +#ifdef ENABLE_ONEDNN_FOR_GPU + // Destroy OneDNN stream before destroying command list + _onednn_stream.reset(); +#endif + if (m_command_list != nullptr) + zeCommandListDestroy(m_command_list); +} + +void ze_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) { + static std::mutex m; + std::lock_guard guard(m); + + auto& ze_kernel = downcast(kernel); + auto kern = ze_kernel.get_kernel_handle(); + set_arguments_impl(kern, args_desc.arguments, args); +} + +event::ptr ze_stream::enqueue_kernel(kernel& kernel, + const kernel_arguments_desc& args_desc, + const kernel_arguments_data& /* args */, + std::vector const& deps, + bool is_output) { + auto& ze_kernel = downcast(kernel); + + auto kern = ze_kernel.get_kernel_handle(); + + 
std::vector dep_events; + std::vector* dep_events_ptr = nullptr; + if (m_sync_method == SyncMethods::events) { + for (auto& dep : deps) { + if (auto ze_base_ev = std::dynamic_pointer_cast(dep)) { + if (ze_base_ev->get_handle() != nullptr) + dep_events.push_back(ze_base_ev->get_handle()); + } + } + dep_events_ptr = &dep_events; + } else if (m_sync_method == SyncMethods::barriers) { + sync_events(deps, is_output); + } + bool set_output_event = m_sync_method == SyncMethods::events || is_output; + + auto ev = set_output_event ? create_base_event() : std::make_shared(++m_queue_counter); + auto global = to_group_count(args_desc.workGroups.global); + auto local = to_group_count(args_desc.workGroups.local); + ze_group_count_t args = { global.groupCountX / local.groupCountX, global.groupCountY / local.groupCountY, global.groupCountZ / local.groupCountZ }; + OV_ZE_EXPECT(zeKernelSetGroupSize(kern, local.groupCountX, local.groupCountY, local.groupCountZ)); + OV_ZE_EXPECT(zeCommandListAppendLaunchKernel(m_command_list, + kern, + &args, + set_output_event ? std::dynamic_pointer_cast(ev)->get_handle() : nullptr, + dep_events_ptr == nullptr ? 0 : static_cast(dep_events_ptr->size()), + dep_events_ptr == nullptr ? 
0 : &dep_events_ptr->front())); + + return ev; +} + +void ze_stream::enqueue_barrier() { + OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr)); +} + +event::ptr ze_stream::enqueue_marker(std::vector const& deps, bool is_output) { + if (deps.empty()) { + auto ev = create_base_event(); + OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, std::dynamic_pointer_cast(ev)->get_handle(), 0, nullptr)); + return ev; + } + + if (m_sync_method == SyncMethods::events) { + std::vector dep_events; + for (auto& dep : deps) { + if (auto ze_base_ev = std::dynamic_pointer_cast(dep)) { + if (ze_base_ev->get_handle() != nullptr) + dep_events.push_back(ze_base_ev->get_handle()); + } + } + if (dep_events.empty()) + return create_user_event(true); + + auto ev = create_base_event(); + OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, + std::dynamic_pointer_cast(ev)->get_handle(), + static_cast(dep_events.size()), + &dep_events.front())); + return ev; + } else if (m_sync_method == SyncMethods::barriers) { + sync_events(deps, is_output); + assert(m_last_barrier_ev != nullptr); + return m_last_barrier_ev; + } else { + return create_user_event(true); + } +} + +ze_event::ptr ze_stream::group_events(std::vector const& deps) { + return std::make_shared(deps, _engine); +} + +void ze_stream::wait() { + finish(); +} + +event::ptr ze_stream::create_user_event(bool set) { + auto ev = m_ev_factory->create_event(++m_queue_counter); + if (set) + ev->set(); + + return ev; +} + +event::ptr ze_stream::create_base_event() { + return m_ev_factory->create_event(++m_queue_counter); +} + +std::unique_ptr ze_stream::create_surfaces_lock(const std::vector &mem) const { + // Level Zero egnine currently does not support surfaces lock + return nullptr; +} + +void ze_stream::flush() const { + // Immediate Command List submits commands immediately - no flush impl +} + +void ze_stream::finish() const { + OV_ZE_EXPECT(zeCommandListHostSynchronize(m_command_list, endless_wait)); +} + 
+void ze_stream::wait_for_events(const std::vector& events) { + bool needs_sync = false; + for (auto& ev : events) { + auto* ze_base_ev = dynamic_cast(ev.get()); + if (ze_base_ev->get_handle() != nullptr) { + ze_base_ev->wait(); + } else { + needs_sync = true; + } + // Block thread and wait for event signal + ev->wait(); + } + + if (needs_sync) { + finish(); + } +} + +void ze_stream::sync_events(std::vector const& deps, bool is_output) { + bool needs_barrier = false; + for (auto& dep : deps) { + auto* ze_base_ev = dynamic_cast(dep.get()); + assert(ze_base_ev != nullptr); + if (ze_base_ev->get_queue_stamp() > m_last_barrier) { + needs_barrier = true; + } + } + + if (needs_barrier) { + if (is_output) { + m_last_barrier_ev = std::dynamic_pointer_cast(create_base_event()); + m_last_barrier_ev->set_queue_stamp(m_queue_counter.load()); + OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, m_last_barrier_ev->get_handle(), 0, nullptr)); + } else { + OV_ZE_EXPECT(zeCommandListAppendBarrier(m_command_list, nullptr, 0, nullptr)); + } + m_last_barrier = ++m_queue_counter; + } + + if (!m_last_barrier_ev) { + m_last_barrier_ev = std::dynamic_pointer_cast(create_user_event(true)); + m_last_barrier_ev->set_queue_stamp(m_queue_counter.load()); + } +} + +#ifdef ENABLE_ONEDNN_FOR_GPU +dnnl::stream& ze_stream::get_onednn_stream() { + OPENVINO_ASSERT(m_queue_type == QueueTypes::in_order, "[GPU] Can't create onednn stream handle as onednn doesn't support out-of-order queue"); + OPENVINO_ASSERT(_engine.get_device_info().vendor_id == INTEL_VENDOR_ID, "[GPU] Can't create onednn stream handle as for non-Intel devices"); + if (!_onednn_stream) { + _onednn_stream = std::make_shared(dnnl::l0_interop::make_stream(_engine.get_onednn_engine(), m_command_list, m_ev_factory->is_profiling_enabled())); + } + + return *_onednn_stream; +} +#endif + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp 
b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp new file mode 100644 index 00000000000000..06647fda325304 --- /dev/null +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_stream.hpp @@ -0,0 +1,73 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/runtime/event.hpp" +#include "intel_gpu/runtime/stream.hpp" +#include "ze_common.hpp" +#include "ze_engine.hpp" +#include "ze_event.hpp" +#include "ze_base_event_factory.hpp" + +namespace cldnn { +namespace ze { + +class ze_stream : public stream { +public: + ze_command_list_handle_t get_queue() const { return m_command_list; } + + ze_stream(const ze_engine& engine, const ExecutionConfig& config); + ze_stream(ze_stream&& other) + : stream(other.m_queue_type, other.m_sync_method) + , _engine(other._engine) + , m_command_list(other.m_command_list) + , m_queue_counter(other.m_queue_counter.load()) + , m_last_barrier(other.m_last_barrier.load()) + , m_last_barrier_ev(other.m_last_barrier_ev) + , m_ev_factory(other.m_ev_factory.release()) { + other.m_command_list = nullptr; + } + + ~ze_stream(); + + void flush() const override; + void finish() const override; + void wait() override; + + void set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) override; + event::ptr enqueue_kernel(kernel& kernel, + const kernel_arguments_desc& args_desc, + const kernel_arguments_data& args, + std::vector const& deps, + bool is_output = false) override; + event::ptr enqueue_marker(std::vector const& deps, bool is_output) override; + event::ptr group_events(std::vector const& deps) override; + void wait_for_events(const std::vector& events) override; + void enqueue_barrier() override; + event::ptr create_user_event(bool set) override; + event::ptr create_base_event() override; + std::unique_ptr create_surfaces_lock(const std::vector &mem) const override; + +#ifdef ENABLE_ONEDNN_FOR_GPU + dnnl::stream& get_onednn_stream() 
override; +#endif + +private: + void sync_events(std::vector const& deps, bool is_output = false); + + const ze_engine& _engine; + mutable ze_command_list_handle_t m_command_list = 0; + mutable std::atomic m_queue_counter{0}; + std::atomic m_last_barrier{0}; + std::shared_ptr m_last_barrier_ev = nullptr; + std::unique_ptr m_ev_factory; + +#ifdef ENABLE_ONEDNN_FOR_GPU + std::shared_ptr _onednn_stream = nullptr; +#endif +}; + +} // namespace ze +} // namespace cldnn diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt index 2edaf4be161991..3dd2da44dc6d41 100644 --- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt @@ -32,12 +32,12 @@ ov_add_test_target( LINK_LIBRARIES openvino::reference funcSharedTests - OpenCL::NewHeaders # should come before OpenCL::OpenCL - OpenCL::OpenCL LABELS OV GPU ) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) + if(ENABLE_PROXY) target_compile_definitions(${TARGET_NAME} PUBLIC PROXY_PLUGIN_ENABLED) endif() diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ocl_remote_tensor_tests.cpp similarity index 99% rename from src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp rename to src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ocl_remote_tensor_tests.cpp index 70298a79449971..38c548c4820276 100644 --- a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp +++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ocl_remote_tensor_tests.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // +#ifdef OV_GPU_WITH_OCL_RT + #include "openvino/core/preprocess/pre_post_process.hpp" #include "openvino/op/add.hpp" #include "openvino/op/constant.hpp" @@ -2945,3 +2947,4 @@ 
INSTANTIATE_TEST_SUITE_P(smoke_RemoteTensorDataType, OVRemoteTensorDataType_Test ov::element::Type_t::u16, ov::element::Type_t::u32)), OVRemoteTensorDataType_Test::getTestCaseName); +#endif // OV_GPU_WITH_OCL_RT diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp new file mode 100644 index 00000000000000..b593552272d691 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/ze_remote_tensor_tests.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef OV_GPU_WITH_ZE_RT + +#include "openvino/runtime/intel_gpu/remote_properties.hpp" +#include "openvino/runtime/remote_tensor.hpp" + +#include "shared_test_classes/base/ov_behavior_test_utils.hpp" + +TEST(ZeRemoteContext, smoke_CorrectContextType) { + auto core = ov::Core(); + auto remote_context = core.get_default_context(ov::test::utils::DEVICE_GPU); + ASSERT_EQ(remote_context.get_params().at(ov::intel_gpu::context_type.name()), ov::intel_gpu::ContextType::ZE); +} + +#endif // OV_GPU_WITH_ZE_RT diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp index 09b5052df9dc07..3ca4052b22d01d 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/serialize.cpp @@ -46,7 +46,7 @@ class LSTMSequenceTest : virtual public SerializeBaseTest { class GRUSequenceTest : virtual public SerializeBaseTest { public: - void SetUp() { + void SetUp() override { std::string cacheDirName = "cache_gru"; auto init_shape = ov::PartialShape({1, 30, 512}); auto batch_size = static_cast(init_shape[0].get_length()); diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp 
b/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp index 38be79bcb5391e..f2fae883a0f718 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/vlsdpa.cpp @@ -11,7 +11,6 @@ #include "shared_test_classes/base/ov_subgraph.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/engine_configuration.hpp" -#include "openvino/runtime/intel_gpu/ocl/ocl.hpp" #include "openvino/opsets/opset13.hpp" #include "ov_ops/vl_sdpa.hpp" diff --git a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt index 1616c9e0bb687a..8b56fedf4d4a06 100644 --- a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt @@ -34,6 +34,24 @@ file(GLOB_RECURSE SOURCES_MAIN "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/simple_math.cpp" ) +# Those tests have dependency on OpenCL runtime +# Need to be excluded from the build with a different runtime +file(GLOB_RECURSE SOURCES_WITH_OCL_RT + "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/device_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/engine_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/events_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/network_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/module_tests/usm_memory_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/test_cases/convert_color_gpu_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/test_cases/cl_mem_input_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/test_cases/mem_perf_test.cpp" + ) +if (NOT GPU_RT_TYPE STREQUAL "OCL") + foreach (SOURCE_FILE IN LISTS SOURCES_WITH_OCL_RT) + list (REMOVE_ITEM SOURCES_MAIN ${SOURCE_FILE}) + endforeach() +endif() + if (NOT ENABLE_ONEDNN_FOR_GPU) set(EXCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/onednn/") foreach (SOURCE_FILE IN LISTS SOURCES_MAIN) @@ -60,6 +78,7 @@ add_executable(${TARGET_NAME} ${SOURCES_ALL}) target_compile_definitions(${TARGET_NAME} PRIVATE 
CI_BUILD_NUMBER="") ov_set_threading_interface_for(${TARGET_NAME}) +ov_gpu_set_runtime_interface_for(${TARGET_NAME}) # Workaround to avoid warnings during LTO build if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") @@ -69,7 +88,6 @@ endif() set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) target_link_libraries(${TARGET_NAME} PRIVATE openvino_intel_gpu_graph - OpenCL::OpenCL gtest gtest_main gflags diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp index 33c6ada95eab2d..822f618b0e49b2 100644 --- a/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/module_tests/device_test.cpp @@ -56,7 +56,7 @@ TEST(devices_test, sort_order_single_vendor) { devices_list.push_back(std::make_shared(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++)); devices_list.push_back(std::make_shared(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++)); - auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list); + auto sorted_list = sort_devices(devices_list); std::vector expected_devices_order = {2, 0, 1, 3, 4}; @@ -77,7 +77,7 @@ TEST(devices_test, sort_order_two_vendors) { devices_list.push_back(std::make_shared(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++)); devices_list.push_back(std::make_shared(INTEL_VENDOR_ID, device_type::integrated_gpu, device_id++)); - auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list); + auto sorted_list = sort_devices(devices_list); std::vector expected_devices_order = {3, 2, 0, 1}; @@ -101,7 +101,7 @@ TEST(devices_test, sort_order_three_vendors) { devices_list.push_back(std::make_shared(OTHER_VENDOR_ID2, device_type::discrete_gpu, device_id++)); devices_list.push_back(std::make_shared(OTHER_VENDOR_ID2, device_type::discrete_gpu, device_id++)); - auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list); + auto 
sorted_list = sort_devices(devices_list); std::vector expected_devices_order = {2, 3, 0, 1, 4, 5}; diff --git a/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp index 30225132c35488..871038a6c17f31 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/kernels_cache_test.cpp @@ -4,7 +4,6 @@ #include "test_utils.h" -#include "runtime/ocl/ocl_kernel.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/graph/program.hpp" #include "intel_gpu/graph/network.hpp" @@ -181,12 +180,7 @@ TEST(kernels_cache, reuse_kernels_property) { auto conv1_kern = cache.get_cached_kernel_id(conv1_kernels[idx]); auto conv2_kern = cache.get_cached_kernel_id(conv2_kernels[idx]); ASSERT_EQ(conv1_kern, conv2_kern); - - auto conv1_ocl_kernel = std::dynamic_pointer_cast(conv1_kernels[idx]); - auto conv2_ocl_kernel = std::dynamic_pointer_cast(conv2_kernels[idx]); - if (conv1_ocl_kernel && conv2_ocl_kernel) { - ASSERT_EQ(conv1_ocl_kernel->get_handle().get(), conv2_ocl_kernel->get_handle().get()); - } + ASSERT_TRUE(conv1_kernels[idx]->is_same(*conv2_kernels[idx].get())); } auto& concat1_node = prog->get_node("concat1"); @@ -200,11 +194,6 @@ TEST(kernels_cache, reuse_kernels_property) { auto concat1_kern = cache.get_cached_kernel_id(concat1_kernels[idx]); auto concat2_kern = cache.get_cached_kernel_id(concat2_kernels[idx]); ASSERT_EQ(concat1_kern, concat2_kern); - - auto concat1_ocl_kernel = std::dynamic_pointer_cast(concat1_kernels[idx]); - auto concat2_ocl_kernel = std::dynamic_pointer_cast(concat2_kernels[idx]); - if (concat1_ocl_kernel && concat2_ocl_kernel) { - ASSERT_EQ(concat1_ocl_kernel->get_handle().get(), concat2_ocl_kernel->get_handle().get()); - } + ASSERT_TRUE(concat1_kernels[idx]->is_same(*concat2_kernels[idx].get())); } } diff --git a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp 
b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp index e4fbb1f3d3b228..be8e390749b801 100644 --- a/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_utils/test_utils.cpp @@ -306,7 +306,8 @@ cldnn::ExecutionConfig get_test_default_config(const cldnn::engine& engine, } std::shared_ptr create_test_engine() { - auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl); + auto ret = cldnn::engine::create( + cldnn::device_query::get_default_engine_type(), cldnn::device_query::get_default_runtime_type()); #ifdef ENABLE_ONEDNN_FOR_GPU if (ret->get_device_info().supports_immad) ret->create_onednn_engine({}); diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index b9a904a03f6271..d876d52f97baed 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -9,7 +9,7 @@ set(XETLA_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/xetla/include/" CACHE PATH "P # if(ENABLE_ONEDNN_FOR_GPU) - function(build_onednn_gpu) + function(build_onednn_gpu GPU_RUNTIME) include(ExternalProject) set(ONEDNN_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_build") set(ONEDNN_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_install" CACHE PATH "Installation path for oneDNN GPU library") @@ -99,11 +99,16 @@ if(ENABLE_ONEDNN_FOR_GPU) set(onednn_gpu_lib "${CMAKE_STATIC_LIBRARY_PREFIX}${DNNL_GPU_LIBRARY_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") set(ONEDNN_GPU_LIB_PATH ${ONEDNN_INSTALL_DIR}/lib/${onednn_gpu_lib} CACHE FILEPATH "Path to oneDNN GPU library") + if(GPU_RUNTIME STREQUAL "L0") + set(ONEDNN_GPU_DIR ${CMAKE_CURRENT_SOURCE_DIR}/l0_onednn_gpu CACHE FILEPATH "Path to oneDNN GPU repository") + elseif(GPU_RUNTIME STREQUAL "OCL") + set(ONEDNN_GPU_DIR ${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu CACHE FILEPATH "Path to oneDNN GPU repository") + endif() ExternalProject_Add(onednn_gpu_build # Directory Options: PREFIX 
"${ONEDNN_PREFIX_DIR}" - SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu" + SOURCE_DIR "${ONEDNN_GPU_DIR}" BINARY_DIR "${ONEDNN_BUILD_DIR}" INSTALL_DIR "${ONEDNN_INSTALL_DIR}" # Configure Step Options: @@ -113,7 +118,7 @@ if(ENABLE_ONEDNN_FOR_GPU) "-DCMAKE_POLICY_DEFAULT_CMP0069=NEW" "-DDNNL_TARGET_ARCH=${ONEDNN_TARGET_ARCH}" "-DDNNL_CPU_RUNTIME=NONE" - "-DDNNL_GPU_RUNTIME=OCL" + "-DDNNL_GPU_RUNTIME=${GPU_RUNTIME}" "-DDNNL_LIBRARY_NAME=${DNNL_GPU_LIBRARY_NAME}" "-DCMAKE_INSTALL_PREFIX=${ONEDNN_INSTALL_DIR}" "-DDNNL_ENABLE_CONCURRENT_EXEC=ON" @@ -155,13 +160,12 @@ if(ENABLE_ONEDNN_FOR_GPU) DEPENDEES install # Ensures this runs after install ) endif() - set(LIB_INCLUDE_DIRS "${ONEDNN_INSTALL_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/ngen" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/jit/config" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src/gpu/intel/gemm/jit/include" - "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/third_party/ngen") + "${ONEDNN_GPU_DIR}/src" + "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/ngen" + "${ONEDNN_GPU_DIR}/src/gpu/intel/jit/config" + "${ONEDNN_GPU_DIR}/src/gpu/intel/gemm/jit/include" + "${ONEDNN_GPU_DIR}/third_party/ngen") set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU DNNL_DLL DNNL_DLL_EXPORTS @@ -188,5 +192,5 @@ if(ENABLE_ONEDNN_FOR_GPU) COMPONENT ${OV_CPACK_COMP_CORE}) endif() endfunction() - build_onednn_gpu() + build_onednn_gpu(${GPU_RT_TYPE}) endif() diff --git a/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu new file mode 160000 index 00000000000000..9cb2921cc19dc9 --- /dev/null +++ b/src/plugins/intel_gpu/thirdparty/l0_onednn_gpu @@ -0,0 +1 @@ +Subproject commit 9cb2921cc19dc9ca74309bb65149ac002ea56b2c diff --git a/thirdparty/dependencies.cmake b/thirdparty/dependencies.cmake index 371ea475b13214..5a8b5ef918a3c1 100644 --- a/thirdparty/dependencies.cmake +++ b/thirdparty/dependencies.cmake @@ -68,7 +68,7 @@ 
 endif()

 # LevelZero
 #
-if(ENABLE_INTEL_NPU)
+if(ENABLE_INTEL_GPU OR ENABLE_INTEL_NPU)
     if(ENABLE_SYSTEM_LEVEL_ZERO)
         pkg_search_module(level_zero QUIET
                           IMPORTED_TARGET
diff --git a/thirdparty/level_zero/CMakeLists.txt b/thirdparty/level_zero/CMakeLists.txt
index b78f115b32f9a9..0fbbd2ad303106 100644
--- a/thirdparty/level_zero/CMakeLists.txt
+++ b/thirdparty/level_zero/CMakeLists.txt
@@ -30,7 +30,24 @@ endif()
 set(CMAKE_COMPILE_WARNING_AS_ERROR OFF)
 add_subdirectory(level-zero EXCLUDE_FROM_ALL)
-set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include>)
+set(ZE_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/include/")
+file(GLOB_RECURSE COMPUTE_RUNTIME_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/compute-runtime/*.h")
+file(GLOB_RECURSE LEVEL_ZERO_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include/*.h")
+add_custom_command(OUTPUT "${ZE_INCLUDE_DIR}"
+    COMMAND "${CMAKE_COMMAND}" -E copy_directory "${CMAKE_CURRENT_SOURCE_DIR}/level-zero/include" "${ZE_INCLUDE_DIR}/level_zero"
+    COMMAND "${CMAKE_COMMAND}" -E copy_directory "${CMAKE_CURRENT_SOURCE_DIR}/compute-runtime" "${ZE_INCLUDE_DIR}/level_zero"
+    DEPENDS "${COMPUTE_RUNTIME_HEADERS}" "${LEVEL_ZERO_HEADERS}"
+    COMMENT "Copying Level Zero and compute-runtime headers..."
+)
+add_custom_target(prepare_ze_headers ALL DEPENDS "${ZE_INCLUDE_DIR}")
+add_dependencies(ze_loader prepare_ze_headers)
+
+# Allow include patterns with and without level-zero/ prefix
+set_property(TARGET ze_loader APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+    $<BUILD_INTERFACE:${ZE_INCLUDE_DIR}>
+    $<BUILD_INTERFACE:${ZE_INCLUDE_DIR}/level_zero>
+)
+
 # This VERSION file created by L0 may cause compilation issue of oneTBB headers, so remove it
 file(REMOVE "${CMAKE_BINARY_DIR}/VERSION")
diff --git a/thirdparty/level_zero/compute-runtime/ze_intel_gpu.h b/thirdparty/level_zero/compute-runtime/ze_intel_gpu.h
new file mode 100644
index 00000000000000..d7c9f3f56aa6bf
--- /dev/null
+++ b/thirdparty/level_zero/compute-runtime/ze_intel_gpu.h
@@ -0,0 +1,819 @@
+// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac
+/*
+ * Copyright (C) 2020-2025 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#ifndef _ZE_INTEL_GPU_H
+#define _ZE_INTEL_GPU_H
+
+#include <level_zero/ze_api.h>
+
+#include "ze_stypes.h"
+
+#if defined(__cplusplus)
+#pragma once
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#define ZE_INTEL_GPU_VERSION_MAJOR 0
+#define ZE_INTEL_GPU_VERSION_MINOR 1
+
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME
+/// @brief Module DP properties driver extension name
+#define ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME "ZE_intel_experimental_device_module_dp_properties"
+#endif // ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Module DP properties driver extension Version(s)
+typedef enum _ze_intel_device_module_dp_properties_exp_version_t {
+    ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0),     ///< version 1.0
+    ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version
+    ZE_INTEL_DEVICE_MODULE_DP_PROPERTIES_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} ze_intel_device_module_dp_properties_exp_version_t;
+ +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported Dot Product flags +typedef uint32_t ze_intel_device_module_dp_exp_flags_t; +typedef enum _ze_intel_device_module_dp_exp_flag_t { + ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DP4A = ZE_BIT(0), ///< Supports DP4A operation + ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS = ZE_BIT(1), ///< Supports DPAS operation + ZE_INTEL_DEVICE_MODULE_EXP_FLAG_FORCE_UINT32 = 0x7fffffff + +} ze_intel_device_module_dp_exp_flag_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Device Module dot product properties queried using +/// ::zeDeviceGetModuleProperties +/// +/// @details +/// - This structure may be passed to ::zeDeviceGetModuleProperties, via +/// `pNext` member of ::ze_device_module_properties_t. +/// @brief Device module dot product properties +typedef struct _ze_intel_device_module_dp_exp_properties_t { + ze_structure_type_ext_t stype = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES; ///< [in] type of this structure + void *pNext; ///< [in,out][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains sType and pNext). 
+ ze_intel_device_module_dp_exp_flags_t flags; ///< [out] 0 (none) or a valid combination of ::ze_intel_device_module_dp_flag_t +} ze_intel_device_module_dp_exp_properties_t; + +#ifndef ZE_INTEL_COMMAND_LIST_MEMORY_SYNC +/// @brief Cmd List memory sync extension name +#define ZE_INTEL_COMMAND_LIST_MEMORY_SYNC "ZE_intel_experimental_command_list_memory_sync" +#endif // ZE_INTEL_COMMAND_LIST_MEMORY_SYNC + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Cmd List memory sync extension Version(s) +typedef enum _ze_intel_command_list_memory_sync_exp_version_t { + ZE_INTEL_COMMAND_LIST_MEMORY_SYNC_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_COMMAND_LIST_MEMORY_SYNC_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_COMMAND_LIST_MEMORY_SYNC_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} ze_intel_command_list_memory_sync_exp_version_t; + +#ifndef ZE_INTEL_STRUCTURE_TYPE_DEVICE_COMMAND_LIST_WAIT_ON_MEMORY_DATA_SIZE_EXP_DESC +/// @brief stype for _ze_intel_device_command_list_wait_on_memory_data_size_exp_desc_t +#endif + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Extended descriptor for cmd list memory sync +/// +/// @details +/// - Implementation must support ::ZE_intel_experimental_command_list_memory_sync extension +/// - May be passed to ze_device_properties_t through pNext. +typedef struct _ze_intel_device_command_list_wait_on_memory_data_size_exp_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). 
+ uint32_t cmdListWaitOnMemoryDataSizeInBytes; /// Defines supported data size for zexCommandListAppendWaitOnMemory[64] API +} ze_intel_device_command_list_wait_on_memory_data_size_exp_desc_t; + +#ifndef ZEX_INTEL_EVENT_SYNC_MODE_EXP_NAME +/// @brief Event sync mode extension name +#define ZEX_INTEL_EVENT_SYNC_MODE_EXP_NAME "ZEX_intel_experimental_event_sync_mode" +#endif // ZE_INTEL_EVENT_SYNC_MODE_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Event sync mode extension Version(s) +typedef enum _zex_intel_event_sync_mode_exp_version_t { + ZEX_INTEL_EVENT_SYNC_MODE_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZEX_INTEL_EVENT_SYNC_MODE_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZEX_INTEL_EVENT_SYNC_MODE_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} zex_intel_event_sync_mode_exp_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported event sync mode flags +typedef uint32_t zex_intel_event_sync_mode_exp_flags_t; +typedef enum _zex_intel_event_sync_mode_exp_flag_t { + ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_LOW_POWER_WAIT = ZE_BIT(0), ///< Low power host synchronization mode, for better CPU utilization + ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_SIGNAL_INTERRUPT = ZE_BIT(1), ///< Generate interrupt when Event is signalled on Device + ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_EXTERNAL_INTERRUPT_WAIT = ZE_BIT(2), ///< Host synchronization APIs wait for external interrupt. Can be used only for Events created via zexCounterBasedEventCreate + ZEX_INTEL_EVENT_SYNC_MODE_EXP_EXP_FLAG_FORCE_UINT32 = 0x7fffffff + +} zex_intel_event_sync_mode_exp_flag_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Extended descriptor for event sync mode +/// +/// @details +/// - Implementation must support ::ZEX_intel_experimental_event_sync_mode extension +/// - May be passed to ze_event_desc_t through pNext. 
+typedef struct _zex_intel_event_sync_mode_exp_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + zex_intel_event_sync_mode_exp_flags_t syncModeFlags; /// valid combination of ::ze_intel_event_sync_mode_exp_flag_t + uint32_t externalInterruptId; /// External interrupt id. Used only when ZEX_INTEL_EVENT_SYNC_MODE_EXP_FLAG_EXTERNAL_INTERRUPT_WAIT flag is set +} zex_intel_event_sync_mode_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Forward-declare zex_intel_queue_allocate_msix_hint_exp_desc_t +typedef struct _zex_intel_queue_allocate_msix_hint_exp_desc_t zex_intel_queue_allocate_msix_hint_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Command queue descriptor for allocating unique msix. This structure may be +/// passed as pNext member of ::ze_command_queue_desc_t. + +typedef struct _zex_intel_queue_allocate_msix_hint_exp_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + ze_bool_t uniqueMsix; ///< [in] If set, try to allocate unique msix for command queue. + ///< If not set, driver will follow default behaviour. It may share msix for signaling completion with other queues. + ///< Number of unique msixes may be limited. On unsuccessful allocation, queue or immediate cmd list creation API fallbacks to default behaviour. + +} zex_intel_queue_allocate_msix_hint_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Command queue descriptor for enabling copy operations offload. This structure may be +/// passed as pNext member of ::ze_command_queue_desc_t. 
+ +typedef struct _zex_intel_queue_copy_operations_offload_hint_exp_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + ze_bool_t copyOffloadEnabled; ///< [in] If set, try to offload copy operations to different engines. Applicable only for compute queues. + ///< This is only a hint. Driver may ignore it per append call, based on platform capabilities or internal heuristics. + ///< If not set, driver will follow default behaviour. Copy operations will be submitted to same engine as compute operations. + +} zex_intel_queue_copy_operations_offload_hint_exp_desc_t; + +#ifndef ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME +/// @brief Queue copy operations offload hint extension name +#define ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME "ZEX_intel_experimental_queue_copy_operations_offload_hint" +#endif // ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Queue copy operations offload hint extension version(s) +typedef enum _zex_intel_queue_copy_operations_offload_hint_exp_version_t { + ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZEX_INTEL_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} zex_intel_queue_copy_operations_offload_hint_exp_version_t; + +#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Command queue flag for enabling copy operations offload +/// +/// If set, try to offload copy operations to different engines. Applicable only for compute queues. +/// This is only a hint. 
Driver may ignore it per append call, based on platform capabilities or internal heuristics. +#define ZE_COMMAND_QUEUE_FLAG_COPY_OFFLOAD_HINT ZE_BIT(2) + +#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) + +#ifndef ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME +/// @brief Extension name for query to read the Intel Level Zero Driver Version String +#define ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME "ZE_intel_get_driver_version_string" +#endif // ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Query to read the Intel Level Zero Driver Version String extension version(s) +typedef enum _ze_intel_get_driver_version_string_exp_version_t { + ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_GET_DRIVER_VERSION_STRING_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} ze_intel_get_driver_version_string_exp_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported 2D Block Array flags +typedef uint32_t ze_intel_device_block_array_exp_flags_t; +typedef enum _ze_intel_device_block_array_exp_flag_t { + ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_STORE = ZE_BIT(0), ///< Supports store operation + ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_LOAD = ZE_BIT(1), ///< Supports load operation + ZE_INTEL_DEVICE_EXP_FLAG_2D_BLOCK_FORCE_UINT32 = 0x7fffffff + +} ze_intel_device_block_array_exp_flag_t; + +/////////////////////////////////////////////////////////////////////////////// +#ifndef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME +/// @brief Device 2D block array properties driver extension name +#define ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME "ZE_intel_experimental_device_block_array_properties" +#endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME + +/// @brief Device 2D block array properties queried using +/// 
::zeDeviceGetProperties +/// +/// @details +/// - This structure may be passed to ::zeDeviceGetProperties, via +/// `pNext` member of ::ze_device_properties_t. +/// @brief Device 2D block array properties + +typedef struct _ze_intel_device_block_array_exp_properties_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + void *pNext; ///< [in,out][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains sType and pNext). + ze_intel_device_block_array_exp_flags_t flags; ///< [out] 0 (none) or a valid combination of ::ze_intel_device_block_array_exp_flag_t +} ze_intel_device_block_array_exp_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Device 2D block array properties driver extension versions +typedef enum _ze_intel_device_block_array_exp_properties_version_t { + ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES_VERSION_FORCE_UINT32 = 0x7fffffff + +} ze_intel_device_block_array_exp_properties_version_t; + +/// @brief Query to read the Intel Level Zero Driver Version String +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - The Driver Version String will be in the format: +/// - Major.Minor.Patch+Optional per semver guidelines https://semver.org/#spec-item-10 +/// @returns +/// - ::ZE_RESULT_SUCCESS +ZE_APIEXPORT ze_result_t ZE_APICALL +zeIntelGetDriverVersionString( + ze_driver_handle_t hDriver, ///< [in] Driver handle whose version is being read. + char *pDriverVersion, ///< [in,out] pointer to driver version string. + size_t *pVersionSize); ///< [in,out] pointer to the size of the driver version string. 
+ ///< if size is zero, then the size of the version string is returned. + +/// @brief Get Kernel Program Binary +/// +/// @details +/// - A valid kernel handle must be created with zeKernelCreate. +/// - Returns Intel Graphics Assembly (GEN ISA) format binary program data for kernel handle. +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// @returns +/// - ::ZE_RESULT_SUCCESS + +/////////////////////////////////////////////////////////////////////////////// +#ifndef ZEX_MEMORY_FREE_CALLBACK_EXT_NAME +/// @brief Memory Free Callback Extension Name +#define ZEX_MEMORY_FREE_CALLBACK_EXT_NAME "ZEX_extension_memory_free_callback" + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Memory Free Callback Extension Version(s) +typedef enum _zex_memory_free_callback_ext_version_t { + ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_FORCE_UINT32 = 0x7fffffff ///< Value marking end of ZEX_MEMORY_FREE_CALLBACK_EXT_VERSION_* ENUMs + +} zex_memory_free_callback_ext_version_t; + +#ifndef ZEX_STRUCTURE_TYPE_MEMORY_FREE_CALLBACK_EXT_DESC +/// @brief stype for _zex_memory_free_callback_ext_desc_t +#endif + +/** + * @brief Callback function type for memory free events. + * + * This function is called when a memory free operation occurs. + * + * @param pUserData Pointer to user-defined data passed to the callback. + */ +typedef void (*zex_mem_free_callback_fn_t)(void *pUserData); + +/** + * @brief Descriptor for a memory free callback extension. + * + * This structure is used to specify a callback function that will be invoked when memory is freed. + * + * Members: + * - stype: Specifies the type of this structure. 
+ * - pNext: Optional pointer to an extension-specific structure; must be null or point to a structure containing stype and pNext. + * - pfnCallback: Callback function to be called when memory is freed. + * - pUserData: Optional user data to be passed to the callback function. + */ +typedef struct _zex_memory_free_callback_ext_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + zex_mem_free_callback_fn_t pfnCallback; // [in] callback function to be called on memory free + void *pUserData; // [in][optional] user data passed to callback +} zex_memory_free_callback_ext_desc_t; + +/** + * @brief Registers a callback to be invoked when memory is freed. + * + * This function allows the user to register a callback that will be called + * whenever the specified memory is freed within the given context. + * + * @param hContext + * [in] Handle to the context in which the memory was allocated. + * @param hFreeCallbackDesc + * [in] Pointer to a descriptor specifying the callback function and its parameters. + * @param ptr + * [in] Pointer to the memory for which the free callback is to be registered. + * +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_UNINITIALIZED +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_INVALID_ARGUMENT +/// + `nullptr == hFreeCallbackDesc` +/// + `nullptr == ptr` + * + * @note The callback will be invoked when the specified memory is freed. 
+ */ +ZE_APIEXPORT ze_result_t ZE_APICALL zexMemFreeRegisterCallbackExt(ze_context_handle_t hContext, zex_memory_free_callback_ext_desc_t *hFreeCallbackDesc, void *ptr); +#endif // ZEX_MEMORY_FREE_CALLBACK_EXT_NAME + +#ifndef ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME +/// @brief Get Kernel Program Binary experimental name +#define ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME "ZE_intel_experimental_kernel_get_program_binary" +#endif // ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intel Kernel Get Binary Extension Version(s) +typedef enum _ze_intel_kernel_get_binary_exp_version_t { + ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_KERNEL_GET_PROGRAM_BINARY_EXP_VERSION_FORCE_UINT32 = 0x7fffffff + +} ze_intel_kernel_get_binary_exp_version_t; + +ZE_APIEXPORT ze_result_t ZE_APICALL +zeIntelKernelGetBinaryExp( + ze_kernel_handle_t hKernel, ///< [in] Kernel handle + size_t *pSize, ///< [in, out] pointer to variable with size of GEN ISA binary + char *pKernelBinary ///< [in,out] pointer to storage area for GEN ISA binary function +); + +#ifndef ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_NAME +/// @brief DRM format modifier extension name +#define ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_NAME "ZE_intel_experimental_drm_format_modifier" +#endif // ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief DRM format modifier extension Version(s) +typedef enum _ze_intel_drm_format_modifier_exp_version_t { + ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_INTEL_DRM_FORMAT_MODIFIER_EXP_VERSION_FORCE_UINT32 = 0x7fffffff +} 
ze_intel_drm_format_modifier_exp_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Image DRM format modifier properties +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_image_desc_t, +/// when using a DRM format modifier. +/// - Properties struct for providing user with the selected drm format modifier for the image +/// - This is useful if the application wants to export the image to another API that requires the DRM format modifier +/// - The application can query the chosen DRM format modifier for the image. +/// - The application can use this information to choose a DRM format modifier for the image during creation +typedef struct _ze_intel_image_selected_format_modifier_exp_properties_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint64_t drmFormatModifier; ///< [out] DRM format modifier +} ze_intel_image_selected_format_modifier_exp_properties_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Image DRM format modifier create list +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_image_desc_t, +/// when providing a list of DRM format modifiers to choose from during image creation. +/// - This is a descriptor for creating image with the specified list of drm format modifier +/// - If the user passes a list struct, then implementation chooses one from the list of drm modifiers as it sees fit. 
+/// - If user wants to pass a single drm modifier then they can set the drmFormatModifierCount to 1 and pass the single drm modifier in pDrmFormatModifiers +typedef struct _ze_intel_image_format_modifier_create_list_exp_desc_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint32_t drmFormatModifierCount; ///< [in] number of DRM format modifiers in the list + uint64_t *pDrmFormatModifiers; ///< [in][range(0, drmFormatModifierCount)] array of DRM format modifiers +} ze_intel_image_format_modifier_create_list_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Image DRM format modifier import descriptor +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_image_desc_t, +/// when importing an image with a specific DRM format modifier. +/// - The pNext chain is setup accordingly in ze_image_desc_t prior to calling zeImageCreate API +typedef struct _ze_intel_image_format_modifier_import_exp_desc_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint64_t drmFormatModifier; ///< [in] DRM format modifier to use for the image +} ze_intel_image_format_modifier_import_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Buffer DRM format modifier create list +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_device_mem_alloc_desc_t, +/// when providing a list of DRM format modifiers to choose from during buffer creation. 
+/// - This is a descriptor for creating buffer with the specified list of drm format modifier +/// - If the user passes a list struct, then implementation chooses one from the list of drm modifiers as it sees fit. +/// - If user wants to pass a single drm modifier then they can set the drmFormatModifierCount to 1 and pass the single drm modifier in pDrmFormatModifiers +/// - The pNext chain is setup accordingly in ze_device_mem_alloc_desc_t prior to calling zeMemAllocDevice API +typedef struct _ze_intel_mem_format_modifier_create_list_exp_desc_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint32_t drmFormatModifierCount; ///< [in] number of DRM format modifiers in the list + uint64_t *pDrmFormatModifiers; ///< [in][range(0, drmFormatModifierCount)] array of DRM format modifiers +} ze_intel_mem_format_modifier_create_list_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Buffer DRM format modifier import descriptor +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_device_mem_alloc_desc_t, +/// when importing a buffer with a specific DRM format modifier. +/// - This descriptor must be used in conjunction with ze_external_memory_import_fd_t. If not, implementation will return an error. +/// - The pNext chain is setup accordingly in ze_device_mem_alloc_desc_t prior to calling zeMemAllocDevice API +typedef struct _ze_intel_mem_format_modifier_import_exp_desc_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). 
+ uint64_t drmFormatModifier; ///< [in] DRM format modifier to use for the buffer +} ze_intel_mem_format_modifier_import_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Buffer DRM format modifier properties +/// +/// @details +/// - This structure may be passed as pNext member of ::ze_memory_allocation_properties_t, +/// when querying the DRM format modifier of a buffer. +/// - Properties struct for providing user with the selected drm format modifier for the buffer +/// - This is useful if the application wants to export the buffer to another API that requires the DRM format modifier +/// - The application can query the chosen DRM format modifier for the buffer via zeMemGetAllocProperties API +typedef struct _ze_intel_mem_selected_format_modifier_exp_properties_t { + ze_structure_type_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + uint64_t drmFormatModifier; ///< [out] DRM format modifier +} ze_intel_mem_selected_format_modifier_exp_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Query for supported DRM format modifiers for a given image descriptor +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - This function can be used to query supported DRM format modifiers for a specific image description. +/// - User can use this API in two ways: +/// 1. Set pCount to the address of a uint32_t with value 0 and pDrmFormatModifiers to nullptr +/// to query just the number of supported DRM format modifiers. +/// 2. Set pCount to the address of a uint32_t with the number of elements in the pDrmFormatModifiers +/// array to retrieve the list of supported DRM format modifiers. 
+/// - The application can use the returned DRM format modifiers to: +/// 1. Create L0 images with supported DRM format modifiers. +/// 2. Compare with DRM format modifiers from other APIs (like Vulkan) to find common +/// modifiers that work for interop scenarios. +/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_UNINITIALIZED +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `nullptr == hDevice` +/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER +/// + `nullptr == pCount` +/// - ::ZE_RESULT_ERROR_INVALID_IMAGE_DESC +/// + The image description doesn't match the device capabilities +ze_result_t ZE_APICALL +zeIntelImageGetFormatModifiersSupportedExp( + ze_device_handle_t hDevice, ///< [in] handle of the device + const ze_image_desc_t *pImageDesc, ///< [in] pointer to image descriptor + uint32_t *pCount, ///< [in,out] pointer to the number of DRM format modifiers. + ///< if count is zero, then the driver shall update the value with the + ///< total number of supported DRM format modifiers for the image format. + ///< if count is greater than the number of supported DRM format modifiers, + ///< then the driver shall update the value with the correct number of supported DRM format modifiers. + uint64_t *pDrmFormatModifiers ///< [in,out][optional][range(0, *pCount)] array of supported DRM format modifiers +); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Query for supported DRM format modifiers for a memory allocation descriptor +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - This function can be used to query supported DRM format modifiers for a specific memory allocation description. 
+/// - User can use this API in two ways: +/// 1. Set pCount to the address of a uint32_t with value 0 and pDrmFormatModifiers to nullptr +/// to query just the number of supported DRM format modifiers. +/// 2. Set pCount to the address of a uint32_t with the number of elements in the pDrmFormatModifiers +/// array to retrieve the list of supported DRM format modifiers. +/// - The application can use the returned DRM format modifiers to: +/// 1. Create L0 memory allocations with supported DRM format modifiers. +/// 2. Compare with DRM format modifiers from other APIs (like Vulkan) to find common +/// modifiers that work for interop scenarios. +/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_UNINITIALIZED +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `nullptr == hContext` +ze_result_t ZE_APICALL +zeIntelMemGetFormatModifiersSupportedExp( + ze_context_handle_t hContext, ///< [in] handle of the context + const ze_device_mem_alloc_desc_t *pDeviceDesc, ///< [in] pointer to device memory allocation descriptor + size_t size, ///< [in] size in bytes to allocate + size_t alignment, ///< [in] minimum alignment in bytes for the allocation + ze_device_handle_t hDevice, ///< [in] handle of the device + uint32_t *pCount, ///< [in,out] pointer to the number of DRM format modifiers. + ///< if count is zero, then the driver shall update the value with the + ///< total number of supported DRM format modifiers for the memory allocation. + ///< if count is greater than the number of supported DRM format modifiers, + ///< then the driver shall update the value with the correct number of supported DRM format modifiers. 
+    uint64_t *pDrmFormatModifiers ///< [in,out][optional][range(0, *pCount)] array of supported DRM format modifiers
+);
+
+/// @brief Get priority levels
+///
+/// @details
+/// - The application may call this function from simultaneous threads.
+/// - The implementation of this function should be lock-free.
+/// - Returns priority levels supported by the device
+/// - lowestPriority reports the numerical value that corresponds to lowest queue priority
+/// - highestPriority reports the numerical value that corresponds to highest queue priority
+/// - Lower numbers indicate greater priorities
+/// - The range of meaningful queue properties is represented by [*highestPriority, *lowestPriority]
+/// - Priority passed upon queue creation would automatically clamp down or up to the nearest supported value
+/// - 0 means default priority
+///
+/// @returns
+/// - ::ZE_RESULT_SUCCESS
+/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+ze_result_t ZE_APICALL zeDeviceGetPriorityLevels(
+    ze_device_handle_t hDevice,
+    int32_t *lowestPriority,
+    int32_t *highestPriority);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Descriptor used for setting priority on command queues and immediate command lists.
+/// This structure may be passed as pNext member of ::ze_command_queue_desc_t.
+typedef struct _ze_queue_priority_desc_t {
+    ze_structure_type_ext_t stype; ///< [in] type of this structure
+    const void *pNext;             ///< [in][optional] must be null or a pointer to an extension-specific structure
+    int priority;                  ///< [in] priority of the queue
+} ze_queue_priority_desc_t;
+
+/// @brief Get default context associated with default driver
+///
+/// @details
+/// - The application may call this function from simultaneous threads.
+/// - The implementation of this function should be lock-free.
+/// - Default context contains all devices within default driver instance +/// @returns +/// - Context handle associated with default driver +ZE_APIEXPORT ze_context_handle_t ZE_APICALL zerGetDefaultContext(); + +/// @brief Get Device Identifier +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Returned identifier is a 32-bit unsigned integer that is unique to the driver. +/// - The identifier can be used then in zerTranslateIdentifierToDeviceHandle to get the device handle. +/// @returns +/// - 32-bit unsigned integer identifier +ZE_APIEXPORT uint32_t ZE_APICALL zerTranslateDeviceHandleToIdentifier(ze_device_handle_t hDevice); ///< [in] handle of the device + +/// @brief Translate Device Identifier to Device Handle from default Driver +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Returned device is associated to default driver handle. +/// @returns +/// - device handle associated with the identifier +ZE_APIEXPORT ze_device_handle_t ZE_APICALL zerTranslateIdentifierToDeviceHandle(uint32_t identifier); ///< [in] integer identifier of the device + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Retrieves a string describing the last error code returned by the +/// default driver in the current thread. +/// +/// @details +/// - String returned is thread local. +/// - String is only updated on calls returning an error, i.e., not on calls +/// returning ::ZE_RESULT_SUCCESS. +/// - String may be empty if driver considers error code is already explicit +/// enough to describe cause. +/// - Memory pointed to by ppString is owned by the driver. +/// - String returned is null-terminated. 
+///
+/// @returns
+/// - ::ZE_RESULT_SUCCESS
+/// - ::ZE_RESULT_ERROR_UNINITIALIZED
+/// - ::ZE_RESULT_ERROR_DEVICE_LOST
+/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+/// + `nullptr == ppString`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zerGetLastErrorDescription(
+    const char **ppString ///< [in,out] pointer to a null-terminated array of characters describing
+                          ///< cause of error.
+);
+
+#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13)
+
+/// @brief Get default context associated with driver
+///
+/// @details
+/// - The application may call this function from simultaneous threads.
+/// - The implementation of this function should be lock-free.
+/// - Default context contains all devices within driver instance
+/// @returns
+/// - Context handle associated with driver
+ze_context_handle_t ZE_APICALL zeDriverGetDefaultContext(ze_driver_handle_t hDriver); ///< [in] handle of the driver
+
+/// @brief Global device synchronization
+///
+/// @details
+/// - The application may call this function from simultaneous threads.
+/// - The implementation of this function should be lock-free.
+/// - Ensures that everything that was submitted to the device is completed.
+/// - Ensures that all submissions in all queues on device are completed.
+/// - It is not allowed to call this function while some command lists are in graph capture mode.
+/// - Returns error if error is detected during execution on device.
+/// - Hangs indefinitely if GPU execution is blocked on non signaled event.
+///
+/// @returns
+/// - ::ZE_RESULT_SUCCESS
+/// - ::ZE_RESULT_ERROR_DEVICE_LOST
+ze_result_t ZE_APICALL zeDeviceSynchronize(ze_device_handle_t hDevice); ///< [in] handle of the device
+
+/// @brief Append with arguments
+///
+/// @details
+/// - The application may call this function from simultaneous threads.
+/// - The implementation of this function should be lock-free.
+/// - Appends kernel to command list with arguments. +/// - Kernel object state is updated with new arguments, as if separate zeKernelSetArgumentValue were called. +/// - If argument is SLM (size), then SLM size in bytes for this resource is provided under pointer on specific index and its type is size_t. +/// - If argument is an immediate type (i.e. structure, non pointer type), then values under pointer must contain full size of immediate type. +/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION +/// - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `nullptr == hCommandList` +/// + `nullptr == hKernel` +/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER +/// + `nullptr == pArguments` +/// - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT +/// - ::ZE_RESULT_ERROR_INVALID_SIZE +/// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` +typedef struct _ze_group_size_t { + uint32_t groupSizeX; ///< [in] local work-group size in X dimension + uint32_t groupSizeY; ///< [in] local work-group size in Y dimension + uint32_t groupSizeZ; ///< [in] local work-group size in Z dimension + +} ze_group_size_t; + +ze_result_t ZE_APICALL zeCommandListAppendLaunchKernelWithArguments( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object + const ze_group_count_t groupCounts, ///< [in] thread group counts + const ze_group_size_t groupSizes, ///< [in] thread group sizes + void **pArguments, ///< [in] kernel arguments; pointer to list where each argument represents a pointer to the argument value on specific index + const void *pNext, ///< [in][optional] extensions + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait 
on before launching + ze_event_handle_t *phWaitEvents); ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait on before launching + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Extension descriptor for cooperative kernel launch via pNext chain. +/// @details +/// - This structure can be passed through pNext to zeCommandListAppendLaunchKernelWithParameters +typedef struct _ze_command_list_append_launch_kernel_param_cooperative_desc_t { + ze_structure_type_ext_t stype; ///< [in] Type of this structure (e.g. ZE_STRUCTURE_TYPE_COMMAND_LIST_APPEND_PARAM_COOPERATIVE_DESC) + const void *pNext; ///< [in][optional] Pointer to the next extension-specific structure + ze_bool_t isCooperative; ///< [in] Indicates if the kernel should be launched as cooperative +} ze_command_list_append_launch_kernel_param_cooperative_desc_t; +/// @brief Append with parameters +/// +/// @details +/// - The application may call this function from simultaneous threads. +/// - The implementation of this function should be lock-free. +/// - Appends kernel to command list with additional parameters via pNext chain. +/// - Allows passing core and extension descriptors (e.g. cooperative kernel). 
+/// +/// @returns +/// - ::ZE_RESULT_SUCCESS +/// - ::ZE_RESULT_ERROR_DEVICE_LOST +/// - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY +/// - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE +/// - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `nullptr == hCommandList` +/// + `nullptr == hKernel` +/// - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER +/// + `nullptr == pGroupCounts` +/// - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT +/// - ::ZE_RESULT_ERROR_INVALID_SIZE +/// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` +ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendLaunchKernelWithParameters( + ze_command_list_handle_t hCommandList, ///< [in] handle of the command list + ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object + const ze_group_count_t *pGroupCounts, ///< [in] thread group launch arguments + const void *pNext, ///< [in][optional] additional parameters (pNext chain) + ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion + uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching + ze_event_handle_t *phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait on before launching +); + +#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) + +#if defined(__cplusplus) +} // extern "C" +#endif + +static const ze_device_mem_alloc_desc_t defaultIntelDeviceMemDesc = { + ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, // stype + nullptr, // pNext + ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED, // flags + 0 // ordinal +}; + +static const ze_host_mem_alloc_desc_t defaultIntelHostMemDesc = { + ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, // stype + nullptr, // pNext + ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED | ZE_HOST_MEM_ALLOC_FLAG_BIAS_INITIAL_PLACEMENT // flags +}; + +static const ze_command_queue_desc_t defaultIntelCommandQueueDesc = { + ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, // stype + nullptr, // pNext + 0, // ordinal + 0, // 
index + ZE_COMMAND_QUEUE_FLAG_IN_ORDER | ZE_COMMAND_QUEUE_FLAG_COPY_OFFLOAD_HINT, // flags + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, // mode + ZE_COMMAND_QUEUE_PRIORITY_NORMAL // priority +}; + +#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) + +/////////////////////////////////////////////////////////////////////////////// +#ifndef ZE_EXTERNAL_MEMORY_MAPPING_EXT_NAME +/// @brief External Memory Mapping Extension Name +#define ZE_EXTERNAL_MEMORY_MAPPING_EXT_NAME "ZE_extension_external_memmap_sysmem" + +/////////////////////////////////////////////////////////////////////////////// +/// @brief External Memory Mapping Extension Version(s) +typedef enum _ze_external_memmap_sysmem_ext_version_t { + ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_FORCE_UINT32 = 0x7fffffff ///< Value marking end of ZE_EXTERNAL_MEMMAP_SYSMEM_EXT_VERSION_* ENUMs + +} ze_external_memmap_sysmem_ext_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Maps external system memory for an allocation +/// +/// @details +/// - This structure may be passed to ::zeMemAllocHost, via the `pNext` +/// member of ::ze_host_mem_alloc_desc_t to map system memory for a host +/// allocation. +/// - The system memory pointer and size being mapped must be page aligned +/// based on the supported page sizes on the device. +typedef struct _ze_external_memmap_sysmem_ext_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + const void *pSystemMemory; ///< [in] system memory pointer to map; must be page-aligned. + const uint64_t size; ///< [in] size of the system memory to map; must be page-aligned. 
+ +} ze_external_memmap_sysmem_ext_desc_t; +#endif // ZE_EXTERNAL_MEMORY_MAPPING_EXT_NAME + +#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) +#endif diff --git a/thirdparty/level_zero/compute-runtime/ze_stypes.h b/thirdparty/level_zero/compute-runtime/ze_stypes.h new file mode 100644 index 00000000000000..50c9dbaa4e35c2 --- /dev/null +++ b/thirdparty/level_zero/compute-runtime/ze_stypes.h @@ -0,0 +1,65 @@ +// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac +/* + * Copyright (C) 2024-2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#ifndef _ZE_STYPES_H +#define _ZE_STYPES_H + +#include +#include +#include + +#include +using ze_structure_type_ext_t = uint32_t; +using zet_structure_type_ext_t = uint32_t; +using zes_structure_type_ext_t = uint32_t; + +#define ZE_STRUCTURE_TYPE_SYNCHRONIZED_DISPATCH_EXP_DESC static_cast(0x00020020) +#define ZE_STRUCTURE_TYPE_INTEL_MEDIA_COMMUNICATION_DESC static_cast(0x00020021) +#define ZE_STRUCTURE_TYPE_INTEL_MEDIA_DOORBELL_HANDLE_DESC static_cast(0x00020022) +#define ZE_STRUCTURE_TYPE_INTEL_DEVICE_MEDIA_EXP_PROPERTIES static_cast(0x00020023) +#if ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) +#define ZE_STRUCTURE_TYPE_COMMAND_LIST_APPEND_PARAM_COOPERATIVE_DESC static_cast(0x00020036) +#define ZE_STRUCTURE_TYPE_EXTERNAL_MEMMAP_SYSMEM_EXT_DESC static_cast(0x00020037) +#endif // ZE_API_VERSION_CURRENT_M <= ZE_MAKE_VERSION(1, 13) +#define ZEX_STRUCTURE_TYPE_LABEL_DESCRIPTOR static_cast(0x00030002) +#define ZEX_STRUCTURE_TYPE_OPERAND_DESCRIPTOR static_cast(0x00030003) +#define ZEX_STRUCTURE_TYPE_VARIABLE_DESCRIPTOR static_cast(0x00030004) +#define ZEX_STRUCTURE_TYPE_TEMP_VARIABLE_DESCRIPTOR static_cast(0x00030005) +#define ZEX_STRUCTURE_TYPE_VARIABLE_INFO static_cast(0x00030006) +#define ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_PROPERTIES static_cast(0x00030007) +#define ZEX_STRUCTURE_DEVICE_MODULE_REGISTER_FILE_EXP static_cast(0x00030010) +#define ZEX_STRUCTURE_KERNEL_REGISTER_FILE_SIZE_EXP 
static_cast(0x00030012) +#define ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES static_cast(0x00030013) +#define ZEX_INTEL_STRUCTURE_TYPE_EVENT_SYNC_MODE_EXP_DESC static_cast(0x00030016) +#define ZE_INTEL_STRUCTURE_TYPE_DEVICE_COMMAND_LIST_WAIT_ON_MEMORY_DATA_SIZE_EXP_DESC static_cast(0x00030017) +#define ZEX_INTEL_STRUCTURE_TYPE_QUEUE_ALLOCATE_MSIX_HINT_EXP_PROPERTIES static_cast(0x00030018) +#define ZEX_INTEL_STRUCTURE_TYPE_QUEUE_COPY_OPERATIONS_OFFLOAD_HINT_EXP_PROPERTIES static_cast(0x0003001B) +#define ZE_STRUCTURE_INTEL_DEVICE_MEMORY_CXL_EXP_PROPERTIES static_cast(0x00030019) +#define ZEX_STRUCTURE_COUNTER_BASED_EVENT_DESC static_cast(0x0003001C) +#define ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_SYNC_ALLOC_PROPERTIES static_cast(0x0003001D) +#define ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_STORAGE_ALLOC_PROPERTIES static_cast(0x00030027) +#define ZE_STRUCTURE_TYPE_QUEUE_PRIORITY_DESC static_cast(0x00030028) +#ifndef ZE_RECORD_REPLAY_GRAPH_EXP_NAME +#define ZE_STRUCTURE_TYPE_RECORD_REPLAY_GRAPH_EXP_PROPERTIES static_cast(0x00030029) +#endif // ZE_RECORD_REPLAY_GRAPH_EXP_NAME +#define ZEX_STRUCTURE_TYPE_MEMORY_FREE_CALLBACK_EXT_DESC static_cast(0x00030030) + +// Metric structure types +#define ZET_STRUCTURE_TYPE_INTEL_METRIC_SCOPE_PROPERTIES_EXP static_cast(0x00010006) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_HW_BUFFER_SIZE_EXP_DESC static_cast(0x00010007) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_GROUP_CALCULATION_EXP_PROPERTIES static_cast(0x00010008) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_CALCULATION_DESC_EXP static_cast(0x00010009) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_SOURCE_ID_EXP static_cast(0x0001000a) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_DECODED_BUFFER_PROPERTIES_EXP static_cast(0x0001000b) +#define ZET_INTEL_STRUCTURE_TYPE_METRIC_CALCULABLE_PROPERTIES_EXP static_cast(0x0001000c) + +// Sysman structure types +#define ZES_INTEL_PCI_LINK_SPEED_DOWNGRADE_EXP_STATE static_cast(0x00040001) +#define 
ZES_INTEL_PCI_LINK_SPEED_DOWNGRADE_EXP_PROPERTIES static_cast(0x00040002) + +#endif diff --git a/thirdparty/level_zero/compute-runtime/zex_common.h b/thirdparty/level_zero/compute-runtime/zex_common.h new file mode 100644 index 00000000000000..7c8f050c38166e --- /dev/null +++ b/thirdparty/level_zero/compute-runtime/zex_common.h @@ -0,0 +1,241 @@ +// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac +/* + * Copyright (C) 2022-2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#ifndef _ZEX_COMMON_H +#define _ZEX_COMMON_H +#if defined(__cplusplus) +#pragma once +#endif +#include "ze_stypes.h" +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Handle of command list object +typedef ze_command_list_handle_t zex_command_list_handle_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Handle of event object +typedef ze_event_handle_t zex_event_handle_t; + +#define ZEX_BIT(_i) (1 << _i) + +typedef uint32_t zex_mem_action_scope_flags_t; +typedef enum _zex_mem_action_scope_flag_t { + ZEX_MEM_ACTION_SCOPE_FLAG_SUBDEVICE = ZEX_BIT(0), + ZEX_MEM_ACTION_SCOPE_FLAG_DEVICE = ZEX_BIT(1), + ZEX_MEM_ACTION_SCOPE_FLAG_HOST = ZEX_BIT(2), + ZEX_MEM_ACTION_SCOPE_FLAG_FORCE_UINT32 = 0x7fffffff +} zex_mem_action_scope_flag_t; + +typedef uint32_t zex_wait_on_mem_action_flags_t; +typedef enum _zex_wait_on_mem_action_flag_t { + ZEX_WAIT_ON_MEMORY_FLAG_EQUAL = ZEX_BIT(0), + ZEX_WAIT_ON_MEMORY_FLAG_NOT_EQUAL = ZEX_BIT(1), + ZEX_WAIT_ON_MEMORY_FLAG_GREATER_THAN = ZEX_BIT(2), + ZEX_WAIT_ON_MEMORY_FLAG_GREATER_THAN_EQUAL = ZEX_BIT(3), + ZEX_WAIT_ON_MEMORY_FLAG_LESSER_THAN = ZEX_BIT(4), + ZEX_WAIT_ON_MEMORY_FLAG_LESSER_THAN_EQUAL = ZEX_BIT(5), + ZEX_WAIT_ON_MEMORY_FLAG_FORCE_UINT32 = 0x7fffffff +} zex_wait_on_mem_action_flag_t; + +typedef struct _zex_wait_on_mem_desc_t { + zex_wait_on_mem_action_flags_t actionFlag; + 
zex_mem_action_scope_flags_t waitScope; +} zex_wait_on_mem_desc_t; + +typedef struct _zex_write_to_mem_desc_t { + zex_mem_action_scope_flags_t writeScope; +} zex_write_to_mem_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +#ifndef ZE_SYNCHRONIZED_DISPATCH_EXP_NAME +/// @brief Synchronized Dispatch extension name +#define ZE_SYNCHRONIZED_DISPATCH_EXP_NAME "ZE_experimental_synchronized_dispatch" +#endif // ZE_SYNCHRONIZED_DISPATCH_EXP_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Synchronized Dispatch extension version(s) +typedef enum _ze_synchronized_dispatch_exp_version_t { + ZE_SYNCHRONIZED_DISPATCH_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZE_SYNCHRONIZED_DISPATCH_EXP_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZE_SYNCHRONIZED_DISPATCH_EXP_VERSION_FORCE_UINT32 = 0x7fffffff + +} ze_synchronized_dispatch_exp_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported synchronized dispatch flags +typedef uint32_t ze_synchronized_dispatch_exp_flags_t; +typedef enum _ze_synchronized_dispatch_exp_flag_t { + ZE_SYNCHRONIZED_DISPATCH_DISABLED_EXP_FLAG = ZE_BIT(0), ///< Non-synchronized dispatch. Must synchronize only with other synchronized dispatches + ZE_SYNCHRONIZED_DISPATCH_ENABLED_EXP_FLAG = ZE_BIT(1), ///< Synchronized dispatch. 
Must synchronize with all synchronized and non-synchronized dispatches + ZE_SYNCHRONIZED_DISPATCH_EXP_FLAG_FORCE_UINT32 = 0x7fffffff + +} ze_synchronized_dispatch_exp_flag_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Forward-declare ze_synchronized_dispatch_exp_desc_t +typedef struct _ze_synchronized_dispatch_exp_desc_t ze_synchronized_dispatch_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Command queue or command list descriptor for synchronized dispatch. This structure may be +/// passed as pNext member of ::ze_command_queue_desc_t. or ::ze_command_list_desc_t. +typedef struct _ze_synchronized_dispatch_exp_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ///< structure (i.e. contains stype and pNext). + ze_synchronized_dispatch_exp_flags_t flags; ///< [in] mode flags. 
+ ///< must be valid value of ::ze_synchronized_dispatch_exp_flag_t + +} ze_synchronized_dispatch_exp_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Forward-declare ze_intel_media_communication_desc_t +typedef struct _ze_intel_media_communication_desc_t ze_intel_media_communication_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief ze_intel_media_communication_desc_t +typedef struct _ze_intel_media_communication_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific, this will be used to extend this in future + void *controlSharedMemoryBuffer; ///< [in] control shared memory buffer pointer, must be USM address + uint32_t controlSharedMemoryBufferSize; ///< [in] control shared memory buffer size + void *controlBatchBuffer; ///< [in] control batch buffer pointer, must be USM address + uint32_t controlBatchBufferSize; ///< [in] control batch buffer size +} ze_intel_media_communication_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Forward-declare ze_intel_media_doorbell_handle_desc_t +typedef struct _ze_intel_media_doorbell_handle_desc_t ze_intel_media_doorbell_handle_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief ze_intel_media_doorbell_handle_desc_t +/// @details Handle of the doorbell. 
This structure is passed as argument of zeIntelMediaCommunicationCreate and zeIntelMediaCommunicationDestroy +typedef struct _ze_intel_media_doorbell_handle_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific, this will be used to extend this in future + void *doorbell; ///< [in,out] handle of the doorbell +} ze_intel_media_doorbell_handle_desc_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported device media flags +typedef uint32_t ze_intel_device_media_exp_flags_t; +typedef enum _ze_intel_device_media_exp_flag_t { + ZE_INTEL_DEVICE_MEDIA_SUPPORTS_ENCODING_EXP_FLAG = ZE_BIT(0), ///< Supports encoding + ZE_INTEL_DEVICE_MEDIA_SUPPORTS_DECODING_EXP_FLAG = ZE_BIT(1), ///< Supports decoding + ZE_INTEL_DEVICE_MEDIA_EXP_FLAG_FORCE_UINT32 = 0x7fffffff +} ze_intel_device_media_exp_flag_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Forward-declare ze_intel_device_media_exp_properties_t +typedef struct _ze_intel_device_media_exp_properties_t ze_intel_device_media_exp_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief May be passed to ze_device_properties_t through pNext. 
+typedef struct _ze_intel_device_media_exp_properties_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + ze_intel_device_media_exp_flags_t flags; ///< [out] device media flags + uint32_t numEncoderCores; ///< [out] number of encoder cores + uint32_t numDecoderCores; ///< [out] number of decoder cores +} ze_intel_device_media_exp_properties_t; + +#ifndef ZEX_COUNTER_BASED_EVENT_EXT_NAME +/// @brief Counter Based Event Extension Name +#define ZEX_COUNTER_BASED_EVENT_EXT_NAME "ZEX_counter_based_event" +#endif // ZEX_COUNTER_BASED_EVENT_EXT_NAME + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Counter Based Event Extension Version(s) +typedef enum _zex_counter_based_event_version_t { + ZEX_COUNTER_BASED_EVENT_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 + ZEX_COUNTER_BASED_EVENT_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version + ZEX_COUNTER_BASED_EVENT_VERSION_FORCE_UINT32 = 0x7fffffff + +} zex_counter_based_event_version_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief IPC handle to counter based event +typedef struct _zex_ipc_counter_based_event_handle_t { + char data[ZE_MAX_IPC_HANDLE_SIZE]; ///< [out] Opaque data representing an IPC handle +} zex_ipc_counter_based_event_handle_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Supported event flags for defining counter-based event +typedef uint32_t zex_counter_based_event_exp_flags_t; +typedef enum _zex_counter_based_event_exp_flag_t { + ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE = ZE_BIT(0), ///< Counter-based event is used for immediate command lists (default) + ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE = ZE_BIT(1), ///< Counter-based event is used for non-immediate command lists + ZEX_COUNTER_BASED_EVENT_FLAG_HOST_VISIBLE = ZE_BIT(2), 
///< Signals and waits are also visible to host + ZEX_COUNTER_BASED_EVENT_FLAG_IPC = ZE_BIT(3), ///< Event can be shared across processes for waiting + ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP = ZE_BIT(4), ///< Event contains kernel timestamps + ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_MAPPED_TIMESTAMP = ZE_BIT(5), ///< Event contains kernel timestamps synchronized to host time domain. + ///< Cannot be combined with::ZEX_COUNTER_BASED_EVENT_FLAG_KERNEL_TIMESTAMP + ZEX_COUNTER_BASED_EVENT_FLAG_GRAPH_EXTERNAL_EVENT = ZE_BIT(6), ///< Event when is used in graph record & replay, can be used outside + ///< recorded graph for synchronization (using as wait event or for host synchronization) + ZEX_COUNTER_BASED_EVENT_FLAG_FORCE_UINT32 = 0x7fffffff + +} zex_counter_based_event_exp_flag_t; + +typedef struct _zex_counter_based_event_desc_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + zex_counter_based_event_exp_flags_t flags; ///< [in] counter based event flags. + ///< Must be 0 (default) or a valid combination of ::zex_counter_based_event_exp_flag_t + ze_event_scope_flags_t signalScope; ///< [in] defines the scope of relevant cache hierarchies to flush on a + ///< signal action before the event is triggered. + ///< must be 0 (default) or a valid combination of ::ze_event_scope_flag_t; + ///< default behavior is synchronization within the command list only, no + ///< additional cache hierarchies are flushed. + ze_event_scope_flags_t waitScope; ///< [in] defines the scope of relevant cache hierarchies to invalidate on + ///< a wait action after the event is complete. + ///< must be 0 (default) or a valid combination of ::ze_event_scope_flag_t; + ///< default behavior is synchronization within the command list only, no + ///< additional cache hierarchies are invalidated. 
+} zex_counter_based_event_desc_t; + +static const zex_counter_based_event_desc_t defaultIntelCounterBasedEventDesc = { + ZEX_STRUCTURE_COUNTER_BASED_EVENT_DESC, // stype + nullptr, // pNext + ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE | + ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE | + ZEX_COUNTER_BASED_EVENT_FLAG_HOST_VISIBLE, // flags + ZE_EVENT_SCOPE_FLAG_HOST, // signalScope + ZE_EVENT_SCOPE_FLAG_DEVICE // waitScope +}; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Initial Counter Based Event synchronization parameters. This structure may be +/// passed as pNext member of ::zex_counter_based_event_desc_t. +typedef struct _zex_counter_based_event_external_sync_alloc_properties_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + uint64_t *deviceAddress; ///< [in] device address for external synchronization allocation + uint64_t *hostAddress; ///< [in] host address for external synchronization allocation + uint64_t completionValue; ///< [in] completion value for external synchronization allocation +} zex_counter_based_event_external_sync_alloc_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Initial Counter Based Event synchronization parameters. This structure may be +/// passed as pNext member of ::zex_counter_based_event_desc_t. 
+typedef struct _zex_counter_based_event_external_storage_properties_t { + ze_structure_type_ext_t stype; ///< [in] type of this structure + const void *pNext; ///< [in][optional] must be null or a pointer to an extension-specific + uint64_t *deviceAddress; ///< [in] device address that would be updated with atomic_add upon signaling of this event, must be device USM memory + uint64_t incrementValue; ///< [in] value which would by atomically added upon each completion + uint64_t completionValue; ///< [in] final completion value, when value under deviceAddress is equal or greater then this value then event is considered as completed +} zex_counter_based_event_external_storage_properties_t; + +#if defined(__cplusplus) +} // extern "C" +#endif + +#endif // _ZEX_COMMON_EXTENDED_H diff --git a/thirdparty/level_zero/compute-runtime/zex_event.h b/thirdparty/level_zero/compute-runtime/zex_event.h new file mode 100644 index 00000000000000..ec38aaeba9bd80 --- /dev/null +++ b/thirdparty/level_zero/compute-runtime/zex_event.h @@ -0,0 +1,58 @@ +// intel/compute-runtime 4df478c5139703c82e548a65eafbcc69923953ac +/* + * Copyright (C) 2023-2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#ifndef _ZEX_EVENT_H +#define _ZEX_EVENT_H +#if defined(__cplusplus) +#pragma once +#endif + +#include + +#include "zex_common.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +ZE_APIEXPORT ze_result_t ZE_APICALL +zexEventGetDeviceAddress( + ze_event_handle_t event, + uint64_t *completionValue, + uint64_t *address); + +// deprecated +ZE_APIEXPORT ze_result_t ZE_APICALL +zexCounterBasedEventCreate( + ze_context_handle_t hContext, + ze_device_handle_t hDevice, + uint64_t *deviceAddress, + uint64_t *hostAddress, + uint64_t completionValue, + const ze_event_desc_t *desc, + ze_event_handle_t *phEvent); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexIntelAllocateNetworkInterrupt(ze_context_handle_t hContext, uint32_t &networkInterruptId); + +ZE_APIEXPORT ze_result_t ZE_APICALL 
zexIntelReleaseNetworkInterrupt(ze_context_handle_t hContext, uint32_t networkInterruptId); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventCreate2(ze_context_handle_t hContext, ze_device_handle_t hDevice, const zex_counter_based_event_desc_t *desc, ze_event_handle_t *phEvent); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventGetIpcHandle(ze_event_handle_t hEvent, zex_ipc_counter_based_event_handle_t *phIpc); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventOpenIpcHandle(ze_context_handle_t hContext, zex_ipc_counter_based_event_handle_t hIpc, ze_event_handle_t *phEvent); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexCounterBasedEventCloseIpcHandle(ze_event_handle_t hEvent); + +ZE_APIEXPORT ze_result_t ZE_APICALL zexDeviceGetAggregatedCopyOffloadIncrementValue(ze_device_handle_t hDevice, uint32_t *incrementValue); + +#if defined(__cplusplus) +} // extern "C" +#endif + +#endif // _ZEX_EVENT_H