diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp index 01acdac91682e3..b2d6422aee3908 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp @@ -65,6 +65,9 @@ struct kernel_impl_params { std::vector output_size; std::vector img_size; + std::map in_port_to_shape_info_offset = {}; + std::map out_port_to_shape_info_offset = {}; + kernel_impl_params() : prog(nullptr), dev_type(cldnn::device_type::integrated_gpu), strm(nullptr), desc(nullptr), unique_id(0) { } diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index 07927abe52a107..5fe0250d253ee7 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -214,9 +214,9 @@ struct network { return *_memory_pool; } - void set_variable(const std::string& name, const std::shared_ptr& variable); + void set_variable(const std::string& name, const std::shared_ptr& variable); bool has_variable(const std::string &variable_id) const; - ov::intel_gpu::VariableState& get_variable(const std::string &variable_id) const; + ov::intel_gpu::VariableStateBase& get_variable(const std::string &variable_id) const; const ov::intel_gpu::VariableStateInfo& get_variable_info(const std::string &variable_id) const; const ov::intel_gpu::VariablesMap& get_variables() const; const ov::intel_gpu::VariablesInfoMap& get_variables_info() const; @@ -279,7 +279,7 @@ struct network { void add_default_output_chains(); void calculate_weights_cache_capacity(); output_chains_map::iterator add_output_chain(std::shared_ptr& p_inst); - void set_variables_state_info(const std::string& variable_id, const layout& variable_layout, ov::element::Type user_specified_type); + void set_variables_state_info(const std::string& variable_id, const layout& variable_layout, ov::element::Type user_specified_type, const primitive* p); #ifdef GPU_DEBUG_CONFIG int64_t iteration = 0; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/gemm.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/gemm.hpp index 3391cf89976c2b..80f0dec5ddb7b9 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/gemm.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/gemm.hpp @@ -37,6 +37,12 @@ class Gemm : public ov::op::v0::MatMul { std::vector get_output_order() const { return m_order_c; } ov::element::Type get_output_type() const { return m_output_type; } + static std::vector default_order(size_t rank) { + std::vector order(rank); + std::iota(order.begin(), order.end(), 0); + return order; + } + protected: std::vector m_order_a; std::vector m_order_b; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_gemm.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_gemm.hpp new file mode 100644 index 00000000000000..ef51a9fb11d7f4 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_gemm.hpp @@ -0,0 +1,51 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/op/gemm.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/op/op.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +class IndirectGemm : public ov::intel_gpu::op::Gemm { +public: + OPENVINO_OP("IndirectGemm", "gpu_opset"); + + IndirectGemm() = 
default; + + IndirectGemm(const ov::Output& A, + const ov::Output& B, + const ov::Output& I, + bool indirect_a, + bool indirect_b, + const std::vector& order_a, + const std::vector& order_b, + const std::vector& order_c, + const ov::element::Type output_type = ov::element::undefined); + + bool visit_attributes(ov::AttributeVisitor &visitor) override; + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + ov::element::Type get_output_type() const { return m_output_type; } + + bool get_indirect_a() const { return m_indirect_a; } + bool get_indirect_b() const { return m_indirect_b; } + + using ov::intel_gpu::op::Gemm::default_order; + +protected: + bool m_indirect_a = false; + bool m_indirect_b = false; +}; + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp index 9f24c4b5f52010..ab6108e63f1464 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp @@ -51,9 +51,12 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { void set_concat_axis(int64_t axis) { m_concat_axis = axis; } void set_gather_axis(int64_t axis) { m_gather_axis = axis; } + bool get_indirect() const { return m_indirect; } + private: int64_t m_concat_axis; int64_t m_gather_axis; + bool m_indirect = false; ov::element::Type m_output_type; }; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp new file mode 100644 index 00000000000000..0cad36f62e47b9 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp @@ -0,0 +1,50 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include "intel_gpu/plugin/variable_state.hpp" +#include "openvino/core/partial_shape.hpp" + +namespace ov { +namespace intel_gpu { + +class MultiTensorState : public VariableStateBase { +public: + MultiTensorState(const std::vector& infos, std::shared_ptr context, ShapePredictor::Ptr shape_predictor); + +protected: + std::vector> m_hidden_states = {}; +}; + +// This is multi-tensor state for Indirect KV-Cache + Gemm pattern +// Internally it stores KV Cache state + Beam Table state +class VariableStateIndirectKVCache : public MultiTensorState { +public: + VariableStateIndirectKVCache(const VariableStateInfo& info, + std::shared_ptr context, + std::shared_ptr shape_predictor, + size_t beam_idx, + size_t concat_idx); + using Ptr = std::shared_ptr; + + void reset() override; + void set_state(const ov::SoPtr& state) override; + ov::SoPtr get_state() const override; + + cldnn::memory::ptr get_memory() const override; + const cldnn::layout& get_layout() const override; + void set_layout(const cldnn::layout& new_layout) override; + void set_memory(const cldnn::memory::ptr& new_mem, const cldnn::layout& actual_layout) override; + size_t get_actual_mem_size() const override; + + VariableState::Ptr get_beam_table_state() const; + ov::PartialShape get_beam_table_shape(const ov::PartialShape& kv_cache_shape); + +private: + size_t m_beam_axis = 0; + size_t m_concat_axis = 0; +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp 
b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index af0497c0e9d5d4..9ddfa9e15ea836 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -277,3 +277,4 @@ REGISTER_FACTORY(internal, KVCache); REGISTER_FACTORY(internal, ReadValue); REGISTER_FACTORY(internal, Gemm); REGISTER_FACTORY(internal, SwiGLU); +REGISTER_FACTORY(internal, IndirectGemm); diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp index 97a6d068a7c687..eb3697e4f3e4ab 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp @@ -89,7 +89,7 @@ class SyncInferRequest : public ov::ISyncInferRequest { bool m_enable_profiling = false; bool m_use_external_queue = false; - void prepare_state(const std::string& name, const VariableState::Ptr variable); + void prepare_state(const std::string& name, const std::shared_ptr& variable); std::vector prepare_input(const std::string& name, const ov::Output& port, const TensorWrapper& user_tensor_wrapper); std::vector prepare_output(const std::string& name, const ov::Output& port, const TensorWrapper& user_tensor_wrapper); std::vector prepare_batched_input(const std::string& name, diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/variable_state.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/variable_state.hpp index 0bacf2ec9a00a8..d59e861437f64a 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/variable_state.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/variable_state.hpp @@ -19,48 +19,69 @@ struct VariableStateInfo { VariableStateInfo(const std::string& id, const cldnn::layout& layout, ov::element::Type_t user_specified_type = ov::element::undefined) : m_id(id) , m_layout(layout) - , m_user_specified_type(user_specified_type) {} + , m_user_specified_type(user_specified_type) + , m_primitives() {} std::string m_id; cldnn::layout m_layout; ov::element::Type m_user_specified_type; + std::set m_primitives; }; -class VariableState : public ov::IVariableState { +class VariableStateBase : public ov::IVariableState { public: - VariableState(const VariableStateInfo& info, std::shared_ptr context, std::shared_ptr shape_predictor); + VariableStateBase(const std::string& id, std::shared_ptr context) : ov::IVariableState(id), m_context(context) {} + virtual cldnn::memory::ptr get_memory() const = 0; + virtual const cldnn::layout& get_layout() const = 0; + virtual void set_layout(const cldnn::layout& new_layout) = 0; + virtual void set_memory(const cldnn::memory::ptr& new_mem, const cldnn::layout& actual_layout) = 0; + virtual size_t get_actual_mem_size() const = 0; + + void set() { m_is_set = true; } + bool is_set() const { return m_is_set; } + +protected: + bool m_is_set = false; + std::shared_ptr m_context; +}; + +class VariableState : public VariableStateBase { +public: + VariableState(const VariableStateInfo& info, std::shared_ptr context, ShapePredictor::Ptr shape_predictor); using Ptr = std::shared_ptr; void reset() override; void set_state(const ov::SoPtr& state) override; ov::SoPtr get_state() const override; - cldnn::memory::ptr get_memory() const; - const cldnn::layout& get_layout() const; - bool is_set() const; - void set(); - void set_layout(const cldnn::layout& new_layout); - void set_memory(const cldnn::memory::ptr& new_mem, const 
cldnn::layout& actual_layout); - size_t get_actual_mem_size() const { + cldnn::memory::ptr get_memory() const override; + const cldnn::layout& get_layout() const override; + + void set_layout(const cldnn::layout& new_layout) override; + void set_memory(const cldnn::memory::ptr& new_mem, const cldnn::layout& actual_layout) override; + size_t get_actual_mem_size() const override { return actual_size; } -private: + const cldnn::layout& get_initial_layout() const { + return m_initial_layout; + } + + ov::element::Type get_user_specified_type() const; + +protected: cldnn::layout m_layout; ov::element::Type m_user_specified_type; - std::shared_ptr m_context; std::shared_ptr m_shape_predictor; - bool m_is_set = false; cldnn::memory::ptr m_memory = nullptr; size_t actual_size = 0; const cldnn::layout m_initial_layout; void update_device_buffer(); - ov::element::Type get_user_specified_type() const; }; -using VariablesMap = std::unordered_map; +using VariablesMap = std::unordered_map>; using VariablesInfoMap = std::unordered_map; } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/gemm.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/gemm.hpp index 20a42ab91b5d50..8ee341586b6106 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/gemm.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/gemm.hpp @@ -104,20 +104,36 @@ struct gemm : public primitive_base { throw std::invalid_argument("Invalid inputs count - gemm expects either two or three inputs"); } - auto get_transpose_mode = [](const std::vector& order_idx) { - int64_t rank = order_idx.size() - 1; - - if (rank == order_idx[rank]) { - // normal - return TransposeType::X_LAST; - } else if (rank == order_idx[rank - 1]) { - // the second last dim is moved to the last - return TransposeType::Y_LAST; - } else { - // other - return TransposeType::OTHER; - } - }; + transpose_input0 = get_transpose_mode(input0_order); + transpose_input1 = get_transpose_mode(input1_order); + } + + gemm(const primitive_id& id, + const std::vector& inputs, + const input_info& beam_table, + const data_types data_type, + const std::vector& input0_order, + const std::vector& input1_order, + const std::vector& output_order, + bool indirect_a, + bool indirect_b, + const float alpha = 1.0f, + const float beta = 0.0f, + const padding& output_padding = padding()) + : primitive_base(id, inputs, {output_padding}, {optional_data_type{ data_type }}), + input0_order(input0_order), + input1_order(input1_order), + output_order(output_order), + alpha(alpha), + beta(beta), + input_rank(input0_order.size()), + weight_rank(input1_order.size()), + beam_table(beam_table), + indirect_a(indirect_a), + indirect_b(indirect_b) { + if (inputs.size() != 2 && inputs.size() != 3) { + throw std::invalid_argument("Invalid inputs count - gemm expects either two or three inputs"); + } transpose_input0 = get_transpose_mode(input0_order); transpose_input1 = get_transpose_mode(input1_order); @@ -142,10 +158,17 @@ struct gemm : public primitive_base { /// @brief Second matrix rank size_t weight_rank = 4; + /// @brief Beam table input for indirect access for one of the inputs + input_info beam_table = {}; + bool indirect_a = false; + bool indirect_b = false; + size_t hash() const override { size_t seed = primitive::hash(); seed = hash_combine(seed, transpose_input0); seed = hash_combine(seed, transpose_input1); + seed = hash_combine(seed, indirect_a); + seed = hash_combine(seed, indirect_b); for (auto order : input0_order) seed = hash_combine(seed, 
order); for (auto order : input1_order) @@ -167,6 +190,8 @@ struct gemm : public primitive_base { transpose_input1 == rhs_casted.transpose_input1 && alpha == rhs_casted.alpha && beta == rhs_casted.beta && + indirect_a == rhs_casted.indirect_a && + indirect_b == rhs_casted.indirect_b && input_rank == rhs_casted.input_rank && weight_rank == rhs_casted.weight_rank; } @@ -182,6 +207,10 @@ struct gemm : public primitive_base { ob << beta; ob << input_rank; ob << weight_rank; + ob << indirect_a; + ob << indirect_b; + ob << beam_table.pid; + ob << beam_table.idx; } void load(BinaryInputBuffer& ib) override { @@ -195,6 +224,32 @@ struct gemm : public primitive_base { ib >> beta; ib >> input_rank; ib >> weight_rank; + ib >> indirect_a; + ib >> indirect_b; + ib >> beam_table.pid; + ib >> beam_table.idx; + } + + std::vector get_dependencies() const override { + if (beam_table.is_valid()) + return { beam_table }; + return {}; + } + +private: + TransposeType get_transpose_mode(const std::vector& order_idx) { + int64_t rank = order_idx.size() - 1; + + if (rank == order_idx[rank]) { + // normal + return TransposeType::X_LAST; + } else if (rank == order_idx[rank - 1]) { + // the second last dim is moved to the last + return TransposeType::Y_LAST; + } else { + // other + return TransposeType::OTHER; + } } }; diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp index c5d384513e098f..048f51379c30ec 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp @@ -21,17 +21,24 @@ struct kv_cache : public primitive_base { const ov::op::util::VariableInfo& variable_info, const int64_t concat_axis, const int64_t gather_axis, + const bool indirect, const padding& output_padding = padding()) - : primitive_base(id, inputs, {output_padding}), variable_info(variable_info), concat_axis(concat_axis), gather_axis(gather_axis) {} + : primitive_base(id, inputs, {output_padding}) + , variable_info(variable_info) + , concat_axis(concat_axis) + , gather_axis(gather_axis) + , indirect(indirect) {} ov::op::util::VariableInfo variable_info; int64_t concat_axis = 0; int64_t gather_axis = 0; + bool indirect = false; size_t hash() const override { size_t seed = primitive::hash(); seed = hash_combine(seed, concat_axis); seed = hash_combine(seed, gather_axis); + seed = hash_combine(seed, indirect); return seed; } @@ -43,7 +50,8 @@ struct kv_cache : public primitive_base { return variable_info == rhs_casted.variable_info && concat_axis == rhs_casted.concat_axis && - gather_axis == rhs_casted.gather_axis; + gather_axis == rhs_casted.gather_axis && + indirect == rhs_casted.indirect; } void save(BinaryOutputBuffer& ob) const override { @@ -54,6 +62,7 @@ struct kv_cache : public primitive_base { ob << make_data(&data_type, sizeof(ov::element::Type_t)); ob << concat_axis; ob << gather_axis; + ob << indirect; } void load(BinaryInputBuffer& ib) override { @@ -67,6 +76,7 @@ struct kv_cache : public primitive_base { variable_info = { data_shape, data_type, variable_id }; ib >> concat_axis; ib >> gather_axis; + ib >> indirect; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/shape_predictor.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/shape_predictor.hpp index aea07971ca6020..01ae5e1a5b62b0 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/shape_predictor.hpp +++ 
b/src/plugins/intel_gpu/include/intel_gpu/runtime/shape_predictor.hpp @@ -14,6 +14,7 @@ class engine; struct ShapePredictor { public: + using Ptr = std::shared_ptr; ShapePredictor(const engine* engine, float buffers_preallocation_ratio) : _engine(engine) , _buffers_preallocation_ratio(buffers_preallocation_ratio) { @@ -77,3 +78,9 @@ struct ShapePredictor { }; } // namespace cldnn + +namespace ov { +namespace intel_gpu { +using ShapePredictor = cldnn::ShapePredictor; +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/graph/gemm.cpp b/src/plugins/intel_gpu/src/graph/gemm.cpp index 0b30b6a4b821de..24df1920f0e595 100644 --- a/src/plugins/intel_gpu/src/graph/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/gemm.cpp @@ -253,16 +253,21 @@ std::string gemm_inst::to_string(gemm_node const& node) { auto beta = desc->beta; auto transpose_input0 = desc->transpose_input0 ? " true" : "false"; auto transpose_input1 = desc->transpose_input1 ? " true" : "false"; + auto indirect_input0 = desc->indirect_a ? " true" : "false"; + auto indirect_input1 = desc->indirect_b ? " true" : "false"; std::stringstream primitive_description; json_composite gemm_info; for (size_t i = 0; i < node.get_inputs_count(); i++) { gemm_info.add("input_" + std::to_string(i), node.input(i).id()); } + gemm_info.add("beam_table", (desc->beam_table.is_valid() ? desc->beam_table.pid : "N/A")); gemm_info.add("alpha", alpha); gemm_info.add("beta", beta); gemm_info.add("trasnpose_input0", transpose_input0); gemm_info.add("transpose_input1", transpose_input1); + gemm_info.add("indirect_input0", indirect_input0); + gemm_info.add("indirect_input1", indirect_input1); node_info->add("gemm info", gemm_info); node_info->dump(primitive_description); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp index b0ea6ddaebebdf..0ce679db7987e6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gemm.cpp @@ -2,8 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "primitive_base.hpp" +#include "intel_gpu/graph/kernel_impl_params.hpp" +#include "multi_stage_primitive.hpp" +#include "kv_cache_inst.h" #include "gemm_inst.h" #include "gemm/gemm_kernel_base.h" #include "gemm/gemm_kernel_selector.h" @@ -11,14 +13,22 @@ namespace cldnn { namespace ocl { -struct gemm_impl : typed_primitive_impl_ocl { - using parent = typed_primitive_impl_ocl; +// Gemm impl may create 2 versions of the kernel internally +// 1. default kernel +// 2. 
kernel with indirect access to one of the inputs +// This feature is used to avoid perf drop when we create single kernel which checks batch size in runtime +// Can be reverted once performance of the kernel is improved +struct gemm_impl : multi_stage_primitive { + using parent = multi_stage_primitive; using parent::parent; using kernel_selector_t = kernel_selector::gemm_kernel_selector; using kernel_params_t = std::pair; DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::gemm_impl) + const uint32_t default_gemm = 0; + const uint32_t indirect_gemm = 1; + std::unique_ptr clone() const override { return make_unique(*this); } @@ -27,13 +37,122 @@ struct gemm_impl : typed_primitive_impl_ocl { parent::load(ib); if (is_dynamic()) { auto& kernel_selector = kernel_selector_t::Instance(); - auto kernel_impl = kernel_selector.GetImplementation(_kernel_data.kernelName); - kernel_impl->GetUpdateDispatchDataFunc(_kernel_data); + auto kernel_impl = kernel_selector.GetImplementation(_kernels_data[default_gemm].kernelName); + kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[default_gemm]); + if (_kernels_data.size() == 2) { + auto bt_kernel_impl = kernel_selector.GetImplementation(_kernels_data[indirect_gemm].kernelName); + bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[indirect_gemm]); + } + } + } + +protected: + static size_t get_beam_table_id(std::shared_ptr primitive) { + return primitive->input_size() == 3 ? 3 : 2; + } + + kernel_arguments_data get_arguments(const gemm_inst& instance, size_t stage) const override { + kernel_arguments_data args; + + for (size_t i = 0; i < instance.inputs_memory_count(); i++) { + args.inputs.push_back(instance.input_memory_ptr(i)); + } + + if (instance.has_fused_primitives()) { + size_t count = instance.get_fused_mem_count(); + for (size_t i = 0; i < count; i++) { + args.fused_op_inputs.push_back(instance.fused_memory(i)); + } + } + + for (size_t i = 0; i < instance.outputs_memory_count(); i++) { + args.outputs.push_back(instance.output_memory_ptr(i)); + } + + args.shape_info = instance.shape_info_memory_ptr(); + + const auto& desc = instance.get_typed_desc(); + if (stage == indirect_gemm) { + args.inputs.push_back(instance.dep_memory_ptr(get_beam_table_id(desc))); } + + return args; + } + + void set_arguments_impl(gemm_inst& instance) override {} + + event::ptr execute_stage(const std::vector& events, gemm_inst& instance, size_t stage) { + stream& stream = instance.get_network().get_stream(); + std::vector tmp_events(events); + std::vector all_events; + size_t kernel_offset = 0; + for (size_t s = 0; s < stage; s++) { + kernel_offset += _kernels_data[s].kernels.size(); + } + for (size_t kd_idx = 0; kd_idx < _kernels_data[stage].kernels.size(); ++kd_idx) { + if (_kernels_data[stage].kernels[kd_idx].skip_execution) + continue; + + size_t idx_final = kernel_offset + kd_idx; + // If any user of the prim's users is CPU implementation or network's output, set prim as a output event (event won't be nullptr) + bool needs_completion_event = instance.needs_completion_event(); + + auto& params = _kernels_data[stage].kernels[kd_idx].params; + auto args = get_arguments(instance, stage); + args.scalars = ¶ms.scalars; + + for (const auto& m : instance.get_intermediates_memories()) { + args.intermediates.push_back(m); + } + + stream.set_arguments(*_kernels[idx_final], _kernels_data[stage].kernels[kd_idx].params, args); + + const auto& gws = params.workGroups.global; + const auto& lws = params.workGroups.local; + + GPU_DEBUG_TRACE_DETAIL << "Enqueue stage " << stage << " kernel 
" << idx_final << ": gws=[" << gws[0] << ", " << gws[1] << ", " << gws[2] << "] " + << "lws=[" << lws[0] << ", " << lws[1] << ", " << lws[2] << "]" + << (needs_completion_event ? " has_completion_event=true" : "") << std::endl; + + auto ev = stream.enqueue_kernel(*_kernels[idx_final], params, args, tmp_events, needs_completion_event); + if (_kernels_data[stage].needs_sub_kernels_sync) { + tmp_events = {ev}; + } + all_events.push_back(ev); + } + + return aggregate_events(all_events, stream, all_events.size() > 1); + } + + bool need_indirect_load(const gemm_inst& inst) const { + auto desc = inst.get_typed_desc(); + if (!desc->indirect_a && !desc->indirect_b) + return false; + + const auto& params = *inst.get_impl_params(); + if (params.input_layouts[get_beam_table_id(desc)].get_partial_shape()[0].get_length() == 1) + return false; + + const auto& deps = inst.dependencies(); + + const auto& indirect_dep = deps[desc->indirect_a ? 0 : 1].first; + if (dynamic_cast(indirect_dep) == nullptr) + return true; + + auto state_layout = indirect_dep->get_impl_params()->get_input_layout(0); + bool is_prefill = state_layout.count() == 0; + return !is_prefill; + } + + event::ptr execute_impl(const std::vector& events, gemm_inst& instance) override { + if (need_indirect_load(instance)) + return execute_stage(events, instance, indirect_gemm); + else + return execute_stage(events, instance, default_gemm); } public: - static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false, bool indirect = false) { const auto& primitive = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_shape_agnostic); @@ -43,6 +162,8 @@ struct gemm_impl : typed_primitive_impl_ocl { params.inputs.push_back(convert_data_tensor(impl_param.input_layouts[i])); } + params.stage_id = indirect ? 
1 : 0; + params.alpha = primitive->alpha; params.beta = primitive->beta; params.transpose_input0 = primitive->transpose_input0; @@ -51,6 +172,13 @@ struct gemm_impl : typed_primitive_impl_ocl { params.input1_order = primitive->input1_order; params.output_order = primitive->output_order; + params.indirect_input0 = primitive->indirect_a && indirect; + params.indirect_input1 = primitive->indirect_b && indirect; + if (indirect && (primitive->indirect_a || primitive->indirect_b)) { + OPENVINO_ASSERT(impl_param.input_layouts.size() >= 3, "[GPU] Actual inputs count: ", impl_param.input_layouts.size()); + params.inputs.push_back(convert_data_tensor(impl_param.input_layouts[get_beam_table_id(primitive)])); + } + bool is_quantized = true; for (auto& input : impl_param.input_layouts) is_quantized &= data_type_traits::is_quantized(input.data_type); @@ -60,6 +188,25 @@ struct gemm_impl : typed_primitive_impl_ocl { } else { params.quantization = kernel_selector::QuantizationType::NONE; } + + params.set_dynamic_shape_offsets(); + if ((primitive->indirect_a || primitive->indirect_b) && !indirect) { + // Need to adjust regular gemm kernel offset to skip beam table input + for (auto& fd : params.fused_ops) { + if (!fd.has_outer_dep()) + continue; + auto& fused_op_inputs = fd.tensors; + for (auto& fused_input : fused_op_inputs) { + if (fused_input.is_dynamic()) + fused_input.SetDynamicShapeOffset(fused_input.get_dynamic_shape_offset() + kernel_selector::DataTensor::max_rank()); + } + } + for (auto& out : params.outputs) { + if (out.is_dynamic()) { + out.SetDynamicShapeOffset(out.get_dynamic_shape_offset() + kernel_selector::DataTensor::max_rank()); + } + } + } return {params, optional_params}; } @@ -84,9 +231,31 @@ struct gemm_impl : typed_primitive_impl_ocl { return static_canonicalize_shapes(impl_params); } + static std::unique_ptr create(const typed_program_node& arg, const kernel_impl_params& impl_param) { + std::vector kernels_data; + auto& kernel_selector = kernel_selector_t::Instance(); + auto params = static_canonicalize_shapes(impl_param); + + auto default_kernel_params = get_kernel_params(params, params.is_dynamic(), false); + default_kernel_params.first.is_shape_agnostic = params.is_dynamic(); + kernels_data.push_back(kernel_selector.get_best_kernel(default_kernel_params.first, default_kernel_params.second)); + const auto desc = params.typed_desc(); + if (desc->indirect_a || desc->indirect_b) { + auto indirect_kernel_params = get_kernel_params(params, params.is_dynamic(), true); + indirect_kernel_params.first.is_shape_agnostic = params.is_dynamic(); + kernels_data.push_back(kernel_selector.get_best_kernel(indirect_kernel_params.first, indirect_kernel_params.second)); + } + return cldnn::make_unique(kernels_data); + } + void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data); + auto kernel_params = get_kernel_params(impl_param, true, false); + (_kernels_data[default_gemm].update_dispatch_data_func)(kernel_params.first, _kernels_data[default_gemm]); + + if (_kernels_data.size() == 2) { + auto kernel_params = get_kernel_params(impl_param, true, true); + (_kernels_data[indirect_gemm].update_dispatch_data_func)(kernel_params.first, _kernels_data[indirect_gemm]); + } } }; @@ -116,7 +285,7 @@ attach_gemm_impl::attach_gemm_impl() { format::bfwzyx, }; - implementation_map::add(impl_types::ocl, shape_types::static_shape, 
typed_primitive_impl_ocl::create, types, formats); + implementation_map::add(impl_types::ocl, shape_types::static_shape, gemm_impl::create, types, formats); const std::vector dyn_formats { format::bfyx, @@ -126,7 +295,7 @@ attach_gemm_impl::attach_gemm_impl() { implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, - typed_primitive_impl_ocl::create, types, dyn_formats); + gemm_impl::create, types, dyn_formats); } } // namespace detail diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 5030b1e2cfa0e4..1f6cbff3144e5e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -1411,6 +1411,7 @@ void set_default_params(const kernel_impl_params& param_info, kernel_selector::b const auto& output_layout = param_info.get_output_layout(0); params.is_shape_agnostic = is_shape_agnostic; + params.stage_id = 0; params.inputs[0] = convert_data_tensor(input_layout); params.outputs[0] = convert_data_tensor(output_layout); params.layerID = param_info.desc->id; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index fed5c318e16118..4b68a49d6a06aa 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -1,13 +1,19 @@ -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" +#include "intel_gpu/plugin/variable_state.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" -#include "primitive_base.hpp" +#include "intel_gpu/runtime/memory.hpp" +#include "multi_stage_primitive.hpp" #include "kv_cache_inst.h" #include "concatenation/concatenation_kernel_selector.h" #include "concatenation/concatenation_kernel_base.h" +#include "beam_table_update/beam_table_update_kernel_selector.hpp" +#include "beam_table_update/beam_table_update_kernel_ref.hpp" +#include "openvino/core/dimension.hpp" namespace cldnn { namespace ocl { @@ -42,50 +48,149 @@ kernel_selector::concat_axis convert_axis(int64_t axis, size_t rank) { } // namespace -struct kv_cache_impl : typed_primitive_impl_ocl { - using parent = typed_primitive_impl_ocl; +struct kv_cache_impl : multi_stage_primitive { + using parent = multi_stage_primitive; using parent::parent; using kernel_selector_t = kernel_selector::concatenation_kernel_selector; using kernel_params_t = std::pair; + using bt_kernel_selector_t = kernel_selector::beam_table_update_kernel_selector; + using bt_kernel_params_t = std::pair; + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::kv_cache_impl) std::unique_ptr clone() const override { return make_unique(*this); } + const size_t concat_stage = 0; + const size_t beam_table_stage = 1; + + cldnn::memory::ptr beam_table_prev = nullptr; + cldnn::memory::ptr beam_table_new = nullptr; + void load(BinaryInputBuffer& ib) override { parent::load(ib); if (is_dynamic()) { auto& kernel_selector = kernel_selector_t::Instance(); - auto kernel_impl = kernel_selector.GetImplementation(_kernel_data.kernelName); - kernel_impl->GetUpdateDispatchDataFunc(_kernel_data); + auto kernel_impl = kernel_selector.GetImplementation(_kernels_data[concat_stage].kernelName); + kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[concat_stage]); + if (_kernels_data.size() == 2) { 
+ auto& bt_kernel_selector = bt_kernel_selector_t::Instance(); + auto bt_kernel_impl = bt_kernel_selector.GetImplementation(_kernels_data[beam_table_stage].kernelName); + bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[beam_table_stage]); + } } } + void set_arguments_impl(kv_cache_inst& instance) override {} - kernel_arguments_data get_arguments(const kv_cache_inst& instance) const override { - kernel_arguments_data args = parent::get_arguments(instance); - - args.inputs = { instance.input_memory_ptr(0), instance.input_memory_ptr(1) }; + kernel_arguments_data get_arguments(const kv_cache_inst& instance, size_t stage) const override { + kernel_arguments_data args; + args.shape_info = instance.shape_info_memory_ptr(); + if (stage == concat_stage) { + args.inputs = { instance.input_memory_ptr(0), instance.input_memory_ptr(1) }; + args.outputs = { instance.output_memory_ptr(0) }; + } else if (stage == beam_table_stage) { + args.inputs = { beam_table_prev, instance.input_memory_ptr(2) }; + args.outputs = { beam_table_new }; + } return args; } + void execute_stage(const std::vector& events, kv_cache_inst& instance, std::vector& all_events, size_t stage) { + stream& stream = instance.get_network().get_stream(); + std::vector tmp_events(events); + size_t kernel_offset = 0; + for (size_t s = 0; s < stage; s++) { + kernel_offset += _kernels_data[s].kernels.size(); + } + for (size_t kd_idx = 0; kd_idx < _kernels_data[stage].kernels.size(); ++kd_idx) { + if (_kernels_data[stage].kernels[kd_idx].skip_execution) + continue; + + size_t idx_final = kernel_offset + kd_idx; + // If any user of the prim's users is CPU implementation or network's output, set prim as a output event (event won't be nullptr) + bool needs_completion_event = instance.needs_completion_event(); + + auto& params = _kernels_data[stage].kernels[kd_idx].params; + auto args = get_arguments(instance, stage); + args.scalars = ¶ms.scalars; + + for (const auto& m : instance.get_intermediates_memories()) { + args.intermediates.push_back(m); + } + + stream.set_arguments(*_kernels[idx_final], _kernels_data[stage].kernels[kd_idx].params, args); + + const auto& gws = params.workGroups.global; + const auto& lws = params.workGroups.local; + + GPU_DEBUG_TRACE_DETAIL << "Enqueue stage " << stage << " kernel " << idx_final << ": gws=[" << gws[0] << ", " << gws[1] << ", " << gws[2] << "] " + << "lws=[" << lws[0] << ", " << lws[1] << ", " << lws[2] << "]" + << (needs_completion_event ? 
" has_completion_event=true" : "") << std::endl; + + auto ev = stream.enqueue_kernel(*_kernels[idx_final], params, args, tmp_events, needs_completion_event); + if (_kernels_data[stage].needs_sub_kernels_sync) { + tmp_events = {ev}; + } + all_events.push_back(ev); + } + } + event::ptr execute_impl(const std::vector& events, kv_cache_inst& instance) override { const bool can_be_optimized = instance.get_impl_params()->_can_be_optimized; + auto& stream = instance.get_network().get_stream(); + auto& engine = instance.get_network().get_engine(); const auto& desc = instance.get_typed_desc(); auto& variable = instance.get_network().get_variable(desc->variable_info.variable_id); - auto res_event = parent::execute_impl(events, instance); - variable.set(); + std::vector res_events; + + execute_stage(events, instance, res_events, concat_stage); + + auto impl_param = *instance.get_impl_params(); + auto kv_shape = impl_param.input_layouts[0].get_partial_shape(); + if (desc->indirect && kv_shape[desc->gather_axis].get_length() > 1) { + const auto bt_alloc_type = engine.get_preferred_memory_allocation_type(false); + + auto beam_table_state = dynamic_cast(variable).get_beam_table_state(); + auto bt_layout = instance.get_impl_params()->output_layouts[1]; + auto bt_shape = bt_layout.get_shape(); + + std::swap(beam_table_prev, beam_table_new); + + if (!beam_table_new || beam_table_new->count() < ov::shape_size(bt_shape)) { + auto alloc_shape = bt_shape; + alloc_shape[desc->concat_axis] += instance.get_prealloc_iter_num(); + const layout bt_alloc_layout = {alloc_shape, bt_layout.data_type, bt_layout.format}; + GPU_DEBUG_TRACE_DETAIL << "Realloc beam table to " << bt_alloc_layout.to_short_string() << std::endl; + beam_table_new = engine.allocate_memory(bt_alloc_layout, bt_alloc_type, false); + + // Alloc prev mem too as it will be needed in the future + // That also simplifies arguments setting a little bit as we don't need to handle an optional past state + if (!beam_table_prev) { + beam_table_prev = engine.allocate_memory(bt_alloc_layout, bt_alloc_type, false); + } + } + + instance.set_output_memory(beam_table_new, false, 1); + beam_table_state->set_memory(beam_table_new, instance.get_impl_params()->output_layouts[1]); + + auto bt_kernel_params = get_bt_update_kernel_params(impl_param, beam_table_state->is_set()); + (_kernels_data[beam_table_stage].update_dispatch_data_func)(bt_kernel_params.first, _kernels_data[beam_table_stage]); + execute_stage(events, instance, res_events, beam_table_stage); + beam_table_state->set(); + } + + variable.set(); if (can_be_optimized) { GPU_DEBUG_TRACE_DETAIL << desc->id << " : Output is same as variable memory! 
Skip copying " << std::endl; // When primitive is optimized, concat kernel writes directly to variable memory - return res_event; + return aggregate_events(res_events, stream, res_events.size() > 1); } else { // Othwerise, we need to copy result from out buffer to state memory GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable meomry" << std::endl; - auto& stream = instance.get_network().get_stream(); stream.enqueue_barrier(); auto out = instance.get_network().get_engine().reinterpret_buffer(instance.output_memory(0), variable.get_memory()->get_layout()); @@ -93,13 +198,28 @@ struct kv_cache_impl : typed_primitive_impl_ocl { } } - static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + static layout get_beam_table_layout(const kernel_impl_params& impl_param) { + const auto& primitive = impl_param.typed_desc(); + auto kv_layout = impl_param.get_input_layout(0); + + // expected to be normalized already on primitive creation + auto concat_axis = primitive->concat_axis; + auto gather_axis = primitive->gather_axis; + + auto kv_shape = kv_layout.get_partial_shape(); + auto beam_table_shape = ov::PartialShape(std::vector(kv_shape.size(), 1)); + beam_table_shape[gather_axis] = kv_shape[gather_axis]; + beam_table_shape[concat_axis] = kv_shape[concat_axis]; + return layout{beam_table_shape, impl_param.output_layouts[1].data_type, format::get_default_format(beam_table_shape.size())}; + } + + static kernel_params_t get_concat_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { const auto& primitive = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_shape_agnostic); auto optional_params = get_default_optional_params(impl_param.get_program()); auto axis = primitive->concat_axis; - const auto inputs_count = primitive->input_size(); + const auto inputs_count = 2; params.inputs.resize(inputs_count); for (size_t i = 0; i < inputs_count; ++i) { params.inputs[i] = convert_data_tensor(impl_param.input_layouts[i]); @@ -108,13 +228,74 @@ struct kv_cache_impl : typed_primitive_impl_ocl { params.axis = convert_axis(axis, impl_param.get_output_layout().get_rank()); optional_params.kernelPerInput = true; + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, beam_table_past] + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present] + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(0)}, + {1, in_offsets_map.at(1)}, + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(0)}, + }; + + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + return {params, optional_params}; + } + + static bt_kernel_params_t get_bt_update_kernel_params(const kernel_impl_params& impl_param, bool is_state_set = false) { + auto params = get_default_params(impl_param, true); + auto optional_params = get_default_optional_params(impl_param.get_program()); + + auto inputs_count = 2; + auto bt_present_layout = impl_param.output_layouts[1]; + auto bt_shape = extend_shape_to_rank_from_end(bt_present_layout.get_partial_shape(), 1); + bt_present_layout.set_partial_shape(bt_shape); + layout bt_past_layout = get_beam_table_layout(impl_param); + + auto beam_idx_l = impl_param.input_layouts[2]; + beam_idx_l.set_partial_shape(extend_shape_to_rank_from_end(beam_idx_l.get_partial_shape(), 4)); + + params.inputs.resize(inputs_count); + params.inputs[0] = 
convert_data_tensor(bt_past_layout); + params.inputs[1] = convert_data_tensor(beam_idx_l); + params.outputs[0] = convert_data_tensor(bt_present_layout); + params.inputs.resize(inputs_count); + params.is_state_set = is_state_set; + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, beam_table_past]] + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present] + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(3)}, // beam_table_past + {1, in_offsets_map.at(2)}, // beam_idx + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(1)}, // beam_table_present + }; + + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + return {params, optional_params}; } + static std::unique_ptr create(const typed_program_node& arg, const kernel_impl_params& impl_param) { + std::vector kernels_data; + auto concat_kernel_params = get_concat_kernel_params(impl_param, impl_param.is_dynamic()); + auto& concat_kernel_selector = kernel_selector_t::Instance(); + kernels_data.push_back(concat_kernel_selector.get_best_kernel(concat_kernel_params.first, concat_kernel_params.second)); + const bool indirect = impl_param.typed_desc()->indirect; + if (indirect) { + auto bt_update_kernel_params = get_bt_update_kernel_params(impl_param, false); + auto& bt_update_kernel_selector = bt_kernel_selector_t::Instance(); + kernels_data.push_back(bt_update_kernel_selector.get_best_kernel(bt_update_kernel_params.first, bt_update_kernel_params.second)); + } + return cldnn::make_unique(kernels_data); + } + void update_dispatch_data(const kernel_impl_params& impl_param) override { - auto kernel_params = get_kernel_params(impl_param, true); - (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data); - _kernel_data.kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(0).count() == 0; + auto kv_cache_kernel_params = get_concat_kernel_params(impl_param, impl_param.is_dynamic()); + (_kernels_data[concat_stage].update_dispatch_data_func)(kv_cache_kernel_params.first, _kernels_data[concat_stage]); + _kernels_data[concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(0).count() == 0; } }; @@ -125,13 +306,13 @@ attach_kv_cache_impl::attach_kv_cache_impl() { auto formats = { format::bfyx }; implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, - typed_primitive_impl_ocl::create, + kv_cache_impl::create, types, formats); implementation_map::add(impl_types::ocl, shape_types::static_shape, - typed_primitive_impl_ocl::create, + kv_cache_impl::create, types, formats); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp new file mode 100644 index 00000000000000..658cdc88d18618 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp @@ -0,0 +1,233 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/graph/network.hpp" +#include "intel_gpu/graph/serialization/binary_buffer.hpp" +#include "intel_gpu/graph/serialization/cl_kernel_data_serializer.hpp" +#include "intel_gpu/graph/serialization/helpers.hpp" +#include "intel_gpu/graph/serialization/set_serializer.hpp" +#include "intel_gpu/graph/serialization/string_serializer.hpp" +#include 
"intel_gpu/graph/serialization/vector_serializer.hpp" +#include "intel_gpu/graph/program.hpp" + +#include "kernel_selector_common.h" +#include "openvino/core/except.hpp" +#include "primitive_inst.h" +#include "kernel_selector_helper.h" +#include "register.hpp" +#include "implementation_map.hpp" +#include "concatenation_inst.h" +#include "gather_inst.h" +#include "permute_inst.h" + +#include +#include +#include + +namespace cldnn { +namespace ocl { + +/* +Base class for GPU implementations which require multiple kernel selectors to be used and multiple kernel scheduled. +*/ +template +struct multi_stage_primitive : public typed_primitive_impl { + std::vector _kernels_data; + std::vector _kernels; + + // a pair of batch program hash and kernel entry hash of each ocl impl. + std::pair kernel_dump_info; + + multi_stage_primitive() : _kernels_data({}), _kernels({}) {} + + multi_stage_primitive(const multi_stage_primitive& other) + : typed_primitive_impl() + , _kernels_data(other._kernels_data) + , _kernels({}) { + _kernels.reserve(other._kernels.size()); + for (size_t k = 0; k < other._kernels.size(); ++k) { + _kernels.emplace_back(other._kernels[k]->clone()); + } + this->can_reuse_memory = false; + this->_kernel_name = other._kernel_name; + this->_is_dynamic = other._is_dynamic; + } + + multi_stage_primitive(const std::vector& kd) + : typed_primitive_impl() + , _kernels_data(kd) { + this->can_reuse_memory = false; + this->_kernel_name = kd[0].kernelName; + } + + bool is_cpu() const final { return false; } + + // Cache blob format: + // [ kernel_selector::kernel_data ] + // [ kernel_ids ] + void save(BinaryOutputBuffer& ob) const override { + primitive_impl::save(ob); + ob << _kernels_data.size(); + for (auto& kd : _kernels_data) { + ob << make_data(&kd.internalBufferDataType, sizeof(kernel_selector::Datatype)); + ob << kd.internalBufferSizes; + ob << kd.kernels; + ob << kd.kernelName; + } + } + + void load(BinaryInputBuffer& ib) override { + primitive_impl::load(ib); + size_t kernels_size; + ib >> kernels_size; + _kernels_data.resize(kernels_size); + for (size_t i = 0; i < kernels_size; i++) { + kernel_selector::kernel_data kd; + ib >> make_data(&kd.internalBufferDataType, sizeof(kernel_selector::Datatype)); + ib >> kd.internalBufferSizes; + ib >> kd.kernels; + ib >> kd.kernelName; + _kernels_data[i] = kd; + } + } + +protected: + virtual kernel_arguments_data get_arguments(const typed_primitive_inst& instance, size_t stage) const = 0; + + event::ptr aggregate_events(const std::vector& events, stream& stream, bool group = false, bool is_output = false) const { + if (events.size() == 1 && !is_output) + return events[0]; + + if (group && !is_output) + return stream.group_events(events); + + return events.empty() ? 
stream.create_user_event(true) + : stream.enqueue_marker(events, is_output); + } + + void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override { + _kernels.clear(); + if (!_kernels_data.empty() && !_kernels_data[0].kernels.empty()) { + auto compiled_kernels = kernels_cache.get_kernels(params); + size_t total_kernels = std::accumulate(_kernels_data.begin(), _kernels_data.end(), (size_t)0, + [](size_t acc, const kernel_selector::kernel_data& kd) { + return acc + kd.kernels.size(); + }); + OPENVINO_ASSERT(total_kernels == compiled_kernels.size(), "[GPU] Mismatch between number of expected and actually compiled kernels.\n", + "Expected: ", total_kernels, "\n" + "Got: ", compiled_kernels.size()); + _kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end()); + // batch program hash and kernel entry point to find corresponding cl source code + kernel_dump_info = std::make_pair(std::to_string(kernels_cache.get_kernel_batch_hash(params)), + _kernels_data[0].kernels[0].code.kernelString->entry_point); + for (size_t i = 1; i < _kernels_data[0].kernels.size(); ++i) + kernel_dump_info.second += " " + _kernels_data[0].kernels[i].code.kernelString->entry_point; + } + } + + void init_by_cached_kernels(const kernels_cache& kernels_cache, std::vector& cached_kernel_ids) override { + _kernels.clear(); + + _kernels.reserve(cached_kernel_ids.size()); + for (size_t k = 0; k < cached_kernel_ids.size(); ++k) { + _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(cached_kernel_ids[k])); + } + } + + std::vector get_cached_kernel_ids(const kernels_cache& kernels_cache) override { + return {kernels_cache.get_cached_kernel_ids(_kernels)}; + } + + std::vector get_kernels() const override { + return _kernels; + } + + std::vector get_internal_buffer_layouts_impl() const override { + std::vector layouts; + for (auto& kd : _kernels_data) { + if (kd.internalBufferSizes.empty()) + continue; + + auto dtype = from_data_type(kd.internalBufferDataType); + const auto bpp = data_type_traits::size_of(dtype); + for (auto size : kd.internalBufferSizes) { + layout inbuf_layout = {dtype, format::bfyx, // simple linear format (flattern to x channel) + {1, 1, 1, (tensor::value_type)(size / bpp)}}; + layouts.push_back(inbuf_layout); + } + } + return layouts; + } + + void set_arguments_impl(typed_primitive_inst& instance) override { + if (instance.can_be_optimized()) { + return; + } + + for (size_t stage = 0; stage < _kernels_data.size(); stage++) { + auto& kd = _kernels_data[stage]; + stream& stream = instance.get_network().get_stream(); + for (size_t kd_idx = 0; kd_idx < kd.kernels.size(); ++kd_idx) { + if (kd.kernels[kd_idx].skip_execution) { + continue; + } + + auto args = get_arguments(instance, stage); + args.scalars = &kd.kernels[kd_idx].params.scalars; + + for (const auto& m : instance.get_intermediates_memories()) { + args.intermediates.push_back(m); + } + + stream.set_arguments(*_kernels[kd_idx], kd.kernels[kd_idx].params, args); + } + } + } + + void set_arguments_impl(typed_primitive_inst& instance, kernel_arguments_data& args) override { + OPENVINO_NOT_IMPLEMENTED; + } + + std::vector> get_kernels_source() override { + std::vector> kernel_strings; + for (auto& kd : _kernels_data) { + for (size_t i = 0; i < kd.kernels.size(); ++i) { + kernel_strings.push_back(kd.kernels[i].code.kernelString); + } + } + return kernel_strings; + } + + void reset_kernels_source() override { + for (auto& kd : _kernels_data) { + for (size_t i = 0; i < kd.kernels.size(); 
++i) { + kd.kernels[i].code.kernelString.reset(); + } + } + } + + void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override { + OPENVINO_ASSERT(kernels.size() == 1, "Only the kernels of the single primitive should be allowed."); + auto& kernel_vec = kernels.begin()->second; + _kernels.clear(); + _kernels.resize(kernel_vec.size()); + for (auto& k : kernel_vec) { + auto sub_kernel_idx = k.second; + _kernels[sub_kernel_idx] = k.first; + } + } + + std::vector get_kernels() override { + return _kernels; + } + + std::pair get_kernels_dump_info() const override { + return kernel_dump_info; + } +}; + +} // namespace ocl +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h index 2c9e34efd30792..f40208a25e4c37 100644 --- a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h @@ -5,6 +5,8 @@ #pragma once #include "intel_gpu/primitives/kv_cache.hpp" +#include "openvino/core/dimension.hpp" +#include "openvino/core/partial_shape.hpp" #include "primitive_inst.h" #include "variable.hpp" @@ -21,6 +23,20 @@ struct typed_program_node : public typed_program_node_base { program_node& input() const { return get_dependency(0); } std::vector get_shape_infer_dependencies() const override { return {}; } + + std::vector get_shape_info_input_layouts() const override { + std::vector res; + for (size_t i = 0; i < get_dependencies().size(); i++) { + const auto& d = get_dependency_with_port(i); + res.push_back(d.first->get_output_layout(false, d.second)); + } + + if (get_primitive()->indirect) { // insert an additional input with beam table past layout + res.push_back(layout(ov::PartialShape::dynamic(4), data_types::i32, format::bfyx)); + } + + return res; + } }; using kv_cache_node = typed_program_node; @@ -78,6 +94,7 @@ class typed_primitive_inst : public typed_primitive_inst_basememory_deps = get_const_memory_deps(); params->_can_be_optimized = this->optimized; + params->in_port_to_shape_info_offset = get_input_port_to_shape_info_offset_map(); + params->out_port_to_shape_info_offset = get_output_port_to_shape_info_offset_map(); auto deps = get_dependencies(); for (size_t i = 0; i < deps.size(); i++) { if (!deps[i].first->is_constant()) { @@ -176,6 +178,13 @@ struct program_node { ov::PartialShape get_input_pshape(size_t idx = 0) const; ov::PartialShape get_output_pshape(size_t idx = 0) const; + virtual std::vector get_shape_info_input_layouts() const; + std::map get_input_port_to_shape_info_offset_map() const; + std::map get_output_port_to_shape_info_offset_map() const; + size_t get_total_shape_info_input_size() const; + size_t get_total_shape_info_output_size() const; + size_t get_total_shape_info_size() const; + // replaces idx-th dependency of 'this' with 'new_dep', calls program::remove_if_dangling(old_dep) void replace_dependency(size_t idx, program_node& new_dep, bool remove_if_dangling = true); void replace_dependency(size_t idx, std::pair new_dep, bool remove_if_dangling = true); diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index aaf4a1cbfd4cc9..843763a83b8c5e 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -4,6 +4,7 @@ #include "intel_gpu/op/kv_cache.hpp" #include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" #include "intel_gpu/runtime/optionals.hpp" #include 
"kv_cache_inst.h" #include "primitive_type_base.h" @@ -26,17 +27,24 @@ layout kv_cache_inst::calc_output_layout(const kv_cache_node& node, kernel_impl_ template std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& /*node*/, kernel_impl_params const& impl_param) { auto desc = impl_param.typed_desc(); - auto output_data_type = desc->output_data_types[0].value_or(impl_param.get_input_layout().data_type); ov::intel_gpu::op::KVCache op; + op.set_output_size(desc->num_outputs); op.set_concat_axis(desc->concat_axis); op.set_gather_axis(desc->gather_axis); std::vector input_shapes = {impl_param.get_input_layout(0).get(), impl_param.get_input_layout(1).get()}; - std::vector output_shapes = shape_infer(&op, input_shapes); - return {layout({output_shapes[0], output_data_type, impl_param.get_output_layout().format})}; + const std::map ports_map = {{0, 0}, {1, 2}}; + + std::vector out_layouts; + for (size_t i = 0; i < desc->num_outputs; i++) { + auto out_type = desc->output_data_types[i].value_or(impl_param.get_input_layout(ports_map.at(i)).data_type); + out_layouts.push_back(layout(output_shapes[i], out_type, impl_param.get_output_layout(i).format)); + } + + return out_layouts; } template std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& node, const kernel_impl_params& impl_param); @@ -50,6 +58,7 @@ std::string kv_cache_inst::to_string(const kv_cache_node& node) { kv_cache_info.add("variable type", node.get_primitive()->variable_info.data_type); kv_cache_info.add("concat axis", node.get_primitive()->concat_axis); kv_cache_info.add("gather axis", node.get_primitive()->gather_axis); + kv_cache_info.add("indirect", node.get_primitive()->indirect); node_info->add("kv_cache info", kv_cache_info); std::stringstream primitive_description; node_info->dump(primitive_description); @@ -59,4 +68,46 @@ std::string kv_cache_inst::to_string(const kv_cache_node& node) { int32_t kv_cache_inst::get_prealloc_iter_num() { return 128 + kv_cache_id % 64; } + +void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) { + mem_lock lock(_shape_info_memory, _network.get_stream()); + auto shape_info_ptr = lock.data(); + size_t offset = 0; + + std::vector> input_layouts; // [kv_state, kv_new_token, [beam_idx, bt_past]] + for (size_t i = 0; i < _node->get_dependencies().size(); i++) { + const auto& node_in_lay = _node->get_input_layout(i); + const auto& runtime_in_lay = params.input_layouts[i]; + + input_layouts.emplace_back(runtime_in_lay, node_in_lay); + } + + if (params.typed_desc()->indirect) { + auto& var = dynamic_cast(get_network().get_variable(variable_id())); + const auto& bt_state = var.get_beam_table_state(); + auto bt_layout = bt_state->get_layout(); + if (bt_layout.is_dynamic()) { + auto bt_shape = bt_layout.get_partial_shape(); + for (auto& d : bt_shape) { + if (d.is_dynamic()) + d = 0; + } + bt_layout.set_partial_shape(bt_shape); + } + input_layouts.emplace_back(bt_layout, bt_state->get_initial_layout()); + } + + for (size_t i = 0; i < input_layouts.size(); i++) { + GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i << "]" << std::endl; + fill_shape_info_data(input_layouts[i].first, input_layouts[i].second, shape_info_ptr, offset); + } + + for (size_t i = 0; i < _node->get_output_layouts().size(); i++) { + GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for output[" << i << "]" << std::endl; + const auto& node_out_lay = _node->get_output_layout(i); + const auto& runtime_out_lay = params.output_layouts[i]; + 
fill_shape_info_data(runtime_out_lay, node_out_lay, shape_info_ptr, offset); + } +} + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 3fe7d3b483b59f..6d2ac2678e4cf4 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -1334,7 +1334,8 @@ void network::allocate_primitive_instance(program_node const& node) { kv_cache_ids.push_back(node.id()); } if (auto state_prim = std::dynamic_pointer_cast(inst)) { - set_variables_state_info(state_prim->variable_id(), node.get_output_layout(0), state_prim->get_user_specified_type()); + auto prim = inst->get_node().get_primitive(); + set_variables_state_info(state_prim->variable_id(), node.get_output_layout(0), state_prim->get_user_specified_type(), prim.get()); } if (node.is_constant()) transfer_memory_to_device(inst, node); @@ -1372,7 +1373,7 @@ void network::transfer_memory_to_device(std::shared_ptr instance } } -void network::set_variable(const std::string& name, const std::shared_ptr& variable) { +void network::set_variable(const std::string& name, const std::shared_ptr& variable) { GPU_DEBUG_TRACE_DETAIL << "Set variable " << name << " " << variable->get_layout().to_short_string() << std::endl; _variables_states[name] = variable; } @@ -1381,11 +1382,12 @@ bool network::has_variable(const std::string &variable_id) const { return _variables_states.find(variable_id) != _variables_states.end(); } -ov::intel_gpu::VariableState& network::get_variable(const std::string &variable_id) const { +ov::intel_gpu::VariableStateBase& network::get_variable(const std::string &variable_id) const { auto it = _variables_states.find(variable_id); OPENVINO_ASSERT(it != _variables_states.end(), "[GPU] ", variable_id, " variable not found"); return *it->second; } + const ov::intel_gpu::VariableStateInfo& network::get_variable_info(const std::string &variable_id) const { auto it = _variables_state_info.find(variable_id); OPENVINO_ASSERT(it != _variables_state_info.end(), "[GPU] ", variable_id, " variable info not found"); @@ -1400,8 +1402,14 @@ const ov::intel_gpu::VariablesInfoMap& network::get_variables_info() const { return _variables_state_info; } -void network::set_variables_state_info(const std::string& variable_id, const layout& variable_layout, ov::element::Type user_specified_type) { +void network::set_variables_state_info(const std::string& variable_id, + const layout& variable_layout, + ov::element::Type user_specified_type, + const primitive* p) { _variables_state_info.emplace(variable_id, ov::intel_gpu::VariableStateInfo{variable_id, variable_layout, user_specified_type}); + + _variables_state_info.at(variable_id).m_primitives.insert(p); } + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 31918b7c7ed078..69c7f28065dda1 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -1523,20 +1523,7 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool if (_impl->is_dynamic() && !_impl->is_cpu()) { GPU_DEBUG_TRACE_DETAIL << id() << ": initialize impl with dynamic impl " << _impl->get_kernel_name() << std::endl; _dynamic_impl = _impl->clone(); - // Actual shape info layout is the following: - // input_0 -> input_1, ..., fused_dep_0, fused_dep1, ..., output_0, output_1, ... 
- // For each tensor we save max_rank dimensions in [bfvuwzyx] order - size_t num_dynamic_pads = 0; - for (auto& in : _node->get_dependencies()) { - const auto& dyn_pad_dims = in.first->get_output_layout(false).data_padding.get_dynamic_pad_dims().sizes(); - num_dynamic_pads += std::accumulate(dyn_pad_dims.begin(), dyn_pad_dims.end(), static_cast(0)); - } - for (auto& o : _node->get_output_layouts()) { - const auto& dyn_pad_dims = o.data_padding.get_dynamic_pad_dims().sizes(); - num_dynamic_pads += std::accumulate(dyn_pad_dims.begin(), dyn_pad_dims.end(), static_cast(0)); - } - const int64_t buffers_count = _node->get_dependencies().size() + _node->get_outputs_count(); - const int64_t shape_elements = buffers_count * layout::max_rank() + num_dynamic_pads * 2 /*pad_before + pad_after*/; + const int64_t shape_elements = node.get_total_shape_info_size(); _shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}); } } diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 0329c996acd299..d4dc556b5e2c3e 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -33,6 +33,21 @@ using namespace cldnn; +static size_t get_shape_data_size(const layout& l) { + if (l.is_static()) + return 0; + + size_t size = layout::max_rank(); // all dimenstions are stored + auto dynamic_pad = l.data_padding.get_dynamic_pad_dims().sizes(format::get_default_format(layout::max_rank())); + for (size_t j = 0; j < layout::max_rank(); ++j) { + if (dynamic_pad[j] == 1) { + size += 2; // lower + upper + } + } + + return size; +} + thread_local size_t program_node::cur_id = 0; program_node::program_node(std::shared_ptr prim, program& prog) @@ -96,6 +111,62 @@ ov::PartialShape program_node::get_output_pshape(size_t idx) const { return get_output_layout(idx).get_partial_shape(); } +std::vector program_node::get_shape_info_input_layouts() const { + std::vector res; + for (size_t i = 0; i < get_dependencies().size(); i++) { + const auto& d = get_dependency_with_port(i); + res.push_back(d.first->get_output_layout(false, d.second)); + } + + return res; +} + +std::map program_node::get_input_port_to_shape_info_offset_map() const { + std::map res; + size_t offset = 0; + const auto& deps = get_shape_info_input_layouts(); + for (size_t i = 0; i < deps.size(); i++) { + res[i] = offset; + offset += get_shape_data_size(deps[i]); + } + + return res; +} + +std::map program_node::get_output_port_to_shape_info_offset_map() const { + std::map res; + size_t offset = get_total_shape_info_input_size(); + for (size_t i = 0; i < output_layouts.size(); i++) { + res[i] = offset; + offset += get_shape_data_size(output_layouts[i]); + } + + return res; +} + +size_t program_node::get_total_shape_info_input_size() const { + size_t offset = 0; + const auto& deps = get_shape_info_input_layouts(); + for (size_t i = 0; i < deps.size(); i++) { + offset += get_shape_data_size(deps[i]); + } + + return offset; +} + +size_t program_node::get_total_shape_info_output_size() const { + size_t offset = 0; + for (size_t i = 0; i < output_layouts.size(); i++) { + offset += get_shape_data_size(output_layouts[i]); + } + + return offset; +} + +size_t program_node::get_total_shape_info_size() const { + return get_total_shape_info_input_size() + get_total_shape_info_output_size(); +} + void program_node::replace_dependency(size_t idx, program_node& new_dep, bool remove_if_dangling) { return 
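To make the shape_info bookkeeping above concrete: each dynamic tensor contributes layout::max_rank() elements (its dimensions in bfvuwzyx order) plus two extra elements (lower/upper pad) per dynamically padded dimension; the per-port offsets accumulate over inputs first and then outputs, and those maps are what set_dynamic_shape_offsets later applies to the kernel-selector tensors. Below is a minimal stand-alone sketch of that accounting with plain std types; ToyLayout, kMaxRank = 8 and the pad mask are simplifications of cldnn::layout, not its real interface:

#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

// Simplified stand-in for cldnn::layout: only what the offset math needs.
struct ToyLayout {
    bool is_static = false;
    std::vector<int> dynamic_pad_mask;  // 1 per dimension that has dynamic padding
};

constexpr size_t kMaxRank = 8;  // bfvuwzyx; stands in for layout::max_rank()

// Number of i32 elements this tensor occupies in the shape_info buffer.
size_t shape_data_size(const ToyLayout& l) {
    if (l.is_static)
        return 0;                       // static tensors need no runtime shape entry
    size_t size = kMaxRank;             // all dimensions are stored
    for (int has_dyn_pad : l.dynamic_pad_mask)
        if (has_dyn_pad) size += 2;     // lower + upper pad
    return size;
}

// Port -> offset map, accumulated in port order starting from 'start'.
std::map<size_t, size_t> port_offsets(const std::vector<ToyLayout>& tensors, size_t start) {
    std::map<size_t, size_t> res;
    size_t offset = start;
    for (size_t i = 0; i < tensors.size(); ++i) {
        res[i] = offset;
        offset += shape_data_size(tensors[i]);
    }
    return res;
}

int main() {
    std::vector<ToyLayout> inputs = {{false, std::vector<int>(kMaxRank, 0)},
                                     {false, {0, 0, 0, 0, 0, 0, 1, 0}}};  // one dynamically padded dim
    std::vector<ToyLayout> outputs = {{false, std::vector<int>(kMaxRank, 0)}};

    auto in_map = port_offsets(inputs, 0);
    size_t in_total = 0;
    for (const auto& l : inputs) in_total += shape_data_size(l);
    auto out_map = port_offsets(outputs, in_total);

    std::cout << "input offsets: " << in_map.at(0) << ", " << in_map.at(1)
              << "; output offset: " << out_map.at(0)
              << "; total elements: " << in_total + shape_data_size(outputs[0]) << "\n";
    // input offsets: 0, 8; output offset: 18; total elements: 26
}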
replace_dependency(idx, std::make_pair(&new_dep, 0), remove_if_dangling); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/beam_table_update_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/beam_table_update_ref.cl new file mode 100644 index 00000000000000..009cef79c25c53 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/beam_table_update_ref.cl @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/common.cl" + +KERNEL(beam_table_update)( + OPTIONAL_SHAPE_INFO_ARG + __global const INPUT0_TYPE* state_prev, + __global const INPUT1_TYPE* beam_idx, + __global OUTPUT_TYPE* state_new, + uchar is_state_set) +{ + const unsigned int b = (uint)get_global_id(0); + const unsigned int s = (uint)get_global_id(1); + + const unsigned int out_offset = b * OUTPUT_BATCH_PITCH + s; + const unsigned int in_offset = beam_idx[b] * INPUT0_BATCH_PITCH + s; + + if (s >= OUTPUT_BATCH_PITCH) + return; + + if (!is_state_set) { + state_new[out_offset] = TO_OUTPUT_TYPE(b); + } else { + if (s < INPUT0_BATCH_PITCH) { + state_new[out_offset] = state_prev[in_offset]; + } else { + state_new[out_offset] = b; + } + } +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_ref.cl index 63af13c5d5e42e..893cc1552778f1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_ref.cl @@ -49,7 +49,26 @@ inline uint FUNC(get_input1_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint return FUNC_CALL(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT1_DIMS_ORDER); } -#ifdef INPUT2_TYPE +#if BEAM_TABLE_TERM +inline uint FUNC(get_bt_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if BEAM_TABLE_SIMPLE + return GET_DATA_INDEX_6D_SAFE(BEAM_TABLE, b, f, w, z, y, x); +#else +# error gemm_ref.cl : Unsupported beam table format +#endif +} + +inline uint FUNC(get_bt_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if INDIRECT_INPUT0 + return FUNC_CALL(get_bt_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT0_DIMS_ORDER); +#else + return FUNC_CALL(get_bt_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT1_DIMS_ORDER); +#endif +} + +#endif // BEAM_TABLE_TERM + +#ifdef BIAS_TERM inline uint FUNC(get_input2_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { #if INPUT2_SIMPLE return GET_DATA_INDEX_6D_SAFE(INPUT2, b, f, w, z, y, x); @@ -65,7 +84,7 @@ inline uint FUNC(get_input2_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint #endif #endif } -#endif // INPUT2_TYPE +#endif // BIAS_TERM #define INPUT0_SIZE_F INPUT0_FEATURE_NUM #define INPUT0_SIZE_B INPUT0_BATCH_NUM @@ -74,8 +93,11 @@ KERNEL(gemm_ref)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input0, const __global INPUT1_TYPE* input1, -#ifdef INPUT2_TYPE +#ifdef BIAS_TERM const __global INPUT2_TYPE* input2, +#endif +#if BEAM_TABLE_TERM + const __global BEAM_TABLE_TYPE* beam_table, #endif __global OUTPUT_TYPE* output #if HAS_FUSED_OPS_DECLS @@ -100,8 +122,17 @@ KERNEL(gemm_ref)( ACCUMULATOR_TYPE acc = ACCUMULATOR_VAL_ZERO; for (uint ki = 0; ki < K; ++ki) { - uint in0_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, ki); - uint in1_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, ki, x); + uint b0 = b; + uint b1 = b; + #if INDIRECT_INPUT0 + b0 = 
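beam_table_update_ref.cl above is easiest to read as a per-step gather over beam ancestry: on the first call (is_state_set == 0) the table is an identity mapping, afterwards the history rows are re-read through beam_idx and the newly appended positions keep their own batch index. A host-side reference of the same rule using plain vectors; update_beam_table is an illustrative name, not a plugin function:

#include <cstdint>
#include <iostream>
#include <vector>

// new_table[b][s] answers: "which physical batch row held position s of logical beam b?"
std::vector<std::vector<int32_t>> update_beam_table(const std::vector<std::vector<int32_t>>& prev,
                                                    const std::vector<int32_t>& beam_idx,
                                                    size_t new_len,
                                                    bool is_state_set) {
    const size_t batch = beam_idx.size();
    std::vector<std::vector<int32_t>> next(batch, std::vector<int32_t>(new_len));
    for (size_t b = 0; b < batch; ++b) {
        for (size_t s = 0; s < new_len; ++s) {
            if (!is_state_set) {
                next[b][s] = static_cast<int32_t>(b);   // first token: identity mapping
            } else if (s < prev[b].size()) {
                next[b][s] = prev[beam_idx[b]][s];      // reorder history by the chosen parent beam
            } else {
                next[b][s] = static_cast<int32_t>(b);   // freshly appended token stays in place
            }
        }
    }
    return next;
}

int main() {
    // Step 1: 2 beams, 3 prompt tokens, no state yet.
    auto t1 = update_beam_table({}, {0, 1}, 3, false);
    // Step 2: both beams pick parent 0, one new token appended.
    auto t2 = update_beam_table(t1, {0, 0}, 4, true);
    for (const auto& row : t2) {
        for (auto v : row) std::cout << v << ' ';
        std::cout << '\n';
    }
    // Prints: "0 0 0 0" and "0 0 0 1".
}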
BEAM_TABLE_BATCH_NUM > 1 ? beam_table[FUNC_CALL(get_bt_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, ki)] : b; + #endif + #if INDIRECT_INPUT1 + b1 = BEAM_TABLE_BATCH_NUM > 1 ? beam_table[FUNC_CALL(get_bt_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, ki, x)] : b; + #endif + + uint in0_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b0, f, w, z, y, ki); + uint in1_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b1, f, w, z, ki, x); ACCUMULATOR_TYPE val0 = TO_ACCUMULATOR_TYPE(input0[in0_idx]); ACCUMULATOR_TYPE val1 = TO_ACCUMULATOR_TYPE(input1[in1_idx]); @@ -111,7 +142,7 @@ KERNEL(gemm_ref)( acc = TO_ACCUMULATOR_TYPE(ALPHA) * acc; -#ifdef INPUT2_TYPE +#ifdef BIAS_TERM { uint in2_idx = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x); ACCUMULATOR_TYPE val2 = TO_ACCUMULATOR_TYPE(input2[in2_idx]); @@ -134,4 +165,4 @@ KERNEL(gemm_ref)( } #undef INPUT0_SIZE_F -#undef INPUT0_SIZE_B \ No newline at end of file +#undef INPUT0_SIZE_B diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl index c9183822ccb803..ba775f9bd1192e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl @@ -52,7 +52,40 @@ inline uint FUNC(get_input1_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint return FUNC_CALL(get_input1_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT1_DIMS_ORDER); } -#ifdef INPUT2_TYPE +#if BEAM_TABLE_TERM +inline uint FUNC(get_bt_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if BEAM_TABLE_SIMPLE + return GET_DATA_INDEX_6D_SAFE(BEAM_TABLE, b, f, w, z, y, x); +#else +# error gemm_tiled_ops.cl : Unsupported beam table format +#endif +} + +inline uint FUNC(get_bt_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { +#if INDIRECT_INPUT0 + return FUNC_CALL(get_bt_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT0_DIMS_ORDER); +#else + return FUNC_CALL(get_bt_index_nt)(OPTIONAL_SHAPE_INFO_TENSOR INPUT1_DIMS_ORDER); +#endif +} + +#endif // BEAM_TABLE_TERM + +#if INDIRECT_INPUT0 +inline uint FUNC(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x, __global BEAM_TABLE_TYPE* beam_table) { + int b_index = BEAM_TABLE_BATCH_NUM > 1 ? beam_table[FUNC_CALL(get_bt_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x)] : b; + return FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b_index, f, w, z, y, x); +} +#endif + +#if INDIRECT_INPUT1 +inline uint FUNC(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x, __global BEAM_TABLE_TYPE* beam_table) { + int b_index = BEAM_TABLE_BATCH_NUM > 1 ? 
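The b0/b1 selection in gemm_ref.cl above is the whole indirection trick: when an operand is marked indirect, the batch used for a given reduction index is looked up in the beam table instead of the logical batch, so the KV cache never has to be physically reordered between generation steps. A scalar C++ reference of that inner loop, simplified to dense 3-D tensors and ignoring the f/w/z dimensions and the BEAM_TABLE_BATCH_NUM == 1 fast path:

#include <cstdint>
#include <iostream>
#include <vector>

// C[b] = A[b] * B[bt(b, k)]: each K-slice of the (cached) B operand is read from the
// physical batch row recorded in the beam table instead of the logical batch index.
std::vector<std::vector<std::vector<float>>> indirect_matmul(
        const std::vector<std::vector<std::vector<float>>>& A,        // [batch][M][K]
        const std::vector<std::vector<std::vector<float>>>& B,        // [batch][K][N]
        const std::vector<std::vector<int32_t>>& beam_table,          // [batch][K]
        bool indirect_b) {
    const size_t batch = A.size(), M = A[0].size(), K = A[0][0].size(), N = B[0][0].size();
    std::vector<std::vector<std::vector<float>>> C(
        batch, std::vector<std::vector<float>>(M, std::vector<float>(N, 0.f)));
    for (size_t b = 0; b < batch; ++b)
        for (size_t m = 0; m < M; ++m)
            for (size_t n = 0; n < N; ++n)
                for (size_t k = 0; k < K; ++k) {
                    const size_t b1 = indirect_b ? static_cast<size_t>(beam_table[b][k]) : b;
                    C[b][m][n] += A[b][m][k] * B[b1][k][n];
                }
    return C;
}

int main() {
    // 2 beams, M=1, K=2, N=1; beam 1 reuses beam 0's first cache entry.
    std::vector<std::vector<std::vector<float>>> A = {{{1.f, 1.f}}, {{1.f, 1.f}}};
    std::vector<std::vector<std::vector<float>>> B = {{{10.f}, {20.f}}, {{30.f}, {40.f}}};
    std::vector<std::vector<int32_t>> bt = {{0, 0}, {0, 1}};
    auto C = indirect_matmul(A, B, bt, /*indirect_b=*/true);
    std::cout << C[0][0][0] << ' ' << C[1][0][0] << '\n';  // prints: 30 50
}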
beam_table[FUNC_CALL(get_bt_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x)] : b; + return FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_index, f, w, z, y, x); +} +#endif + +#ifdef BIAS_TERM inline uint FUNC(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) { #if INPUT2_SIMPLE return GET_DATA_INDEX_6D_SAFE(INPUT2, b, f, w, z, 0, 0); @@ -60,7 +93,7 @@ inline uint FUNC(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f # error gemm_tiled_opt.cl : Unsupported input 2 format #endif // INPUT2_SIMPLE } -#endif // INPUT2_TYPE +#endif // BIAS_TERM #define VLOAD CAT(vload, SIMD_WIDTH) @@ -71,9 +104,12 @@ KERNEL(gemm_tiled_opt)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input0, const __global INPUT1_TYPE* input1, -#ifdef INPUT2_TYPE +#ifdef BIAS_TERM const __global INPUT2_TYPE* input2, -#endif // INPUT2_TYPE +#endif // BIAS_TERM +#if BEAM_TABLE_TERM + const __global BEAM_TABLE_TYPE* beam_table, +#endif __global OUTPUT_TYPE* output #if HAS_FUSED_OPS_DECLS , FUSED_OPS_DECLS @@ -113,9 +149,9 @@ KERNEL(gemm_tiled_opt)( // Batch offsets const uint batch_offset_input0 = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, 0); const uint batch_offset_input1 = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, 0, tile_n_offset); -#ifdef INPUT2_TYPE +#ifdef BIAS_TERM const uint batch_offset_input2 = FUNC_CALL(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z); -#endif // INPUT2_TYPE +#endif // BIAS_TERM uint write_id = 0; const uint batch_offset_output = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR TR_B, TR_F, TR_W, TR_Z, TR_Y, TR_X); write_id = 1; @@ -159,13 +195,17 @@ KERNEL(gemm_tiled_opt)( #endif const uint input1_fetch_size = ((N - tile_n_offset) < TILE_K) ? (N - tile_n_offset) : TILE_K; #endif // TRANSPOSE_INPUT1 -#ifdef INPUT2_TYPE +#ifdef BIAS_TERM const __global INPUT2_TYPE* c_ptr = input2 + batch_offset_input2 + tile_m_offset * N + tile_n_offset; -#endif // INPUT2_TYPE +#endif // BIAS_TERM __global OUTPUT_TYPE* d_ptr = output + batch_offset_output; const uint b_raw_global_id = tile_n_offset + sglid; +#if INDIRECT_INPUT0 || INDIRECT_INPUT1 + const char do_indirect_load = BEAM_TABLE_BATCH_NUM > 1; +#endif + #if TRANSPOSE_INPUT0 != TRANSPOSE_X_LAST MAKE_VECTOR_TYPE(INPUT0_TYPE, SIMD_WIDTH) a_tile; #endif // TRANSPOSE_INPUT0 != TRANSPOSE_X_LAST @@ -185,51 +225,110 @@ KERNEL(gemm_tiled_opt)( // Loading B tile unroll_for (uint b_load_id = 0; b_load_id < TILE_K; b_load_id++) { -#if IS_DYNAMIC -#if TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST -#if HAS_DYNAMIC_N_PADDING || INPUT1_HAS_PADDING - b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]; -#else - b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0); +#if INDIRECT_INPUT1 + uint b_load_offset = (k * TILE_K) + b_load_id; #endif - b_ptr += input1_offset; +#if IS_DYNAMIC + #if TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST + #if INDIRECT_INPUT1 + if (do_indirect_load) + { + uint b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + b_tile[b_load_id] = b_raw_global_id >= N ? 0 : input1[b_idx]; + } + else + #endif + { + #if HAS_DYNAMIC_N_PADDING || INPUT1_HAS_PADDING + b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]; + #else + b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 
0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0); + #endif + b_ptr += input1_offset; + } #elif TRANSPOSE_INPUT1 == TRANSPOSE_OTHER // TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST if (b_raw_global_id > N - 1) { b_tile[b_load_id] = 0; } else { - uint b_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (b_load_id + k * TILE_K), x); + uint b_idx = 0; +#if INDIRECT_INPUT1 + if (do_indirect_load) + { + b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + } + else +#endif // INDIRECT_INPUT1 + { + b_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (b_load_id + k * TILE_K), x); + } b_tile[b_load_id] = input1[b_idx]; } #endif // TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST #else // IS_DYNAMIC #if TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST -#if TILE_N_NOT_DIVISIBLE - b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]; -#else // TILE_N_NOT_DIVISIBLE - b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0); -#endif // TILE_N_NOT_DIVISIBLE - b_ptr += input1_offset; + #if INDIRECT_INPUT1 + if (do_indirect_load) + { + uint b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + b_tile[b_load_id] = b_raw_global_id >= N ? 0 : input1[b_idx]; + } + else + #endif // INDIRECT_INPUT1 + { + #if TILE_N_NOT_DIVISIBLE + b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]; + #else + b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0); + #endif + b_ptr += input1_offset; + } #elif TRANSPOSE_INPUT1 == TRANSPOSE_OTHER // TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST if (b_raw_global_id > N - 1) { b_tile[b_load_id] = 0; } else { - uint b_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (b_load_id + k * TILE_K), x); + uint b_idx = 0; +#if INDIRECT_INPUT1 + if (do_indirect_load) + { + b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + } + else +#endif // INDIRECT_INPUT1 + { + b_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (b_load_id + k * TILE_K), x); + } b_tile[b_load_id] = input1[b_idx]; } #endif // TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST #endif // IS_DYNAMIC } // Loading B tile end #if TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST - b_ptr = b_ptr + (input1_offset * sglid); - b_tile = (N > b_raw_global_id) ? VLOAD(0, b_ptr) : 0; - b_ptr = b_ptr + input1_offset1 - (input1_offset * sglid); +#if INDIRECT_INPUT1 + if (do_indirect_load) + { + unroll_for (uint b_load_id = 0; b_load_id < TILE_K; b_load_id++) { + uint b_load_offset = (k * TILE_K) + b_load_id; + uint b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + b_tile[b_load_id] = b_raw_global_id >= N ? 0 : input1[b_idx]; + } + } + else +#endif + { + b_ptr = b_ptr + (input1_offset * sglid); + b_tile = (N > b_raw_global_id) ? 
VLOAD(0, b_ptr) : 0; + b_ptr = b_ptr + input1_offset1 - (input1_offset * sglid); + } #endif // TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST // Loading A tile and tile C calculation unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) { #if TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST #if IS_DYNAMIC -#if HAS_DYNAMIC_K_PADDING || INPUT0_HAS_PADDING +#if INDIRECT_INPUT0 + uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid), beam_table); + A_FLOATN a_read = input0[a_idx]; +#elif HAS_DYNAMIC_K_PADDING || INPUT0_HAS_PADDING // In case of dynamic padding we can't guarantee memory access alignment for // block reads (4 bytes), so use scattered read uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid)); @@ -238,7 +337,10 @@ KERNEL(gemm_tiled_opt)( A_FLOATN a_read = TILE_K_NOT_DIVISIBLE ? a_ptr[sglid] : BLOCK_READ_A(a_ptr, 0); #endif #else // IS_DYNAMIC -#if TILE_K_NOT_DIVISIBLE +#if INDIRECT_INPUT0 + uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid), beam_table); + A_FLOATN a_read = input0[a_idx]; +#elif TILE_K_NOT_DIVISIBLE A_FLOATN a_read = a_ptr[sglid]; #else // TILE_K_NOT_DIVISIBLE A_FLOATN a_read = BLOCK_READ_A(a_ptr, 0); @@ -257,7 +359,11 @@ KERNEL(gemm_tiled_opt)( } } #elif TRANSPOSE_INPUT0 == TRANSPOSE_OTHER // TRANSPOSE_INPUT0 +#if INDIRECT_INPUT0 + uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid), beam_table); +#else uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid)); +#endif a_tile[dot_id] = input0[a_idx]; #endif // TRANSPOSE_INPUT0 } // Loading A tile and tile C calculation end @@ -266,9 +372,16 @@ KERNEL(gemm_tiled_opt)( a_ptr = a_ptr + input0_offset1 - (input0_offset * tile_m_iterations); #else // TRANSPOSE_INPUT0 #if TRANSPOSE_INPUT0 == TRANSPOSE_Y_LAST - a_ptr = a_ptr + (input0_offset * sglid); - a_tile = VLOAD(0, a_ptr); - a_ptr = a_ptr + input0_offset1 - (input0_offset * sglid); + #if INDIRECT_INPUT0 + unroll_for (uint a_load_id = 0; a_load_id < SIMD_WIDTH; a_load_id++) { + uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + a_load_id), (k * TILE_K + sglid), beam_table); + a_tile[a_load_id] = input0[a_idx]; + } + #else + a_ptr = a_ptr + (input0_offset * sglid); + a_tile = VLOAD(0, a_ptr); + a_ptr = a_ptr + input0_offset1 - (input0_offset * sglid); + #endif #endif // Tile C calculation for TN, TT cases @@ -285,30 +398,70 @@ KERNEL(gemm_tiled_opt)( if (TILE_K_NOT_DIVISIBLE) { // Loading leftovers of the matrix B unroll_for (uint b_load_id = 0; b_load_id < TILE_K_LEFTOVER; b_load_id++) { + #if INDIRECT_INPUT1 + uint b_load_offset = (K_FULL_ITERATIONS * TILE_K) + b_load_id; + #endif #if TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST + #if INDIRECT_INPUT1 + if (do_indirect_load) + { + uint b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + b_tile[b_load_id] = b_raw_global_id >= N ? 0 : input1[b_idx]; + } + else + #endif + { #if HAS_DYNAMIC_N_PADDING || INPUT1_HAS_PADDING - b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]; + b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]; #else - b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 
0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0); + b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0); #endif - b_ptr += input1_offset; + b_ptr += input1_offset; + } #elif TRANSPOSE_INPUT1 == TRANSPOSE_OTHER // TRANSPOSE_INPUT1 == 0 if (b_raw_global_id > N - 1) { b_tile[b_load_id] = 0; } else { - uint b_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (b_load_id + K_FULL_ITERATIONS * TILE_K), x); + uint b_idx = 0; +#if INDIRECT_INPUT1 + if (do_indirect_load) + { + b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + } + else +#endif + { + b_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (b_load_id + K_FULL_ITERATIONS * TILE_K), x); + } b_tile[b_load_id] = input1[b_idx]; } #endif } // Loading leftovers of the matrix B end #if TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST - b_ptr = b_ptr + (input1_offset * sglid); - b_tile = (N > b_raw_global_id) ? VLOAD(0, b_ptr) : 0; + #if INDIRECT_INPUT1 + if (do_indirect_load) + { + unroll_for (uint b_load_id = 0; b_load_id < TILE_K; b_load_id++) { + uint b_load_offset = (K_FULL_ITERATIONS * TILE_K) + b_load_id; + uint b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + b_tile[b_load_id] = b_raw_global_id >= N ? 0 : input1[b_idx]; + } + } + else + #endif + { + b_ptr = b_ptr + (input1_offset * sglid); + b_tile = (N > b_raw_global_id) ? VLOAD(0, b_ptr) : 0; + } #endif // TRANSPOSE_INPUT1 // Loading leftovers of the matrix A and tile C calculation unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) { +#if INDIRECT_INPUT0 + uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid), beam_table); +#else uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid)); +#endif INPUT0_TYPE a_read = input0[a_idx]; unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) { @@ -320,30 +473,69 @@ KERNEL(gemm_tiled_opt)( #if TILE_K_NOT_DIVISIBLE // Loading leftovers of the matrix B unroll_for (uint b_load_id = 0; b_load_id < TILE_K_LEFTOVER; b_load_id++) { + #if INDIRECT_INPUT1 + uint b_load_offset = (K_FULL_ITERATIONS * TILE_K) + b_load_id; + #endif #if TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST + #if INDIRECT_INPUT1 + if (do_indirect_load) + { + uint b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + b_tile[b_load_id] = b_raw_global_id >= N ? 0 : input1[b_idx]; + } + else + #endif + { #if TILE_N_NOT_DIVISIBLE - b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]; + b_tile[b_load_id] = b_raw_global_id > N - 1 ? 
0 : b_ptr[sglid]; #else // TILE_N_NOT_DIVISIBLE - b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0); + b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0); #endif // TILE_N_NOT_DIVISIBLE - b_ptr += input1_offset; + b_ptr += input1_offset; + } #elif TRANSPOSE_INPUT1 == TRANSPOSE_OTHER // TRANSPOSE_INPUT1 == 0 if (b_raw_global_id > N - 1) { b_tile[b_load_id] = 0; } else { - uint b_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (b_load_id + K_FULL_ITERATIONS * TILE_K), x); + uint b_idx = 0; + #if INDIRECT_INPUT1 + if (do_indirect_load) + { + b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + } + else + #endif + { + b_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (b_load_id + K_FULL_ITERATIONS * TILE_K), x); + } b_tile[b_load_id] = input1[b_idx]; } #endif } // Loading leftovers of the matrix B end #if TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST - b_ptr = b_ptr + (input1_offset * sglid); - b_tile = (N > b_raw_global_id) ? VLOAD(0, b_ptr) : 0; - #endif // TRANSPOSE_INPUT1 + #if INDIRECT_INPUT1 + if (do_indirect_load) { + unroll_for (uint b_load_id = 0; b_load_id < TILE_K; b_load_id++) { + uint b_load_offset = (K_FULL_ITERATIONS * TILE_K) + b_load_id; + uint b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table); + b_tile[b_load_id] = b_raw_global_id >= N ? 0 : input1[b_idx]; + } + } + else + #endif + { + b_ptr = b_ptr + (input1_offset * sglid); + b_tile = (N > b_raw_global_id) ? VLOAD(0, b_ptr) : 0; + } + #endif // TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST // Loading leftovers of the matrix A and tile C calculation unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) { +#if INDIRECT_INPUT0 + uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid), beam_table); +#else uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid)); +#endif INPUT0_TYPE a_read = input0[a_idx]; unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) { @@ -369,11 +561,11 @@ KERNEL(gemm_tiled_opt)( unroll_for (uint write_id = 0; write_id < tile_m_iterations; write_id++) { #if IS_DYNAMIC if (b_raw_global_id < N) { -#ifdef INPUT2_TYPE +#ifdef BIAS_TERM ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id] + TO_ACCUMULATOR_TYPE(BETA) * c_ptr[sglid]; -#else // INPUT2_TYPE +#else // BIAS_TERM ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id]; -#endif // INPUT2_TYPE +#endif // BIAS_TERM #if HAS_FUSED_OPS #if FUSED_OPS_CAN_USE_PRELOAD @@ -390,11 +582,11 @@ KERNEL(gemm_tiled_opt)( #else // IS_DYNAMIC #if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1 if (b_raw_global_id < N) { -#ifdef INPUT2_TYPE +#ifdef BIAS_TERM ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id] + TO_ACCUMULATOR_TYPE(BETA) * c_ptr[sglid]; -#else // INPUT2_TYPE +#else // BIAS_TERM ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id]; -#endif // INPUT2_TYPE +#endif // BIAS_TERM #if HAS_FUSED_OPS #if FUSED_OPS_CAN_USE_PRELOAD @@ -411,12 +603,12 @@ KERNEL(gemm_tiled_opt)( #else // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1 -#ifdef INPUT2_TYPE +#ifdef BIAS_TERM B_FLOATN c_val = BLOCK_READ_B(c_ptr, 0); ACCUMULATOR_TYPE_VEC dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id] + TO_ACCUMULATOR_TYPE(BETA) * c_val; -#else // INPUT2_TYPE +#else // 
BIAS_TERM ACCUMULATOR_TYPE_VEC dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id]; -#endif // INPUT2_TYPE +#endif // BIAS_TERM #if HAS_FUSED_OPS #if FUSED_OPS_CAN_USE_PRELOAD @@ -433,9 +625,9 @@ KERNEL(gemm_tiled_opt)( #endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1 #endif // IS_DYNAMIC d_ptr += batch_offset_output_diff; -#ifdef INPUT2_TYPE +#ifdef BIAS_TERM c_ptr += N; -#endif // INPUT2_TYPE +#endif // BIAS_TERM } // Writing result in the global memory end } diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h index 0f026a77ae855c..8952d5b1417bc0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h @@ -14,6 +14,7 @@ namespace kernel_selector { enum class KernelType { UNKNOWN, ARG_MAX_MIN, + BEAM_TABLE_UPDATE, CONVOLUTION, DECONVOLUTION, DFT, diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp index 52853d7012f1cf..99d85e32462a98 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp @@ -78,7 +78,7 @@ std::string KernelBaseOpenCL::GetEntryPoint(const std::string& templateName, std::replace(kernelID.begin(), kernelID.end(), '/', '_'); // UniqueID = program_id + processing_index + additional weight/reorder tag - kernelID += "_" + params.uniqueID + "_" + std::to_string(partID); + kernelID += "_" + params.uniqueID + "_" + std::to_string(partID) + "_" + std::to_string(params.stage_id); // Add "__sa" suffix for shape agnostic kernels if (params.is_shape_agnostic) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h index dec0d3476ce9a5..05b6ef5a0b8aec 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include @@ -234,6 +235,9 @@ class ParamsKey { struct quantize_t { uint32_t scale_shift_opt : 1; } quantize; + struct gemm_t { + uint32_t indirect : 1; + } gemm; } dedicated; } val; uint64_t raw; @@ -320,7 +324,7 @@ class ParamsKey { void EnableDeformableMask() { key.restrict.val.dedicated.conv.deformable_mask_enabled = 1; } void EnableQuantizeScaleShiftOpt() { key.restrict.val.dedicated.quantize.scale_shift_opt = 1; } - + void EnableIndirectGemm() { key.restrict.val.dedicated.gemm.indirect = 1; } void EnableWinogradReorder() { key.restrict.val.dedicated.reorder.winograd = 1; } void EnableRotateReorder() { key.restrict.val.dedicated.reorder.rotate = 1; } void EnableSurfaceInputSupport() { key.restrict.val.dedicated.reorder.surface_input = 1; } @@ -400,6 +404,10 @@ struct Params { virtual void set_dynamic_shape_offsets() { return; } + virtual void set_dynamic_shape_offsets(std::map in_tensor_to_offset_map, std::map out_tensor_to_offset_map) { + return; + } + protected: Params(KernelType kt, const std::string& id) : kType(kt), layerID(id), is_shape_agnostic(false) {} KernelType kType; @@ -410,6 +418,7 @@ struct Params { EngineInfo engineInfo; std::string uniqueID; bool is_shape_agnostic; + size_t stage_id; virtual std::string to_string() const; virtual std::string to_cache_string_v2() const; }; @@ -681,6 +690,22 @@ struct base_params : public Params { } } + void set_dynamic_shape_offsets(std::map 
in_tensor_to_offset_map, std::map out_tensor_to_offset_map) override { + for (size_t i = 0; i < inputs.size(); i++) { + auto& in = inputs[i]; + OPENVINO_ASSERT(in_tensor_to_offset_map.count(i) > 0, "[GPU] set_dynamic_shape_offsets expects all input tensors have mapping to the offset"); + size_t offset = in_tensor_to_offset_map.at(i); + in.SetDynamicShapeOffset(offset); + } + OPENVINO_ASSERT(fused_ops.empty(), "[GPU] set_dynamic_shape_offsets with mappings doesn't support fused ops for now"); + for (size_t i = 0; i < outputs.size(); i++) { + auto& out = outputs[i]; + OPENVINO_ASSERT(out_tensor_to_offset_map.count(i) > 0, "[GPU] set_dynamic_shape_offsets expects all output tensors have mapping to the offset"); + size_t offset = out_tensor_to_offset_map.at(i); + out.SetDynamicShapeOffset(offset); + } + } + protected: explicit base_params(KernelType kt) : Params(kt, ""), inputs(1), outputs(1) {} }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_ref.cpp new file mode 100644 index 00000000000000..280687e577e03c --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_ref.cpp @@ -0,0 +1,112 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "beam_table_update_kernel_ref.hpp" + +#include "kernel_selector_params.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + +void BeamTableUpdateKernelRef::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [](const Params& params, KernelData& kd) { + const auto& prim_params = dynamic_cast(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + kd.kernels[0].params.workGroups.local = dispatchData.lws; + kd.kernels[0].skip_execution = false; + ScalarDescriptor is_state_set; + + is_state_set.t = ScalarDescriptor::Types::UINT8; + is_state_set.v.u8 = prim_params.is_state_set ? 
1 : 0; + kd.kernels[0].params.scalars.resize(1); + kd.kernels[0].params.scalars[0] = is_state_set; + }; +} + +KernelsData BeamTableUpdateKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { + if (!Validate(params, options)) { + return {}; + } + + auto kernel_data = KernelData::Default(params); + const auto& kernel_params = dynamic_cast(*kernel_data.params); + const auto dispatch_data = SetDefault(kernel_params); + const auto entry_point = GetEntryPoint(kernelName, kernel_params.layerID, params, options); + const auto jit_constants = GetJitConstants(kernel_params); + const auto jit = CreateJit(kernelName, jit_constants, entry_point); + auto& kernel = kernel_data.kernels.front(); + + GetUpdateDispatchDataFunc(kernel_data); + + FillCLKernelData(kernel, + dispatch_data, + params.engineInfo, + kernelName, + jit, + entry_point, + {}, + false, + false, + static_cast(kernel_params.inputs.size()), + GetFusedPrimitiveInputsCount(kernel_params), + static_cast(kernel_params.outputs.size()), + kernel_params.outputs[0].is_dynamic()); + + ScalarDescriptor is_state_set; + is_state_set.t = ScalarDescriptor::Types::UINT8; + is_state_set.v.u8 = 0; + kernel.params.scalars.push_back(is_state_set); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0}); + + return {kernel_data}; +} + +ParamsKey BeamTableUpdateKernelRef::GetSupportedKey() const { + ParamsKey key; + key.EnableInputDataType(Datatype::INT32); + key.EnableOutputDataType(Datatype::INT32); + key.EnableInputLayout(DataLayout::bfyx); + key.EnableOutputLayout(DataLayout::bfyx); + key.EnableTensorOffset(); + key.EnableTensorPitches(); + key.EnableBatching(); + key.EnableDynamicShapesSupport(); + return key; +} + +bool BeamTableUpdateKernelRef::Validate(const Params& params, const optional_params& options) const { + if (params.GetType() != KernelType::BEAM_TABLE_UPDATE || options.GetType() != KernelType::BEAM_TABLE_UPDATE) { + return false; + } + + const auto& kernel_params = dynamic_cast(params); + if (kernel_params.inputs.size() != 2) { + return false; + } + if (kernel_params.outputs.size() != 1) { + return false; + } + + return true; +} + +JitConstants BeamTableUpdateKernelRef::GetJitConstants(const beam_table_update_params& kernel_params) const { + return MakeBaseParamsJitConstants(kernel_params); +} + +CommonDispatchData BeamTableUpdateKernelRef::SetDefault(const beam_table_update_params& kernel_params) { + CommonDispatchData dispatch_data; + + auto output = kernel_params.outputs[0]; + if (!output.is_dynamic()) { + dispatch_data.gws = {output.Batch().v, Align(output.LogicalSize() / output.Batch().v, 16), 1}; + dispatch_data.lws = {1, 16, 1}; + } + + return dispatch_data; +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_ref.hpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_ref.hpp new file mode 100644 index 00000000000000..bb5ea9d6b66eb3 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_ref.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" + +namespace kernel_selector { + +struct beam_table_update_params : base_params { + beam_table_update_params() : base_params(KernelType::BEAM_TABLE_UPDATE) {} + bool is_state_set = false; +}; + +struct beam_table_update_optional_params : optional_params { 
+ beam_table_update_optional_params() : optional_params(KernelType::BEAM_TABLE_UPDATE) {} +}; + +class BeamTableUpdateKernelRef : public KernelBaseOpenCL { +public: + BeamTableUpdateKernelRef() : KernelBaseOpenCL{"beam_table_update_ref"} {} + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params& params, const optional_params& options) const override; + JitConstants GetJitConstants(const beam_table_update_params& kernel_params) const; + static CommonDispatchData SetDefault(const beam_table_update_params& kernel_params); + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_selector.cpp new file mode 100644 index 00000000000000..81945de9c60e93 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_selector.cpp @@ -0,0 +1,24 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "beam_table_update_kernel_selector.hpp" + +#include "beam_table_update_kernel_ref.hpp" + +namespace kernel_selector { + +beam_table_update_kernel_selector::beam_table_update_kernel_selector() { + Attach(); +} + +KernelsData beam_table_update_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { + return GetNaiveBestKernel(params, options, KernelType::BEAM_TABLE_UPDATE); +} + +beam_table_update_kernel_selector& beam_table_update_kernel_selector::Instance() { + static beam_table_update_kernel_selector instance; + return instance; +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_selector.hpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_selector.hpp new file mode 100644 index 00000000000000..5a11603f7ff589 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/beam_table_update/beam_table_update_kernel_selector.hpp @@ -0,0 +1,18 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { + +class beam_table_update_kernel_selector : public kernel_selector_base { +public: + beam_table_update_kernel_selector(); + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + static beam_table_update_kernel_selector& Instance(); +}; + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp index eb61f0156fb4e8..3d534b82d2dc52 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp @@ -178,6 +178,9 @@ JitConstants GemmKernelBase::GetJitConstants(const gemm_params& params) const { MakeJitConstant("TRANSPOSE_INPUT0", params.transpose_input0), MakeJitConstant("TRANSPOSE_INPUT1", params.transpose_input1), MakeJitConstant("QUANTIZATION_TERM", params.quantization != QuantizationType::NONE), + MakeJitConstant("INDIRECT_INPUT0", params.indirect_input0), + 
MakeJitConstant("INDIRECT_INPUT1", params.indirect_input1), + MakeJitConstant("BEAM_TABLE_TERM", params.indirect_input0 || params.indirect_input1), }); auto get_output_size = [this](const std::vector& output_order_idx, const int target_idx) { @@ -200,6 +203,13 @@ JitConstants GemmKernelBase::GetJitConstants(const gemm_params& params) const { return ""; } }; + if (params.indirect_input0 || params.indirect_input1) { + jit.AddConstant(MakeJitConstant("BEAM_TABLE", params.inputs[params.inputs.size() - 1])); + } + + if (params.inputs.size() == 4 || (!params.indirect_input0 && !params.indirect_input1 && params.inputs.size() == 3)) { + jit.AddConstant(MakeJitConstant("BIAS_TERM", 1)); + } jit.AddConstants({ MakeJitConstant("TRANSPOSE_X_LAST", 0), diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h index aa0aef24e3e982..bffdd9a12c1ac8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.h @@ -22,11 +22,17 @@ struct gemm_params : public base_params { std::vector input0_order; std::vector input1_order; std::vector output_order; + DataTensor beam_table; + bool indirect_input0 = false; + bool indirect_input1 = false; QuantizationType quantization = QuantizationType::NONE; ParamsKey GetParamsKey() const override { ParamsKey k = base_params::GetParamsKey(); k.EnableQuantization(quantization); + + if (indirect_input0 || indirect_input1) + k.EnableIndirectGemm(); return k; } }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.cpp index 2c849033c19f45..4f1e3660f9575e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_ref.cpp @@ -28,6 +28,7 @@ ParamsKey GemmKernelRef::GetSupportedKey() const { k.EnableTensorOffset(); k.EnableQuantization(QuantizationType::SYMMETRIC); k.EnableDynamicShapesSupport(); + k.EnableIndirectGemm(); return k; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp index 4044f1400b2a59..2004b42b6be16d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp @@ -12,6 +12,7 @@ ParamsKey GemmKernelTiledOpt::GetSupportedKey() const { k.EnableInputDataType(Datatype::F16); k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT32); k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::INT8); @@ -28,6 +29,7 @@ ParamsKey GemmKernelTiledOpt::GetSupportedKey() const { k.EnableBatching(); k.EnableDifferentTypes(); k.EnableDynamicShapesSupport(); + k.EnableIndirectGemm(); return k; } @@ -314,7 +316,8 @@ bool GemmKernelTiledOpt::Validate(const Params& params, const optional_params& o if (gmm_params.outputs[0].PitchesDifferFromLogicalDims()) return false; - for (size_t input_idx = 0; input_idx < gmm_params.inputs.size(); ++input_idx) { + size_t num_inputs = (gmm_params.indirect_input0 || gmm_params.indirect_input1) ? 
gmm_params.inputs.size() - 1 : gmm_params.inputs.size(); + for (size_t input_idx = 0; input_idx < num_inputs; ++input_idx) { auto& input = gmm_params.inputs[input_idx]; if (!Tensor::SimpleLayout(input.GetLayout())) { return false; @@ -337,7 +340,7 @@ bool GemmKernelTiledOpt::Validate(const Params& params, const optional_params& o if (gmm_params.has_dynamic_inputs() && !gmm_params.is_shape_agnostic) return false; - for (size_t i = 1; i < gmm_params.inputs.size(); i++) + for (size_t i = 1; i < num_inputs; i++) if (gmm_params.inputs[0].GetDType() != gmm_params.inputs[i].GetDType()) return false; diff --git a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp new file mode 100644 index 00000000000000..7574b664b6b4b7 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp @@ -0,0 +1,156 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/shape.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/runtime/make_tensor.hpp" +#include "openvino/runtime/tensor.hpp" +#include "intel_gpu/plugin/variable_state.hpp" +#include "intel_gpu/plugin/remote_context.hpp" +#include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/plugin/remote_tensor.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" +#include "intel_gpu/runtime/memory.hpp" +#include "intel_gpu/runtime/memory_caps.hpp" +#include "intel_gpu/runtime/layout.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" + +#include + +namespace ov { +namespace intel_gpu { + +MultiTensorState::MultiTensorState(const std::vector& infos, + std::shared_ptr context, + ShapePredictor::Ptr shape_predictor) : ov::intel_gpu::VariableStateBase(infos[0].m_id, context) { + for (auto& info : infos) { + m_hidden_states.push_back(std::make_shared(info, context, shape_predictor)); + } +} + +VariableStateIndirectKVCache::VariableStateIndirectKVCache(const VariableStateInfo& info, + RemoteContextImpl::Ptr context, + std::shared_ptr shape_predictor, + size_t beam_axis, + size_t concat_axis) + : MultiTensorState { {info}, context, shape_predictor} + , m_beam_axis(beam_axis) + , m_concat_axis(concat_axis) { + cldnn::layout beam_table_layout(get_beam_table_shape(info.m_layout.get_partial_shape()), ov::element::i32, cldnn::format::bfyx); + VariableStateInfo beam_table_state_info(info.m_id + "/beam_table", beam_table_layout); + m_hidden_states.push_back(std::make_shared(beam_table_state_info, context, shape_predictor)); + OPENVINO_ASSERT(m_hidden_states.size() == 2, "[GPU] VariableStateIndirectKVCache expects 2 internal states to be initialized"); +} + +void VariableStateIndirectKVCache::reset() { + for (auto& state : m_hidden_states) { + state->reset(); + } + m_is_set = false; +} + +cldnn::memory::ptr VariableStateIndirectKVCache::get_memory() const { + return m_hidden_states[0]->get_memory(); +} + +const cldnn::layout& VariableStateIndirectKVCache::get_layout() const { + return m_hidden_states[0]->get_layout(); +} + +void VariableStateIndirectKVCache::set_state(const ov::SoPtr& state) { + OPENVINO_ASSERT(m_hidden_states.size() == 2, "[GPU] Corrupted VariableStateIndirectKVCache. Expected 2 internal states. 
Got: ", m_hidden_states.size()); + m_hidden_states[0]->set_state(state); // user can set only KV cache + + // Beam table is reset to cleanup rearranges history + cldnn::layout bt_layout(get_beam_table_shape(state->get_shape()), ov::element::i32, cldnn::format::bfyx); + m_hidden_states[1]->reset(); + m_hidden_states[1]->set_layout(bt_layout); +} + +template +void copy_element(const void* src, void* dst, size_t src_offset, size_t dst_offset) { + static_cast(dst)[dst_offset] = static_cast(src)[src_offset]; +} + +static void rearrange_cache(cldnn::memory::ptr kv_in_mem, cldnn::memory::ptr bt_mem, cldnn::memory::ptr kv_out_mem, cldnn::stream& stream, size_t concat_axis) { + auto kv_layout = kv_in_mem->get_layout(); + auto kv_shape = kv_layout.get_shape(); + cldnn::mem_lock kv_in_ptr(kv_in_mem, stream); + cldnn::mem_lock bt_in_ptr(bt_mem, stream); + cldnn::mem_lock kv_out_ptr(kv_out_mem, stream); + + OPENVINO_ASSERT(kv_shape.size() == 4); + + for (size_t b = 0; b < kv_shape[0]; b++) { + for (size_t f = 0; f < kv_shape[1]; f++) { + for (size_t y = 0; y < kv_shape[2]; y++) { + for (size_t x = 0; x < kv_shape[3]; x++) { + size_t b_kv = bt_in_ptr[b* kv_shape[concat_axis] + y]; + + auto in_idx = std::vector{static_cast(b_kv), static_cast(f), static_cast(y), static_cast(x)}; + auto out_idx = std::vector{static_cast(b), static_cast(f), static_cast(y), static_cast(x)}; + + cldnn::tensor in(cldnn::format::bfyx, in_idx, 0); + cldnn::tensor out(cldnn::format::bfyx, out_idx, 0); + + size_t out_offset = kv_out_mem->get_layout().get_linear_offset(out); + size_t in_offset = kv_layout.get_linear_offset(in); + + if (ov::element::Type(kv_layout.data_type).size() == 2) + copy_element(kv_in_ptr.data(), kv_out_ptr.data(), in_offset, out_offset); + else if (ov::element::Type(kv_layout.data_type).size() == 2) + copy_element(kv_in_ptr.data(), kv_out_ptr.data(), in_offset, out_offset); + } + } + } + } +} + +ov::SoPtr VariableStateIndirectKVCache::get_state() const { + auto kv_layout = m_hidden_states[0]->get_layout(); + auto bt_mem = m_hidden_states[1]->get_memory(); + if (kv_layout.get_partial_shape()[m_beam_axis].get_length() > 1 && bt_mem) { + auto kv_mem = m_hidden_states[0]->get_memory(); + auto tensor = m_context->create_host_tensor(m_hidden_states[0]->get_user_specified_type(), kv_layout.get_shape()); + + auto& engine = m_context->get_engine(); + auto tmp_mem = engine.allocate_memory(kv_layout, engine.get_lockable_preferred_memory_allocation_type(), false); + + rearrange_cache(kv_mem, bt_mem, tmp_mem, m_context->get_engine().get_service_stream(), m_concat_axis); + + convert_and_copy(tmp_mem, tensor._ptr.get(), m_context->get_engine().get_service_stream()); + + return tensor; + } else { + return m_hidden_states[0]->get_state(); + } +} + +void VariableStateIndirectKVCache::set_memory(const cldnn::memory::ptr& new_mem, const cldnn::layout& actual_layout) { + m_hidden_states[0]->set_memory(new_mem, actual_layout); +} + +void VariableStateIndirectKVCache::set_layout(const cldnn::layout& new_layout) { + m_hidden_states[0]->set_layout(new_layout); +} + +size_t VariableStateIndirectKVCache::get_actual_mem_size() const { + return m_hidden_states[0]->get_actual_mem_size(); +} + +ov::PartialShape VariableStateIndirectKVCache::get_beam_table_shape(const ov::PartialShape& kv_cache_shape) { + auto rank = kv_cache_shape.size(); + ov::PartialShape beam_table_shape(std::vector(rank, 1)); + beam_table_shape[m_beam_axis] = kv_cache_shape[m_beam_axis]; + beam_table_shape[m_concat_axis] = kv_cache_shape[m_concat_axis]; + return 
beam_table_shape; +} + +VariableState::Ptr VariableStateIndirectKVCache::get_beam_table_state() const { + return m_hidden_states[1]; +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp index 884176b173b198..35f8e6b520ab99 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp @@ -6,6 +6,7 @@ #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/primitives/kv_cache.hpp" +#include "openvino/core/validation_util.hpp" namespace ov { namespace op { @@ -23,11 +24,17 @@ namespace { void CreateKVCacheOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {2, 3}); auto inputs = p.GetInputInfo(op); - const auto prim = cldnn::kv_cache(layer_type_name_ID(op), + int64_t rank = op->get_input_partial_shape(0).size(); + auto prim = cldnn::kv_cache(layer_type_name_ID(op), inputs, op->get_variable()->get_info(), - op->get_concat_axis(), - op->get_gather_axis()); + ov::util::normalize(op->get_concat_axis(), rank), + ov::util::normalize(op->get_gather_axis(), rank), + op->get_indirect()); + + prim.num_outputs = op->get_output_size(); + prim.output_data_types = get_output_data_types(op); + prim.output_paddings = get_output_paddings(op); p.add_primitive(*op, prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp b/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp index 2cea239ae4d9c8..1225ebe51b4027 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp @@ -11,6 +11,7 @@ #include "openvino/op/constant.hpp" #include "openvino/op/fake_quantize.hpp" #include "intel_gpu/op/gemm.hpp" +#include "intel_gpu/op/indirect_gemm.hpp" #include "intel_gpu/primitives/gemm.hpp" #include "intel_gpu/primitives/fully_connected.hpp" @@ -22,6 +23,7 @@ namespace ov { namespace op { namespace internal { using Gemm = ov::intel_gpu::op::Gemm; +using IndirectGemm = ov::intel_gpu::op::IndirectGemm; } // namespace internal } // namespace op } // namespace ov @@ -186,8 +188,32 @@ static void CreateGemmOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {3}); + auto inputs = p.GetInputInfo(op); + std::string layer_name = layer_type_name_ID(op); + + auto alpha = 1.0f; + auto beta = 0.0f; + + auto gemmPrim = cldnn::gemm(layer_name, + std::vector{ inputs[0], inputs[1] }, + inputs[2], + cldnn::element_type_to_data_type(op->get_output_element_type(0)), + op->get_input0_order(), + op->get_input1_order(), + op->get_output_order(), + op->get_indirect_a(), + op->get_indirect_b(), + alpha, + beta); + + p.add_primitive(*op, gemmPrim); +} + REGISTER_FACTORY_IMPL(v0, MatMul); REGISTER_FACTORY_IMPL(internal, Gemm); +REGISTER_FACTORY_IMPL(internal, IndirectGemm); } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index dc4048b12d4a7d..e931746d6d92f9 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -104,7 +104,6 @@ std::shared_ptr Plugin::clone_and_transform_model(const std::shared_p GPU_DEBUG_IF(!debug_config->dump_graphs.empty()) { auto path_base = debug_config->dump_graphs + "/" + cloned_model->get_name(); - ov::pass::Serialize(path_base + ".xml", path_base + ".bin").run_on_model(cloned_model); ov::pass::VisualizeTree(path_base + 
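The beam-table state created above is deliberately tiny: its shape keeps only the beam (batch) axis and the concat (sequence) axis of the KV-cache shape and collapses everything else to 1. A stand-alone sketch of that derivation with illustrative axis choices; the [batch, heads, seq, head_size] layout is only an example, not something the code fixes:

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> beam_table_shape(const std::vector<int64_t>& kv_shape,
                                      size_t beam_axis, size_t concat_axis) {
    std::vector<int64_t> shape(kv_shape.size(), 1);   // all dims collapse to 1 ...
    shape[beam_axis] = kv_shape[beam_axis];           // ... except the beam axis
    shape[concat_axis] = kv_shape[concat_axis];       // ... and the sequence axis
    return shape;
}

int main() {
    // KV cache as [batch, num_heads, seq_len, head_size], beam axis 0, concat axis 2.
    auto bt = beam_table_shape({4, 32, 17, 64}, 0, 2);
    for (auto d : bt) std::cout << d << ' ';  // prints: 4 1 17 1
    std::cout << '\n';
}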
".svg").run_on_model(cloned_model); } @@ -125,7 +124,6 @@ std::shared_ptr Plugin::clone_and_transform_model(const std::shared_p GPU_DEBUG_IF(!debug_config->dump_graphs.empty()) { auto path_base = debug_config->dump_graphs + "/" + cloned_model->get_name() + "_" + "transformed_func"; - ov::pass::Serialize(path_base + ".xml", path_base + ".bin").run_on_model(cloned_model); ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); } return cloned_model; diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index ff28f12bb8c5d0..51861988f1ff23 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -2,20 +2,23 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "intel_gpu/plugin/usm_host_tensor.hpp" -#include "intel_gpu/runtime/memory.hpp" -#include "intel_gpu/runtime/memory_caps.hpp" #include "openvino/runtime/make_tensor.hpp" #include "openvino/core/preprocess/input_tensor_info.hpp" #include "openvino/core/parallel.hpp" +#include "openvino/core/validation_util.hpp" #include "openvino/op/util/op_types.hpp" #include "transformations/utils/utils.hpp" +#include "intel_gpu/primitives/kv_cache.hpp" +#include "intel_gpu/plugin/usm_host_tensor.hpp" #include "intel_gpu/plugin/sync_infer_request.hpp" #include "intel_gpu/plugin/remote_context.hpp" #include "intel_gpu/plugin/remote_tensor.hpp" #include "intel_gpu/plugin/compiled_model.hpp" #include "intel_gpu/plugin/variable_state.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" +#include "intel_gpu/runtime/memory.hpp" +#include "intel_gpu/runtime/memory_caps.hpp" #include "intel_gpu/runtime/internal_properties.hpp" #include "intel_gpu/runtime/itt.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" @@ -568,12 +571,28 @@ void SyncInferRequest::allocate_states() { const auto& network = m_graph->get_network(); const auto& variables_info = network->get_variables_info(); for (auto& vi : variables_info) { - auto variable = std::make_shared(vi.second, m_context, m_shape_predictor); - m_variables.emplace(vi.first, variable); + const auto& state_prims = vi.second.m_primitives; + bool indirect_kv_cache = false; + int64_t beam_axis = 0; + int64_t concat_axis = 0; + auto kv_cache_shape = vi.second.m_layout.get_partial_shape(); + for (auto& p : state_prims) { + if (auto kv_cache_prim = dynamic_cast(p)) { + indirect_kv_cache = kv_cache_prim->indirect; + beam_axis = ov::util::normalize(kv_cache_prim->gather_axis, kv_cache_shape.size()); + concat_axis = ov::util::normalize(kv_cache_prim->concat_axis, kv_cache_shape.size()); + } + } + + if (indirect_kv_cache) { + m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor, beam_axis, concat_axis)); + } else { + m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor)); + } } } -void SyncInferRequest::prepare_state(const std::string& name, const VariableState::Ptr variable) { +void SyncInferRequest::prepare_state(const std::string& name, const std::shared_ptr& variable) { m_graph->get_network()->set_variable(name, variable); } @@ -745,7 +764,7 @@ std::vector SyncInferRequest::prepare_input(const std::string } } - GPU_DEBUG_TRACE_DETAIL << name << " prepare input: " << memory->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << name << " prepare input: " << memory->buffer_ptr() << " alloc_type: " << memory->get_allocation_type() << std::endl; const cldnn::primitive_id 
internal_name = "parameter:" + name; network->set_input_data(internal_name, memory); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp new file mode 100644 index 00000000000000..2a0bac302956c2 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp @@ -0,0 +1,113 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "indirect_kv_cache.hpp" +#include + +#include "intel_gpu/op/gemm.hpp" +#include "intel_gpu/op/indirect_gemm.hpp" +#include "intel_gpu/op/kv_cache.hpp" +#include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/plugin/common_utils.hpp" + +#include "openvino/core/graph_util.hpp" +#include "openvino/core/node_vector.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/read_value.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "transformations/utils/utils.hpp" + +namespace { +// same impl as ov::replace node, but w/o outputs count check +void replace_node_unsafe(const std::shared_ptr& target, const std::shared_ptr& replacement) { + if (ov::op::util::is_output(target)) { + OPENVINO_THROW("Result nodes cannot be replaced."); + } + for (size_t i = 0; i < target->get_output_size(); i++) { + target->output(i).replace(replacement->output(0)); + } + + replacement->add_node_control_dependents(target); + replacement->add_node_control_dependencies(target); + target->clear_control_dependents(); +} + +} // namespace + +namespace ov { +namespace intel_gpu { + +IndirectKVCache::IndirectKVCache() { + using namespace ov::pass::pattern; + + auto beam_idx = wrap_type(); + auto gather_input = wrap_type(); + auto axis_const = wrap_type( + ov::op::util::constant_predicate([](const std::vector& value) -> bool { + return value.size() == 1 && value[0] == 0; + })); + auto gather_past = wrap_type({gather_input, beam_idx, axis_const}); + auto kv_cache = wrap_type({gather_past, any_input()}); + auto matmul_0 = wrap_type({kv_cache, any_input()}); + auto matmul_1 = wrap_type({any_input(), kv_cache}); + auto matmul = std::make_shared(OutputVector{matmul_0, matmul_1}); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + if (transformation_callback(m.get_match_root())) { + return false; + } + const auto& pattern_map = m.get_pattern_value_map(); + + auto kv_cache_node = std::dynamic_pointer_cast(pattern_map.at(kv_cache).get_node_shared_ptr()); + + auto beam_idx_node = pattern_map.at(beam_idx).get_node_shared_ptr(); + auto gather_input_node = pattern_map.at(gather_input).get_node_shared_ptr(); + auto gather_node = std::dynamic_pointer_cast(pattern_map.at(gather_past).get_node_shared_ptr()); + ov::replace_node(gather_node, gather_input_node); + + auto indirect_kv_cache = std::make_shared(gather_input_node, + kv_cache_node->get_input_node_shared_ptr(1), + beam_idx_node, + kv_cache_node->get_variable(), + kv_cache_node->get_concat_axis(), + gather_node->get_axis(), + kv_cache_node->get_output_element_type(0)); + + indirect_kv_cache->set_friendly_name(kv_cache_node->get_friendly_name()); + ov::copy_runtime_info(kv_cache_node, indirect_kv_cache); + replace_node_unsafe(kv_cache_node, indirect_kv_cache); + + auto kv_cache_users = indirect_kv_cache->get_output_target_inputs(0); + auto matmul_kv_cache_index = kv_cache_users.begin()->get_index(); + + auto gemm_node = 
std::dynamic_pointer_cast(m.get_match_root()); + auto order_in0 = gemm_node->get_input0_order(); + auto order_in1 = gemm_node->get_input1_order(); + auto order_out = gemm_node->get_output_order(); + + auto indirect_gemm = std::make_shared(gemm_node->get_input_node_shared_ptr(0), + gemm_node->get_input_node_shared_ptr(1), + indirect_kv_cache->output(1), // beam table + matmul_kv_cache_index == 0, + matmul_kv_cache_index == 1, + order_in0, + order_in1, + order_out); + + indirect_gemm->set_friendly_name(gemm_node->get_friendly_name()); + ov::copy_runtime_info(gemm_node, indirect_gemm); + ov::replace_node(gemm_node, indirect_gemm); + + return true; + }; + + auto m = std::make_shared(matmul, "IndirectKVCache"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.hpp b/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.hpp new file mode 100644 index 00000000000000..afea5da6ceb13c --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +/// Merges Gather into KVCache op +/// ┌─────────────────────┐ ┌─────────────────────┐ +/// │ SomeOp │ │ SomeOp │ +/// | (state initializer) | | (state initializer) | +/// └─────┬───────────────┘ └─────┬───────────────┘ +/// | | +/// ┌─────┴─────┐ ┌────────────┐ ┌───────────┐ ┌─────┴─────┐ ┌────────────┐ ┌───────────┐ +/// │ ReadValue │ │ Parameter │ │ SomeOp │ │ ReadValue │ │ Parameter │ │ SomeOp │ +/// | (past_kv) | | beam_idx | | new_token | | (past_kv) | | beam_idx | | new_token | +/// └─────┬─────┘ └─────┬──────┘ └─────┬─────┘ └─────┬─────┘ └─────┬──────┘ └─────┬─────┘ +/// │ │ │ │ │ │ +/// │ │ │ | | ┌──────────┘ +/// ┌─────┴──────┐ | │ | | | +/// | Gather |───────────┘ │ | | | +/// └─────┬──────┘ │ | | | +/// | ┌──────────────────────────┘ | | | +/// | | | | | +/// │ | | | | +/// │ ┌─────┴───┐ ┌──────────┐ | ┌────┴──────┴───────┐ ┌──────────┐ +/// └───┤ KVCache |...............| Variable | => └────────────┤ KVCache |.......................| Variable | +/// └───┬─────┘ └──────────┘ └────┬──────┬───────┘ └──────────┘ +/// │ | | +/// | kv_cache | | beam_table +/// | | | +/// ┌────┴──────┐ ┌────┴──────┴───┐ +/// │ Gemm │ | IndirectGemm | +/// └───────────┘ └───────────────┘ +class IndirectKVCache : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("IndirectKVCache", "0"); + IndirectKVCache(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_fusion.cpp index 598bd2153c7667..b444252009fa9b 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_fusion.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_fusion.cpp @@ -81,25 +81,11 @@ KVCacheFusionMatcher::KVCacheFusionMatcher() { ov::replace_node(past_node, new_read_value_node); if (pattern_map.count(gather_past) > 0) { - // TODO: Enable code below once KVCache custom op supports rearrange internally - // For now Gather is kept as standalone op - #if 0 - auto gather_node = pattern_map.at(gather_past).get_node_shared_ptr(); - auto gather_axis = std::dynamic_pointer_cast(gather_node->get_input_node_shared_ptr(2))->cast_vector()[0]; - kv_cache_node = 
std::make_shared(new_read_value_node, - concat_node->get_input_node_shared_ptr(1), - gather_node->get_input_node_shared_ptr(1), - variable, - concat_axis, - gather_axis, - concat_node->get_output_element_type(0)); - #else kv_cache_node = std::make_shared(pattern_map.at(gather_past).get_node_shared_ptr(), concat_node->get_input_node_shared_ptr(1), variable, concat_axis, new_read_value_node->get_output_element_type(0)); - #endif } else { kv_cache_node = std::make_shared(new_read_value_node, concat_node->get_input_node_shared_ptr(1), diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_fusion.hpp index 4bb99bc7bdc4de..dbe147da8d46b7 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_fusion.hpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_fusion.hpp @@ -27,86 +27,49 @@ namespace intel_gpu { /// └─────────────┘ └───────────┘ └─────────┘ /// 2. With gather for beam search (or model which supports both greedy and beam search) -/// ┌───────────┐ ┌────────────┐ ┌───────────┐ ┌───────────┐ ┌────────────┐ -/// │ ReadValue │ │ Parameter │ │ SomeOp │ │ SomeOp │ │ Parameter │ -/// | (past_kv) | | beam_idx | | new_token | | new_token | | beam_idx | -/// └─────┬─────┘ └─────┬──────┘ └─────┬─────┘ └─────┬─────┘ └─────┬──────┘ -/// │ │ │ | | -/// ┌─────┴──────┐ | │ | | -/// | Gather |───────────┘ │ | | -/// └─────┬──────┘ │ | | -/// | │ | | -/// | │ | | -/// │ │ | | -/// │ ┌────────┐ │ | ┌────┴────┐ ┌──────────┐ -/// └───┤ Concat ├───────────────────────┘ => └────────────┤ KVCache |......| Variable | -/// └───┬────┘ └────┬────┘ └──────────┘ -/// │ | -/// ┌──────────┴────────────┐ | -/// ┌────┴────────┐ ┌────┴──────┐ ┌────┴────┐ -/// │ Assign │ │ SomeOp │ | SomeOp | -/// | (present_kv | | (SDPA) | | (SDPA) | -/// └─────────────┘ └───────────┘ └─────────┘ +/// ┌───────────┐ ┌────────────┐ ┌───────────┐ ┌───────────┐ ┌────────────┐ ┌───────────┐ +/// │ ReadValue │ │ Parameter │ │ SomeOp │ │ ReadValue │ │ Parameter │ │ SomeOp │ +/// | (past_kv) | | beam_idx | | new_token | | (past_kv) | | beam_idx | | new_token | +/// └─────┬─────┘ └─────┬──────┘ └─────┬─────┘ └─────┬─────┘ └─────┬──────┘ └─────┬─────┘ +/// │ │ │ │ │ │ +/// ┌─────┴──────┐ | │ ┌─────┴──────┐ | │ +/// | Gather |───────────┘ │ | Gather |───────────┘ │ +/// └─────┬──────┘ │ └─────┬──────┘ │ +/// | │ | ┌───────────┘ +/// | │ | | +/// │ │ | | +/// │ ┌────────┐ │ | ┌───────────┴───────┐ ┌──────────┐ +/// └───┤ Concat ├───────────────────────┘ => └────────────┤ KVCache |.......................| Variable | +/// └───┬────┘ └────┬──────────────┘ └──────────┘ +/// │ | +/// ┌──────────┴─────────────┐ | +/// ┌────┴─────────┐ ┌────┴──────┐ ┌────┴────┐ +/// │ Assign │ │ SomeOp │ | SomeOp | +/// | (present_kv) | | (SDPA) | | (SDPA) | +/// └──────────────┘ └───────────┘ └─────────┘ /// 3. 
Similar to case 2, but with variable initializer -/// ┌─────────────────────┐ ┌─────────────────────┐ -/// │ SomeOp │ │ SomeOp │ -/// | (state initializer) | | (state initializer) | -/// └─────┬───────────────┘ └─────┬───────────────┘ -/// | | -/// ┌─────┴─────┐ ┌────────────┐ ┌───────────┐ ┌───────────┐ ┌────────────┐ | -/// │ ReadValue │ │ Parameter │ │ SomeOp │ │ SomeOp │ │ Parameter │ | -/// | (past_kv) | | beam_idx | | new_token | | new_token | | beam_idx | | -/// └─────┬─────┘ └─────┬──────┘ └─────┬─────┘ └─────┬─────┘ └─────┬──────┘ | -/// │ │ │ | | ┌─────────┘ -/// ┌─────┴──────┐ | │ | | | -/// | Gather |───────────┘ │ | | | -/// └─────┬──────┘ │ | | | -/// | │ | | | -/// | │ | | | -/// │ │ | | | -/// │ ┌────────┐ │ | ┌────┴──────┴───────┐ ┌──────────┐ -/// └───┤ Concat ├───────────────────────┘ => └────────────┤ KVCache |.......................| Variable | -/// └───┬────┘ └────┬──────────────┘ └──────────┘ -/// │ | -/// ┌──────────┴────────────┐ | -/// ┌────┴────────┐ ┌────┴──────┐ ┌────┴────┐ -/// │ Assign │ │ SomeOp │ | SomeOp | -/// | (present_kv | | (SDPA) | | (SDPA) | -/// └─────────────┘ └───────────┘ └─────────┘ - -/// 4. Pattern may also include Convert for state variable -/// ┌─────────────────────┐ ┌─────────────────────┐ -/// │ SomeOp │ │ SomeOp │ -/// | (state initializer) | | (state initializer) | -/// └─────┬───────────────┘ └─────┬───────────────┘ -/// | | -/// ┌─────┴─────┐ ┌────────────┐ ┌───────────┐ ┌───────────┐ ┌────────────┐ | -/// │ ReadValue │ │ Parameter │ │ SomeOp │ │ SomeOp │ │ Parameter │ | -/// | (past_kv) | | beam_idx | | new_token | | new_token | | beam_idx | | -/// └─────┬─────┘ └─────┬──────┘ └─────┬─────┘ └─────┬─────┘ └─────┬──────┘ | -/// │ │ │ | | ┌─────────┘ -/// ┌─────┴────┐ │ │ | | | -/// │ Convert │ │ │ | | | -/// └─────┬────┘ │ │ | | | -/// │ │ │ | | | -/// │ │ │ | | | -/// ┌─────┴──────┐ | │ | | | -/// | Gather |───────────┘ │ | | | -/// └─────┬──────┘ │ | | | -/// | │ | | | -/// | │ | | | -/// │ │ | | | -/// │ ┌────────┐ │ | ┌────┴──────┴───────┐ ┌──────────┐ +/// ┌─────────────────────┐ ┌─────────────────────┐ +/// │ SomeOp │ │ SomeOp │ +/// | (state initializer) | | (state initializer) | +/// └─────┬───────────────┘ └─────┬───────────────┘ +/// | | +/// ┌─────┴─────┐ ┌────────────┐ ┌───────────┐ ┌─────┴─────┐ ┌────────────┐ ┌───────────┐ +/// │ ReadValue │ │ Parameter │ │ SomeOp │ │ ReadValue │ │ Parameter │ │ SomeOp │ +/// | (past_kv) | | beam_idx | | new_token | | (past_kv) | | beam_idx | | new_token | +/// └─────┬─────┘ └─────┬──────┘ └─────┬─────┘ └─────┬─────┘ └─────┬──────┘ └─────┬─────┘ +/// │ │ │ │ │ │ +/// ┌─────┴──────┐ | │ ┌─────┴──────┐ | │ +/// | Gather |───────────┘ │ | Gather |───────────┘ │ +/// └─────┬──────┘ │ └─────┬──────┘ │ +/// | │ | ┌───────────┘ +/// | │ | | +/// │ │ | | +/// │ ┌────────┐ │ | ┌───────────┴───────┐ ┌──────────┐ /// └───┤ Concat ├───────────────────────┘ => └────────────┤ KVCache |.......................| Variable | /// └───┬────┘ └────┬──────────────┘ └──────────┘ /// │ | /// ┌──────────┴────────────┐ | -/// | | | -/// ┌─────┴────┐ | | -/// │ Convert │ | | -/// └─────┬────┘ | | -/// | | | /// ┌────┴────────┐ ┌────┴──────┐ ┌────┴────┐ /// │ Assign │ │ SomeOp │ | SomeOp | /// | (present_kv | | (SDPA) | | (SDPA) | diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_gemm.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_gemm.cpp new file mode 100644 index 00000000000000..5e35d5cd1fc177 --- /dev/null +++ 
b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_gemm.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/op/indirect_gemm.hpp" +#include "openvino/core/partial_shape.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +IndirectGemm::IndirectGemm(const ov::Output& A, + const ov::Output& B, + const ov::Output& I, + bool indirect_a, + bool indirect_b, + const std::vector& order_a, + const std::vector& order_b, + const std::vector& order_c, + const ov::element::Type output_type) + : ov::intel_gpu::op::Gemm(A, B, order_a, order_b, order_c, output_type) + , m_indirect_a(indirect_a) + , m_indirect_b(indirect_b) { + set_argument(2, I); + OPENVINO_ASSERT((indirect_a && indirect_b) == false, "[GPU] Gemm supports indirect addressing for one input only"); + validate_and_infer_types(); +} + +std::shared_ptr IndirectGemm::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + m_indirect_a, + m_indirect_b, + m_order_a, + m_order_b, + m_order_c, + m_output_type); +} + +void IndirectGemm::validate_and_infer_types() { + const auto input_size = get_input_size(); + NODE_VALIDATION_CHECK(this, + input_size == 3, + "Number of inputs is incorrect. Current value is: ", + input_size, + ", expected 3."); + + auto out_shapes = shape_infer(this, std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}, m_order_a, m_order_b, m_order_c); + + auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; + set_output_type(0, output_type, out_shapes[0]); +} + +bool IndirectGemm::visit_attributes(ov::AttributeVisitor &visitor) { + Gemm::visit_attributes(visitor); + visitor.on_attribute("indirect_a", m_indirect_a); + visitor.on_attribute("indirect_b", m_indirect_b); + return true; +} + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp index 82f458c7d7fd79..46c227044a2ee5 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp @@ -5,6 +5,7 @@ #include "intel_gpu/op/kv_cache.hpp" #include "concat_shape_inference.hpp" #include "openvino/core/partial_shape.hpp" +#include "openvino/core/validation_util.hpp" #include "openvino/op/concat.hpp" namespace ov { @@ -18,8 +19,14 @@ KVCache::KVCache(const Output& past, int64_t concat_axis, int64_t gather_axis, const ov::element::Type output_type) - : Op({past, new_token_data, beam_idx}), m_concat_axis(concat_axis), m_gather_axis(gather_axis), m_output_type(output_type) { + : Op({past, new_token_data, beam_idx}) + , m_concat_axis(concat_axis) + , m_gather_axis(gather_axis) + , m_indirect(true) + , m_output_type(output_type) { m_variable = past_variable; + if (m_indirect) + set_output_size(2); validate_and_infer_types(); } @@ -28,7 +35,11 @@ KVCache::KVCache(const Output& past, const std::shared_ptr& past_variable, int64_t concat_axis, const ov::element::Type output_type) - : Op({past, new_token_data}), m_concat_axis(concat_axis), m_gather_axis(0), m_output_type(output_type) { + : Op({past, new_token_data}) + , m_concat_axis(concat_axis) + , m_gather_axis(0) + , m_indirect(false) + , m_output_type(output_type) { m_variable = past_variable; 
 validate_and_infer_types();
 }
@@ -36,15 +47,19 @@ KVCache::KVCache(const Output<Node>& past,
 bool KVCache::visit_attributes(ov::AttributeVisitor& visitor) {
     visitor.on_attribute("concat_axis", m_concat_axis);
     visitor.on_attribute("gather_axis", m_gather_axis);
+    visitor.on_attribute("indirect", m_indirect);
     visitor.on_attribute("output_type", m_output_type);
     return true;
 }
 
 void KVCache::validate_and_infer_types() {
     auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type;
-    const std::vector<ov::PartialShape> input_shapes = {get_input_partial_shape(0), get_input_partial_shape(1)};
-
-    set_output_type(0, output_type, shape_infer(this, input_shapes)[0]);
+    std::vector<ov::PartialShape> input_shapes = {m_variable->get_info().data_shape, get_input_partial_shape(1)};
+    auto shapes = shape_infer(this, input_shapes);
+    set_output_type(0, output_type, shapes[0]);
+    if (m_indirect) {
+        set_output_type(1, get_input_element_type(2), shapes[1]);
+    }
 }
 
 std::shared_ptr<Node> KVCache::clone_with_new_inputs(const ov::OutputVector& new_args) const {
@@ -70,7 +85,19 @@ std::shared_ptr<Node> KVCache::clone_with_new_inputs(const ov::OutputVector& new
 std::vector<ov::PartialShape> shape_infer(const KVCache* op, std::vector<ov::PartialShape> input_shapes) {
     ov::op::v0::Concat concat;
     concat.set_axis(op->get_concat_axis());
-    return ov::op::v0::shape_infer(&concat, input_shapes);
+    std::vector<ov::PartialShape> concat_shapes = {input_shapes[0], input_shapes[1]};
+    auto out_shapes = ov::op::v0::shape_infer(&concat, concat_shapes);
+
+    if (op->get_output_size() == 2) {
+        int64_t gather_axis = ov::util::normalize(op->get_gather_axis(), input_shapes[0].size());
+        int64_t concat_axis = ov::util::normalize(op->get_concat_axis(), input_shapes[0].size());
+        ov::PartialShape beam_table_shape(std::vector<ov::Dimension>(out_shapes[0].size(), 1));
+        beam_table_shape[gather_axis] = input_shapes[0][gather_axis];
+        beam_table_shape[concat_axis] = out_shapes[0][concat_axis];
+        out_shapes.push_back(beam_table_shape);
+    }
+
+    return out_shapes;
 }
 
 }  // namespace op
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.cpp
index db753d8251d6c9..f0bd80dab3bbfd 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/transpose_matmul_fusion.cpp
@@ -25,14 +25,6 @@ using ov::pass::pattern::op::Or;
 namespace ov {
 namespace intel_gpu {
 
-namespace {
-std::vector<int64_t> default_order(size_t rank) {
-    std::vector<int64_t> order(rank);
-    std::iota(order.begin(), order.end(), 0);
-    return order;
-}
-}  // namespace
-
 class TransposeMatMulMatcher : public ov::pass::MatcherPass {
 public:
     OPENVINO_RTTI("TransposeMatMulMatcher", "0");
@@ -96,9 +88,9 @@ TransposeMatMulMatcher::TransposeMatMulMatcher() {
         return false;
     }
 
-    auto order_a = default_order(matmul->get_input_partial_shape(0).size());
-    auto order_b = default_order(matmul->get_input_partial_shape(1).size());
-    auto order_c = default_order(matmul->get_output_partial_shape(0).size());
+    auto order_a = op::Gemm::default_order(matmul->get_input_partial_shape(0).size());
+    auto order_b = op::Gemm::default_order(matmul->get_input_partial_shape(1).size());
+    auto order_c = op::Gemm::default_order(matmul->get_output_partial_shape(0).size());
     size_t input_a_output_idx = matmul->get_input_source_output(0).get_index();
     size_t input_b_output_idx = matmul->get_input_source_output(1).get_index();
@@ -178,8 +170,8 @@ TransposeMatMulTransposeMatcher::TransposeMatMulTransposeMatcher() {
     }
     auto
tranpose_c_order = std::dynamic_pointer_cast(pattern_map.at(transpose_c_order_m).get_node_shared_ptr()); - auto order_a = default_order(matmul->get_input_partial_shape(0).size()); - auto order_b = default_order(matmul->get_input_partial_shape(1).size()); + auto order_a = op::Gemm::default_order(matmul->get_input_partial_shape(0).size()); + auto order_b = op::Gemm::default_order(matmul->get_input_partial_shape(1).size()); auto order_c = tranpose_c_order->cast_vector(); size_t input_a_output_idx = matmul->get_input_source_output(0).get_index(); size_t input_b_output_idx = matmul->get_input_source_output(1).get_index(); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 0c6f174dc69832..6e849f87fe6cad 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -59,6 +59,7 @@ #include "plugin/transformations/rms_fusion.hpp" #include "plugin/transformations/swiglu_fusion.hpp" #include "plugin/transformations/transpose_matmul_fusion.hpp" +#include "plugin/transformations/indirect_kv_cache.hpp" #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp" #include "transformations/common_optimizations/broadcast_transition.hpp" #include "transformations/common_optimizations/common_optimizations.hpp" @@ -709,6 +710,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); + manager.register_pass(); // This is supposed to be the last pass to ensure that we don't have name collisions until // GPU plugin stops using friendly names for program creation manager.register_pass(true); diff --git a/src/plugins/intel_gpu/src/plugin/variable_state.cpp b/src/plugins/intel_gpu/src/plugin/variable_state.cpp index e2728eb1b0dea0..8bd19f06cbd820 100644 --- a/src/plugins/intel_gpu/src/plugin/variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/variable_state.cpp @@ -18,10 +18,9 @@ namespace ov { namespace intel_gpu { VariableState::VariableState(const VariableStateInfo& info, RemoteContextImpl::Ptr context, std::shared_ptr shape_predictor) - : ov::IVariableState {info.m_id} + : VariableStateBase{info.m_id, context} , m_layout(info.m_layout) , m_user_specified_type(info.m_user_specified_type) - , m_context(context) , m_shape_predictor(shape_predictor) , m_initial_layout(info.m_layout) { update_device_buffer(); @@ -40,13 +39,6 @@ const cldnn::layout& VariableState::get_layout() const { return m_layout; } -bool VariableState::is_set() const { - return m_is_set; -} -void VariableState::set() { - m_is_set = true; -} - void VariableState::set_memory(const cldnn::memory::ptr& new_mem, const cldnn::layout& actual_layout) { GPU_DEBUG_TRACE_DETAIL << m_name << " : Update memory (Ptr : " << new_mem->buffer_ptr() << ", layout : " << actual_layout.to_short_string() << ")" << std::endl; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp index e89201de456218..9f60bcce497911 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp @@ -265,7 +265,7 @@ void ocl_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args auto& kern = ocl_kernel.get_handle(); try { - GPU_DEBUG_TRACE_DETAIL << "Set arguments for primitive: " << args_desc.layerID << " (" << kern.get() << ")\n"; + GPU_DEBUG_TRACE_DETAIL << "Set arguments for primitive: 
" << args_desc.layerID << " (" << kernel.get_id() << " = " << kern.get() << ")\n"; set_arguments_impl(kern, args_desc.arguments, args); } catch (cl::Error const& err) { OPENVINO_THROW(OCL_ERR_MSG_FMT(err)); diff --git a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt index b2e745081a32c1..bb1e0cab52bac1 100644 --- a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt @@ -21,6 +21,7 @@ file(GLOB_RECURSE SOURCES_MAIN "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/transformations/*.hpp" "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/transformations/*.cpp" "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/variable_state.cpp" + "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp" "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/remote_context.cpp" "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp" "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/usm_host_tensor.cpp" diff --git a/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp b/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp index 41178a311c0ee9..f1473170630798 100644 --- a/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp +++ b/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp @@ -206,7 +206,7 @@ TEST(stateful_model, check_dynamic_pad_for_kv_cache) { ov::Shape{}, // output shape 0, // batch_dim true), // support_neg_ind - kv_cache("concat", {input_info("gather"), input_info("present")}, info, 0, 0), + kv_cache("concat", {input_info("gather"), input_info("present")}, info, 0, 0, false), reorder("reorder", input_info("concat"), format::bfyx, data_types::f32)); /*output padding*/ ExecutionConfig config = get_test_default_config(engine); diff --git a/src/plugins/intel_gpu/tests/unit/shape_infer/kv_cache_si_test.cpp b/src/plugins/intel_gpu/tests/unit/shape_infer/kv_cache_si_test.cpp index ef34f278d1a4d6..bb4985a01cdf7f 100644 --- a/src/plugins/intel_gpu/tests/unit/shape_infer/kv_cache_si_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/shape_infer/kv_cache_si_test.cpp @@ -24,6 +24,7 @@ struct kv_cache_test_params { std::vector input_layouts; int64_t concat_axis; int64_t gather_axis; + bool indirect; layout expected_layout; }; @@ -46,7 +47,7 @@ TEST_P(kv_cache_test, shape_infer) { ov::op::util::VariableInfo info{p.input_layouts[0].get_partial_shape(), p.input_layouts[0].data_type, "v0"}; - auto kv_cache_prim = std::make_shared("output", input_prims_ids, info, p.concat_axis, p.gather_axis); + auto kv_cache_prim = std::make_shared("output", input_prims_ids, info, p.concat_axis, p.gather_axis, p.indirect); auto& kv_cache_node = prog.get_or_create(kv_cache_prim); for (size_t i = 0; i < p.input_layouts.size(); i++) { auto& input_node = prog.get_or_create(input_prims[i]); @@ -69,6 +70,7 @@ INSTANTIATE_TEST_SUITE_P(smoke, kv_cache_test, }, 2, 0, + false, layout{ov::PartialShape{-1, 2, -1, 4}, data_types::f32, format::bfyx} }, { @@ -78,6 +80,7 @@ INSTANTIATE_TEST_SUITE_P(smoke, kv_cache_test, }, 2, 0, + false, layout{ov::PartialShape{1, 2, 10, 4}, data_types::f16, format::bfyx} }, })); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index fa56c83db19414..f9b48d85c6c083 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ 
b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -692,6 +692,151 @@ class gemm_gpu_tests: public ::testing::Test { } } + void test_transpose_indirect(bool is_caching_test, bool indirect_input0 = false, bool indirect_input1 = false) { + tests::random_generator rg; + rg.set_seed(GET_SUITE_NAME); + + const unsigned long BATCH_SIZE = 19; + const unsigned long M_SIZE = 37; + const unsigned long K_SIZE = 23; + const unsigned long N_SIZE = 29; + + auto fill_mem = [&](cldnn::memory_ptr mem, std::vector& data) { + cldnn::mem_lock mem_ptr(mem, get_test_stream()); + auto&& l = mem->get_layout(); + auto data_idx = 0; + for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) { + for (cldnn::tensor::value_type f = 0; f < l.feature(); ++f) { + for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) { + for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) { + auto tensor_coord = cldnn::tensor{{b, f, x, y}, 0}; + auto buffer_idx = l.get_linear_offset(tensor_coord); + mem_ptr[buffer_idx] = data[data_idx++]; + } + } + } + } + }; + + auto& engine = get_test_engine(); + ov::Shape beam_table_shape; + + ov::Shape input0_shape = { BATCH_SIZE, K_SIZE, 1, M_SIZE }; + ov::Shape input1_shape = { BATCH_SIZE, 1, N_SIZE, K_SIZE }; + std::vector input0_order = { 0, 2, 3, 1 }; + std::vector input1_order = { 0, 1, 3, 2 }; + if (indirect_input0) + beam_table_shape = { BATCH_SIZE, K_SIZE, 1, 1 }; + else if (indirect_input1) + beam_table_shape = { BATCH_SIZE, 1, 1, K_SIZE }; + + cldnn::layout input0_layout = layout{ov::PartialShape::dynamic(input0_shape.size()), data_types::f32, format::bfyx}; + cldnn::layout input1_layout = layout{ov::PartialShape::dynamic(input1_shape.size()), data_types::f32, format::bfyx}; + + auto beam_table_layout = layout{ov::PartialShape::dynamic(beam_table_shape.size()), data_types::i32, format::bfyx}; + auto input0_mem = engine.allocate_memory(layout{ov::PartialShape(input0_shape), data_types::f32, format::bfyx}); + auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(input1_shape), data_types::f32, format::bfyx}); + auto beam_table_mem = engine.allocate_memory(layout{ov::PartialShape(beam_table_shape), data_types::i32, format::bfyx}); + + auto input_0_data = rg.generate_random_1d(ov::shape_size(input0_shape), -2, 2); + auto input_1_data = rg.generate_random_1d(ov::shape_size(input1_shape), -2, 2); + auto beam_table_data = rg.generate_random_1d(ov::shape_size(beam_table_shape), 0, BATCH_SIZE - 1, 1); + + fill_mem(input0_mem, input_0_data); + fill_mem(input1_mem, input_1_data); + set_values(beam_table_mem, beam_table_data); + + topology topology; + topology.add(input_layout("input0", input0_layout), + input_layout("input1", input1_layout), + input_layout("beam_table", beam_table_layout), + gemm("gemm", { input_info("input0"), input_info("input1") }, input_info("beam_table"), data_types::f32, input0_order, input1_order, {}, indirect_input0, indirect_input1) + ); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + network->set_input_data("input0", input0_mem); + network->set_input_data("input1", input1_mem); + network->set_input_data("beam_table", beam_table_mem); + + auto inst = network->get_primitive("gemm"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + + auto outputs = network->execute(); + + auto 
output_mem = outputs.at("gemm").get_memory(); + cldnn::mem_lock output_ptr(output_mem, get_test_stream()); + + ov::Shape ref_input0_shape = { BATCH_SIZE, 1, M_SIZE, K_SIZE }; + ov::Shape ref_input1_shape = { BATCH_SIZE, 1, K_SIZE, N_SIZE }; + ov::Shape ref_output_shape = { BATCH_SIZE, 1, M_SIZE, N_SIZE }; + + std::vector ref_out_data; + ref_out_data.resize(ov::shape_size(ref_output_shape)); + + std::vector ref_input_0_data(input_0_data.size()); + std::vector ref_input_1_data(input_1_data.size()); + + ov::reference::transpose((const char *)(input_0_data.data()), + (char *)(ref_input_0_data.data()), + input0_shape, + sizeof(float), + input0_order, + ref_input0_shape); + + ov::reference::transpose((const char *)(input_1_data.data()), + (char *)(ref_input_1_data.data()), + input1_shape, + sizeof(float), + input1_order, + ref_input1_shape); + + if (indirect_input0) { + std::vector ref_input_0_data_tmp = ref_input_0_data; + const size_t b_pitch = M_SIZE * K_SIZE; + for (size_t b = 0; b < BATCH_SIZE; b++) { + for (size_t m = 0; m < M_SIZE; m++) { + for (size_t k = 0; k < K_SIZE; k++) { + const size_t b_new = beam_table_data[b * K_SIZE + k]; + ref_input_0_data[b * b_pitch + m * K_SIZE + k] = ref_input_0_data_tmp[b_new * b_pitch + m * K_SIZE + k]; + } + } + } + } + + if (indirect_input1) { + std::vector ref_input_1_data_tmp = ref_input_1_data; + const size_t b_pitch = N_SIZE * K_SIZE; + for (size_t b = 0; b < BATCH_SIZE; b++) { + for (size_t k = 0; k < K_SIZE; k++) { + for (size_t n = 0; n < N_SIZE; n++) { + const size_t b_new = beam_table_data[b * K_SIZE + k]; + ref_input_1_data[b * b_pitch + k * N_SIZE + n] = ref_input_1_data_tmp[b_new * b_pitch + k * N_SIZE + n]; + } + } + } + } + + ov::reference::matmul(ref_input_0_data.data(), + ref_input_1_data.data(), + ref_out_data.data(), + ref_input0_shape, + ref_input1_shape, + ref_output_shape, + false, + false); + + ASSERT_EQ(output_ptr.size(), ref_out_data.size()); + + const auto abs_error = 0.0001; + for (uint32_t i = 0; i < ref_out_data.size(); ++i) { + ASSERT_NEAR(output_ptr[i], ref_out_data[i], abs_error) << "at " << i; + } + } + void test_transpose_matmul(size_t num_dims, bool is_input_dynamic, bool is_caching_test) { tests::random_generator rg; rg.set_seed(GET_SUITE_NAME); @@ -723,6 +868,7 @@ class gemm_gpu_tests: public ::testing::Test { ov::Shape input1_shape; std::vector input0_order; std::vector input1_order; + ov::Shape beam_table_shape; cldnn::layout input0_layout; cldnn::layout input1_layout; @@ -842,7 +988,7 @@ class gemm_gpu_tests: public ::testing::Test { const auto abs_error = 0.0001; for (uint32_t i = 0; i < ref_out_data.size(); ++i) { - ASSERT_NEAR(output_ptr[i], ref_out_data[i], abs_error); + ASSERT_NEAR(output_ptr[i], ref_out_data[i], abs_error) << "at " << i; } } @@ -914,7 +1060,6 @@ class gemm_gpu_tests: public ::testing::Test { input0_layout = layout{ov::PartialShape(input0_shape), data_types::f16, format::bfyx}; input1_layout = layout{ov::PartialShape(input1_shape), data_types::f16, format::bfyx}; } - auto input0_mem = engine.allocate_memory(layout{ov::PartialShape(input0_shape), data_types::f16, format::bfyx}); auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(input1_shape), data_types::f16, format::bfyx}); @@ -1072,6 +1217,14 @@ TEST_F(gemm_gpu_tests, transpose_matmul_static_4d) { this->test_transpose_matmul(4, false, false); } +TEST_F(gemm_gpu_tests, transpose_matmul_in0_indirect) { + this->test_transpose_indirect(false, true, false); +} + +TEST_F(gemm_gpu_tests, transpose_matmul_in1_indirect) { + 
this->test_transpose_indirect(false, false, true); +} + TEST_F(gemm_gpu_tests, transpose_matmul_transpose_dynamic_1d) { this->test_transpose_matmul_transpose(1, true, false); } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp index 1e3917e16fc0a4..48d596ab3ba0e9 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp @@ -127,8 +127,8 @@ class check_hash_value: public ::testing::Test { const auto primitive_hash = primitve->hash(); const auto params_hash = prim_inst->get_impl_params()->hash(); - ASSERT_EQ(primitive_hash, 6333308204192016515UL); - ASSERT_EQ(params_hash, 5512364123521496254UL); + ASSERT_EQ(primitive_hash, 15839977233203008631UL); + ASSERT_EQ(params_hash, 15375157605915685928UL); } void test_permute_basic(bool is_caching_test) { diff --git a/src/plugins/intel_gpu/tests/unit/transformations/indirect_kv_cache_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/indirect_kv_cache_test.cpp new file mode 100644 index 00000000000000..276d11662f3b5b --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/indirect_kv_cache_test.cpp @@ -0,0 +1,123 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "common_test_utils/graph_comparator.hpp" +#include "common_test_utils/ov_test_utils.hpp" + +#include +#include + +#include "openvino/core/model.hpp" +#include "openvino/core/node_vector.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/result.hpp" +#include "openvino/pass/manager.hpp" + +#include +#include "plugin/transformations/indirect_kv_cache.hpp" + +#include "intel_gpu/op/indirect_gemm.hpp" +#include "intel_gpu/op/gemm.hpp" +#include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/kv_cache.hpp" + +using namespace testing; +using namespace ov::intel_gpu; + +TEST_F(TransformationTestsF, IndirectKVCache1) { + std::vector in0_order = {0, 1, 2, 3}; + std::vector in1_order = {0, 1, 3, 2}; + std::vector out_order = {0, 1, 2, 3}; + { + auto variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f32, "v0"}); + auto new_token_param = std::make_shared(ov::element::f32, ov::PartialShape{1, 32, -1, 80}); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{1}); + auto past = std::make_shared(variable); + auto axis = std::make_shared(ov::element::i64, ov::Shape{}, 0); + auto gather_past = std::make_shared(past, beam_idx, axis); + auto kv_cache = std::make_shared(gather_past, new_token_param, variable, 2, ov::element::f32); + auto gemm_in = std::make_shared(ov::element::f32, ov::PartialShape{1, 32, -1, -1}); + auto gemm = std::make_shared(gemm_in, kv_cache, in0_order, in1_order, out_order); + auto result = std::make_shared(gemm); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{new_token_param, beam_idx, gemm_in}); + manager.register_pass(); + } + { + auto variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f32, "v0"}); + auto parameter = std::make_shared(ov::element::f32, ov::PartialShape{1, 32, -1, 80}); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{1}); + auto past = std::make_shared(variable); + auto kv_cache = std::make_shared(past, parameter, beam_idx, variable, 2, 0, ov::element::f32); + auto gemm_in = std::make_shared(ov::element::f32, 
ov::PartialShape{1, 32, -1, -1}); + auto gemm = std::make_shared(gemm_in, kv_cache->output(0), kv_cache->output(1), false, true, + in0_order, in1_order, out_order); + auto result = std::make_shared(gemm); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{parameter, beam_idx, gemm_in}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, IndirectKVCache2) { + std::vector in0_order = {0, 1, 2, 3}; + std::vector in1_order = {0, 1, 3, 2}; + std::vector out_order = {0, 1, 2, 3}; + { + auto variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f32, "v0"}); + auto new_token_param = std::make_shared(ov::element::f32, ov::PartialShape{1, 32, -1, 80}); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{1}); + auto past = std::make_shared(variable); + auto axis = std::make_shared(ov::element::i64, ov::Shape{}, 0); + auto gather_past = std::make_shared(past, beam_idx, axis); + auto kv_cache = std::make_shared(gather_past, new_token_param, variable, 2, ov::element::f32); + auto gemm_in = std::make_shared(ov::element::f32, ov::PartialShape{1, 32, -1, -1}); + auto gemm = std::make_shared(kv_cache, gemm_in, in0_order, in1_order, out_order); + auto result = std::make_shared(gemm); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{new_token_param, beam_idx, gemm_in}); + manager.register_pass(); + } + { + auto variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f32, "v0"}); + auto parameter = std::make_shared(ov::element::f32, ov::PartialShape{1, 32, -1, 80}); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{1}); + auto past = std::make_shared(variable); + auto kv_cache = std::make_shared(past, parameter, beam_idx, variable, 2, 0, ov::element::f32); + auto gemm_in = std::make_shared(ov::element::f32, ov::PartialShape{1, 32, -1, -1}); + auto gemm = std::make_shared(kv_cache->output(0), gemm_in, kv_cache->output(1), true, false, + in0_order, in1_order, out_order); + auto result = std::make_shared(gemm); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{parameter, beam_idx, gemm_in}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, IndirectKVCache3) { + std::vector in0_order = {0, 1, 2, 3}; + std::vector in1_order = {0, 1, 3, 2}; + std::vector out_order = {0, 1, 2, 3}; + { + auto variable = std::make_shared(ov::op::util::VariableInfo{{1, 32, -1, 80}, ov::element::f32, "v0"}); + auto new_token_param = std::make_shared(ov::element::f32, ov::PartialShape{1, 32, -1, 80}); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape{1}); + auto past = std::make_shared(variable); + auto axis = std::make_shared(ov::element::i64, ov::Shape{}, 1); + auto gather_past = std::make_shared(past, beam_idx, axis); + auto kv_cache = std::make_shared(gather_past, new_token_param, variable, 2, ov::element::f32); + auto gemm_in = std::make_shared(ov::element::f32, ov::PartialShape{1, 32, -1, -1}); + auto gemm = std::make_shared(gemm_in, kv_cache, in0_order, in1_order, out_order); + auto result = std::make_shared(gemm); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{new_token_param, beam_idx, gemm_in}); + manager.register_pass(); + } + { + model_ref = model->clone(); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +}
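Editor's note: the beam-table bookkeeping introduced by this patch is spread across KVCache::shape_infer, VariableStateIndirectKVCache, and the IndirectGemm reference checks in gemm_gpu_test.cpp. Below is a minimal standalone sketch of the shape rule and of the lookup semantics. It is illustrative only and not part of the patch; the helper name is invented, static int64_t dimensions are used instead of ov::PartialShape for brevity, and axes are assumed to be already normalized to non-negative values.

#include <cstdint>
#include <vector>

// Shape of the beam table produced as the second KVCache output: same rank as the
// KV tensor, every dimension set to 1 except the gather (beam) axis, which keeps the
// beam/batch count of the past state, and the concat axis, which keeps the
// concatenated sequence length (past + new tokens).
std::vector<int64_t> beam_table_shape_sketch(const std::vector<int64_t>& past_kv,
                                             const std::vector<int64_t>& new_token,
                                             size_t concat_axis,
                                             size_t gather_axis) {
    std::vector<int64_t> present = past_kv;               // concat(past, new_token) along concat_axis
    present[concat_axis] += new_token[concat_axis];

    std::vector<int64_t> beam_table(present.size(), 1);
    beam_table[gather_axis] = past_kv[gather_axis];
    beam_table[concat_axis] = present[concat_axis];
    return beam_table;
}

// Example: past_kv = {2, 32, 7, 80}, new_token = {2, 32, 3, 80}, concat_axis = 2,
// gather_axis = 0  ->  beam table shape {2, 1, 10, 1}.
//
// IndirectGemm then resolves beams through this table instead of a standalone Gather:
// for the "input1 is indirect" case exercised by the unit test above, the effective
// operand is
//     B_eff[b][k][n] = B[beam_table[b][k]][k][n],
// i.e. each (batch, sequence) slot reads the row of the beam it was propagated from.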