diff --git a/onnxruntime/core/providers/webgpu/webgpu_utils.cc b/onnxruntime/core/providers/webgpu/webgpu_utils.cc index 4386bdcc94056..e48c8c833c7e3 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_utils.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_utils.cc @@ -27,24 +27,49 @@ TensorShape ReduceShapeByComponents(const TensorShape& shape, int64_t components SplitKConfig::SplitKConfig(const wgpu::AdapterInfo& adapter_info) { if (adapter_info.vendor == std::string_view{"intel"}) { - if (adapter_info.architecture == std::string_view{"xe-2lpg"} || - adapter_info.architecture == std::string_view{"xe-2hpg"} || - adapter_info.architecture == std::string_view{"xe-lpg"} || - adapter_info.architecture == std::string_view{"gen-12hp"}) { + // Disable Split-K on old Intel GPUs. + if (adapter_info.architecture == std::string_view{"gen-7"} || + adapter_info.architecture == std::string_view{"gen-8"} || + adapter_info.architecture == std::string_view{"gen-9"} || + adapter_info.architecture == std::string_view{"gen-11"}) { + enable_split_k_ = false; + } else if (adapter_info.architecture == std::string_view{"xe-2lpg"} || + adapter_info.architecture == std::string_view{"xe-2hpg"} || + adapter_info.architecture == std::string_view{"gen-12hp"}) { + // Below thresholds are only verified on Intel discrete GPUs and Lunar Lake iGPUs. enable_split_k_ = true; - // Below thresholds are only verified on the above Intel GPUs without any regressions. The - // proper value of `max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner_` may be - // reduced when we support a larger `dim_inner` because larger `dim_inner` will bring more - // atomic calls for each output value. 
split_dim_inner_ = 256; min_dim_inner_with_split_k_ = split_dim_inner_ * 2; - max_dim_inner_with_split_k_ = split_dim_inner_ * 9; - max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner_ = 35.0f; + + configs_per_dim_inner_range_.emplace_back(768, 52.0f); + configs_per_dim_inner_range_.emplace_back(2304, 35.0f); + configs_per_dim_inner_range_.emplace_back(3072, 21.5f); + configs_per_dim_inner_range_.emplace_back(4096, 16.0f); + } else { + // Below are the default thresholds on newer Intel GPUs. These values are chosen on + // Intel "gen-12lp" GPU with 32EUs. + enable_split_k_ = true; + + split_dim_inner_ = 256; + min_dim_inner_with_split_k_ = split_dim_inner_ * 2; + + configs_per_dim_inner_range_.emplace_back(768, 20.0f); + configs_per_dim_inner_range_.emplace_back(1792, 13.0f); + configs_per_dim_inner_range_.emplace_back(3072, 8.0f); + configs_per_dim_inner_range_.emplace_back(4096, 6.0f); } } } +SplitKConfig::ConfigAtRange::ConfigAtRange(uint32_t max_dim_inner, float rate) + : max_dim_inner_with_rate(max_dim_inner), max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner(rate) {} + +uint32_t SplitKConfig::GetMaxDimInnerWithSplitK() const { + assert(!configs_per_dim_inner_range_.empty()); + return configs_per_dim_inner_range_.back().max_dim_inner_with_rate; +} + bool SplitKConfig::UseSplitK( bool is_vec4, ActivationKind activation_kind, @@ -71,11 +96,20 @@ bool SplitKConfig::UseSplitK( // Split-K works best when `dim_inner` is relatively large compared with `dim_a_outer` and // `dim_b_outer`. Currently we use the factor between `(dim_a_outer * dim_b_outer)` and // `dim_inner)` as the metric to decide whether to use Split-K or not. 
- use_split_k &= (dim_inner >= min_dim_inner_with_split_k_); - use_split_k &= (dim_inner <= max_dim_inner_with_split_k_); - use_split_k &= ((dim_a_outer * dim_b_outer * 1.0f / dim_inner) <= max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner_); + use_split_k &= dim_inner >= min_dim_inner_with_split_k_; + use_split_k &= dim_inner <= GetMaxDimInnerWithSplitK(); - return use_split_k; + if (!use_split_k) { + return false; + } + + const float rate = dim_a_outer * dim_b_outer * 1.0f / dim_inner; + for (const auto& config_at_range : configs_per_dim_inner_range_) { + if (dim_inner <= config_at_range.max_dim_inner_with_rate) { + return rate <= config_at_range.max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner; + } + } + return false; } uint32_t SplitKConfig::GetSplitDimInner() const { diff --git a/onnxruntime/core/providers/webgpu/webgpu_utils.h b/onnxruntime/core/providers/webgpu/webgpu_utils.h index d56ee17504c24..6c72fd07938d5 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_utils.h +++ b/onnxruntime/core/providers/webgpu/webgpu_utils.h @@ -116,8 +116,15 @@ class SplitKConfig { bool enable_split_k_ = false; uint32_t split_dim_inner_ = 0; uint32_t min_dim_inner_with_split_k_ = 0; - uint32_t max_dim_inner_with_split_k_ = 0; - float max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner_ = 0.0f; + + uint32_t GetMaxDimInnerWithSplitK() const; + + struct ConfigAtRange { + ConfigAtRange(uint32_t max_dim_inner, float rate); + uint32_t max_dim_inner_with_rate = 0; + float max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner = 0.0f; + }; + std::vector<ConfigAtRange> configs_per_dim_inner_range_; }; /**