Skip to content
65 changes: 49 additions & 16 deletions onnxruntime/core/providers/webgpu/webgpu_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,47 @@
}

SplitKConfig::SplitKConfig(const wgpu::AdapterInfo& adapter_info) {
if (adapter_info.vendor == std::string_view{"intel"}) {
if (adapter_info.architecture == std::string_view{"xe-2lpg"} ||
adapter_info.architecture == std::string_view{"xe-2hpg"} ||
adapter_info.architecture == std::string_view{"xe-lpg"} ||
adapter_info.architecture == std::string_view{"gen-12hp"}) {
if (adapter_info.vendor == std::string_view{"intel"})

Check warning on line 29 in onnxruntime/core/providers/webgpu/webgpu_utils.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 If/else bodies with multiple statements require braces [readability/braces] [4] Raw Output: onnxruntime/core/providers/webgpu/webgpu_utils.cc:29: If/else bodies with multiple statements require braces [readability/braces] [4]
// Disable Split-K on old Intel GPUs.
if (adapter_info.architecture == std::string_view{"gen-7"} ||
adapter_info.architecture == std::string_view{"gen-8"} ||
adapter_info.architecture == std::string_view{"gen-9"} ||
adapter_info.architecture == std::string_view{"gen-11"}) {
enable_split_k_ = false;
} else if (adapter_info.architecture == std::string_view{"xe-2lpg"} ||
adapter_info.architecture == std::string_view{"xe-2hpg"} ||
adapter_info.architecture == std::string_view{"gen-12hp"}) {
// The thresholds below are only verified on Intel discrete GPUs and Lunar Lake iGPUs.
enable_split_k_ = true;

// Below thresholds are only verified on the above Intel GPUs without any regressions. The
// proper value of `max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner_` may be
// reduced when we support a larger `dim_inner` because larger `dim_inner` will bring more
// atomic calls for each output value.
split_dim_inner_ = 256;
min_dim_inner_with_split_k_ = split_dim_inner_ * 2;
max_dim_inner_with_split_k_ = split_dim_inner_ * 9;
max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner_ = 35.0f;

configs_per_dim_inner_range_.emplace_back(768, 52.0f);
configs_per_dim_inner_range_.emplace_back(2304, 35.0f);
configs_per_dim_inner_range_.emplace_back(3072, 21.5f);
configs_per_dim_inner_range_.emplace_back(4096, 16.0f);
} else {
// Below are the default thresholds on newer Intel GPUs. These values are chosen on
// Intel "gen-12lp" GPU with 32EUs.
enable_split_k_ = true;

split_dim_inner_ = 256;
min_dim_inner_with_split_k_ = split_dim_inner_ * 2;

configs_per_dim_inner_range_.emplace_back(768, 20.0f);
configs_per_dim_inner_range_.emplace_back(1792, 13.0f);
configs_per_dim_inner_range_.emplace_back(3072, 8.0f);
configs_per_dim_inner_range_.emplace_back(4096, 6.0f);
}
}
}

SplitKConfig::ConfigAtRange::ConfigAtRange(uint32_t max_dim_inner, float rate)
: max_dim_inner_with_rate(max_dim_inner), max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner(rate) {}

// Returns the largest `dim_inner` for which Split-K may be used, i.e. the `dim_inner` bound of
// the last entry in `configs_per_dim_inner_range_` (the constructor appends entries in ascending
// `dim_inner` order, so the last entry carries the largest bound).
//
// Returns 0 when the table is empty. The constructor leaves the table empty whenever it does not
// enable Split-K, and `UseSplitK()` evaluates this getter inside a non-short-circuiting `&=`
// expression, so the empty case has to be well defined: calling `back()` on an empty vector is
// undefined behavior, and an `assert` alone offers no protection once `NDEBUG` is defined.
uint32_t SplitKConfig::GetMaxDimInnerWithSplitK() const {
  if (configs_per_dim_inner_range_.empty()) {
    return 0;
  }
  return configs_per_dim_inner_range_.back().max_dim_inner_with_rate;
}

bool SplitKConfig::UseSplitK(
Expand Down Expand Up @@ -71,11 +95,20 @@
// Split-K works best when `dim_inner` is relatively large compared with `dim_a_outer` and
// `dim_b_outer`. Currently we use the factor between `(dim_a_outer * dim_b_outer)` and
// `dim_inner` as the metric to decide whether to use Split-K or not.
use_split_k &= (dim_inner >= min_dim_inner_with_split_k_);
use_split_k &= (dim_inner <= max_dim_inner_with_split_k_);
use_split_k &= ((dim_a_outer * dim_b_outer * 1.0f / dim_inner) <= max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner_);
use_split_k &= dim_inner >= min_dim_inner_with_split_k_;
use_split_k &= dim_inner <= GetMaxDimInnerWithSplitK();

return use_split_k;
if (!use_split_k) {
return false;
}

const float rate = dim_a_outer * dim_b_outer * 1.0f / dim_inner;
for (const auto& config_at_range : configs_per_dim_inner_range_) {
if (dim_inner <= config_at_range.max_dim_inner_with_rate) {
return rate <= config_at_range.max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner;
}
}
return false;
}

uint32_t SplitKConfig::GetSplitDimInner() const {
Expand Down
11 changes: 9 additions & 2 deletions onnxruntime/core/providers/webgpu/webgpu_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,15 @@
bool enable_split_k_ = false;
uint32_t split_dim_inner_ = 0;
uint32_t min_dim_inner_with_split_k_ = 0;
uint32_t max_dim_inner_with_split_k_ = 0;
float max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner_ = 0.0f;

uint32_t GetMaxDimInnerWithSplitK() const;

// One entry of the Split-K threshold table: a `dim_inner` upper bound paired with the rate limit
// that applies up to that bound.
struct ConfigAtRange {
// Stores `max_dim_inner` and `rate` into the two members below, in that order.
ConfigAtRange(uint32_t max_dim_inner, float rate);
// Inclusive upper bound of `dim_inner` for which this entry is selected.
uint32_t max_dim_inner_with_rate = 0;
// Inclusive upper limit of `dim_a_outer * dim_b_outer / dim_inner`; Split-K is only used when
// the computed value does not exceed it.
float max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner = 0.0f;
};
std::vector<ConfigAtRange> configs_per_dim_inner_range_;

Check warning on line 127 in onnxruntime/core/providers/webgpu/webgpu_utils.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <vector> for vector<> [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/webgpu/webgpu_utils.h:127: Add #include <vector> for vector<> [build/include_what_you_use] [4]
};

/**
Expand Down
Loading