diff --git a/src/layer/vulkan/reduction_vulkan.cpp b/src/layer/vulkan/reduction_vulkan.cpp new file mode 100644 index 000000000000..91caaad76e22 --- /dev/null +++ b/src/layer/vulkan/reduction_vulkan.cpp @@ -0,0 +1,371 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#include "reduction_vulkan.h" + +#include <vector> + +#include "layer_shader_type.h" + +namespace ncnn { + +Reduction_vulkan::Reduction_vulkan() +{ + support_vulkan = true; + support_vulkan_packing = false; + support_vulkan_any_packing = false; + + pipeline_reduction = 0; +} + +static inline int axis_size_from_vkmat(int axis, int dims, const VkMat& m) +{ + if (dims == 1) + return axis == 3 ? m.w : 1; + + if (dims == 2) + { + if (axis == 2) return m.h; + if (axis == 3) return m.w; + return 1; + } + + if (dims == 3) + { + if (axis == 0) return m.c; + if (axis == 2) return m.h; + if (axis == 3) return m.w; + return 1; + } + + if (axis == 0) return m.c; + if (axis == 1) return m.d; + if (axis == 2) return m.h; + if (axis == 3) return m.w; + return 1; +} + +static inline void resolve_reduce_flags(int dims, int reduce_all, const Mat& axes, + bool& reduce_w, bool& reduce_h, bool& reduce_d, bool& reduce_c) +{ + reduce_w = false; + reduce_h = false; + reduce_d = false; + reduce_c = false; + + if (reduce_all) + { + reduce_w = true; + reduce_h = true; + reduce_d = true; + reduce_c = true; + return; + } + + int axes_flag[4] = {0, 0, 0, 0}; + const int* axes_ptr = axes; + const int axes_count = axes.w; + + for (int i = 0; i < axes_count; i++) + { + int axis = axes_ptr[i]; + if (axis < 0) axis += dims; + if (axis >= 0 && axis < 4) axes_flag[axis] = 1; + } + + if (dims == 1) + { + reduce_w = true; + } + else if (dims == 2) + { + if (axes_flag[0]) reduce_h = true; + if (axes_flag[1]) reduce_w = true; + } + else if (dims == 3) + { + if (axes_flag[0]) reduce_c = true; + if (axes_flag[1]) reduce_h = true; + if (axes_flag[2]) reduce_w = true; + } + else + { + if (axes_flag[0]) reduce_c = true; + if 
(axes_flag[1]) reduce_d = true; + if (axes_flag[2]) reduce_h = true; + if (axes_flag[3]) reduce_w = true; + } +} + +static inline void resolve_output_shape_and_mapping(const VkMat& bottom_blob, + bool reduce_w, bool reduce_h, bool reduce_d, bool reduce_c, + int keepdims, + int& outdims, int& out_w, int& out_h, int& out_d, int& out_c, + int& map_out_w, int& map_out_h, int& map_out_d, int& map_out_c) +{ + const int dims = bottom_blob.dims; + + outdims = 1; + out_w = 1; + out_h = 1; + out_d = 1; + out_c = 1; + + map_out_w = -1; + map_out_h = -1; + map_out_d = -1; + map_out_c = -1; + + auto is_reduced_axis = [&](int axis) -> bool { + if (axis == 0) return reduce_c; + if (axis == 1) return reduce_d; + if (axis == 2) return reduce_h; + if (axis == 3) return reduce_w; + return false; + }; + + int in_axes[4]; + int in_axes_count = 0; + if (dims == 1) + { + in_axes[0] = 3; + in_axes_count = 1; + } + else if (dims == 2) + { + in_axes[0] = 2; + in_axes[1] = 3; + in_axes_count = 2; + } + else if (dims == 3) + { + in_axes[0] = 0; + in_axes[1] = 2; + in_axes[2] = 3; + in_axes_count = 3; + } + else + { + in_axes[0] = 0; + in_axes[1] = 1; + in_axes[2] = 2; + in_axes[3] = 3; + in_axes_count = 4; + } + + if (keepdims) + { + outdims = dims; + + if (dims == 1) + { + out_w = reduce_w ? 1 : bottom_blob.w; + map_out_w = 3; + } + else if (dims == 2) + { + out_h = reduce_h ? 1 : bottom_blob.h; + out_w = reduce_w ? 1 : bottom_blob.w; + map_out_h = 2; + map_out_w = 3; + } + else if (dims == 3) + { + out_c = reduce_c ? 1 : bottom_blob.c; + out_h = reduce_h ? 1 : bottom_blob.h; + out_w = reduce_w ? 1 : bottom_blob.w; + map_out_c = 0; + map_out_h = 2; + map_out_w = 3; + } + else + { + out_c = reduce_c ? 1 : bottom_blob.c; + out_d = reduce_d ? 1 : bottom_blob.d; + out_h = reduce_h ? 1 : bottom_blob.h; + out_w = reduce_w ? 
1 : bottom_blob.w; + map_out_c = 0; + map_out_d = 1; + map_out_h = 2; + map_out_w = 3; + } + + return; + } + + int keep_axes[4]; + int keep_count = 0; + for (int i = 0; i < in_axes_count; i++) + { + if (!is_reduced_axis(in_axes[i])) + keep_axes[keep_count++] = in_axes[i]; + } + + if (keep_count == 0) + { + outdims = 1; + out_w = 1; + return; + } + + outdims = keep_count; + + if (outdims == 1) + { + map_out_w = keep_axes[0]; + out_w = axis_size_from_vkmat(map_out_w, dims, bottom_blob); + } + else if (outdims == 2) + { + map_out_h = keep_axes[0]; + map_out_w = keep_axes[1]; + out_h = axis_size_from_vkmat(map_out_h, dims, bottom_blob); + out_w = axis_size_from_vkmat(map_out_w, dims, bottom_blob); + } + else if (outdims == 3) + { + map_out_c = keep_axes[0]; + map_out_h = keep_axes[1]; + map_out_w = keep_axes[2]; + out_c = axis_size_from_vkmat(map_out_c, dims, bottom_blob); + out_h = axis_size_from_vkmat(map_out_h, dims, bottom_blob); + out_w = axis_size_from_vkmat(map_out_w, dims, bottom_blob); + } + else + { + map_out_c = keep_axes[0]; + map_out_d = keep_axes[1]; + map_out_h = keep_axes[2]; + map_out_w = keep_axes[3]; + out_c = axis_size_from_vkmat(map_out_c, dims, bottom_blob); + out_d = axis_size_from_vkmat(map_out_d, dims, bottom_blob); + out_h = axis_size_from_vkmat(map_out_h, dims, bottom_blob); + out_w = axis_size_from_vkmat(map_out_w, dims, bottom_blob); + } +} + +static inline float compute_coeff2_for_mean(const VkMat& bottom_blob, + bool reduce_w, bool reduce_h, bool reduce_d, bool reduce_c, + float coeff) +{ + int scale = 1; + const int dims = bottom_blob.dims; + + if (dims == 1) + { + scale = bottom_blob.w; + } + else if (dims == 2) + { + if (reduce_w) scale *= bottom_blob.w; + if (reduce_h) scale *= bottom_blob.h; + } + else if (dims == 3) + { + if (reduce_w) scale *= bottom_blob.w; + if (reduce_h) scale *= bottom_blob.h; + if (reduce_c) scale *= bottom_blob.c; + } + else + { + if (reduce_w) scale *= bottom_blob.w; + if (reduce_h) scale *= bottom_blob.h; + 
if (reduce_d) scale *= bottom_blob.d; + if (reduce_c) scale *= bottom_blob.c; + } + + return coeff / scale; +} + +int Reduction_vulkan::create_pipeline(const Option& opt) +{ + pipeline_reduction = new Pipeline(vkdev); + pipeline_reduction->set_local_size_xyz(256, 1, 1); + + std::vector<vk_specialization_type> specializations(1); + specializations[0].i = operation; + + pipeline_reduction->create(LayerShaderType::reduction, opt, specializations); + return 0; +} + +int Reduction_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_reduction; + pipeline_reduction = 0; + return 0; +} + +int Reduction_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + if (bottom_blob.empty()) + return -100; + + bool reduce_w, reduce_h, reduce_d, reduce_c; + resolve_reduce_flags(bottom_blob.dims, reduce_all, axes, reduce_w, reduce_h, reduce_d, reduce_c); + + int outdims, out_w, out_h, out_d, out_c; + int map_out_w, map_out_h, map_out_d, map_out_c; + resolve_output_shape_and_mapping(bottom_blob, reduce_w, reduce_h, reduce_d, reduce_c, keepdims, + outdims, out_w, out_h, out_d, out_c, + map_out_w, map_out_h, map_out_d, map_out_c); + + const size_t elemsize = bottom_blob.elemsize; + + if (outdims == 1) + top_blob.create(out_w, elemsize, opt.blob_vkallocator); + else if (outdims == 2) + top_blob.create(out_w, out_h, elemsize, opt.blob_vkallocator); + else if (outdims == 3) + top_blob.create(out_w, out_h, out_c, elemsize, opt.blob_vkallocator); + else + top_blob.create(out_w, out_h, out_d, out_c, elemsize, opt.blob_vkallocator); + + if (top_blob.empty()) + return -100; + + float coeff2 = coeff; + if (operation == ReductionOp_MEAN) + coeff2 = compute_coeff2_for_mean(bottom_blob, reduce_w, reduce_h, reduce_d, reduce_c, coeff); + + std::vector<VkMat> bindings(2); + bindings[0] = top_blob; + bindings[1] = bottom_blob; + + std::vector<vk_constant_type> constants(21); + constants[0].i = bottom_blob.w; + constants[1].i = bottom_blob.h; + constants[2].i = bottom_blob.d; + 
constants[3].i = bottom_blob.c; + constants[4].i = (int)bottom_blob.cstep; + constants[5].i = bottom_blob.dims; + + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.d; + constants[9].i = top_blob.c; + constants[10].i = top_blob.dims; + constants[11].i = (int)top_blob.cstep; + + constants[12].i = reduce_w ? 1 : 0; + constants[13].i = reduce_h ? 1 : 0; + constants[14].i = reduce_d ? 1 : 0; + constants[15].i = reduce_c ? 1 : 0; + + constants[16].i = map_out_w; + constants[17].i = map_out_h; + constants[18].i = map_out_d; + constants[19].i = map_out_c; + + constants[20].f = coeff2; + + VkMat dispatcher; + dispatcher.w = 256; + dispatcher.h = (int)top_blob.total(); + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_reduction, bindings, constants, dispatcher); + return 0; +} + +} // namespace ncnn diff --git a/src/layer/vulkan/reduction_vulkan.h b/src/layer/vulkan/reduction_vulkan.h new file mode 100644 index 000000000000..b570873efa5d --- /dev/null +++ b/src/layer/vulkan/reduction_vulkan.h @@ -0,0 +1,28 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef LAYER_REDUCTION_VULKAN_H +#define LAYER_REDUCTION_VULKAN_H + +#include "reduction.h" + +namespace ncnn { + +class Reduction_vulkan : public Reduction +{ +public: + Reduction_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + using Reduction::forward; + virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_reduction; +}; + +} // namespace ncnn + +#endif // LAYER_REDUCTION_VULKAN_H diff --git a/src/layer/vulkan/shader/reduction.comp b/src/layer/vulkan/shader/reduction.comp new file mode 100644 index 000000000000..d0f732220a5b --- /dev/null +++ b/src/layer/vulkan/shader/reduction.comp @@ -0,0 +1,255 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if 
ncnn_subgroup_arithmetic +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable +#endif + +layout(constant_id = 0) const int op = 0; + +layout(binding = 0) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout(binding = 1) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int in_w; + int in_h; + int in_d; + int in_c; + int in_cstep; + int in_dims; + + int out_w; + int out_h; + int out_d; + int out_c; + int out_dims; + int out_cstep; + + int reduce_w; + int reduce_h; + int reduce_d; + int reduce_c; + + int map_out_w; + int map_out_h; + int map_out_d; + int map_out_c; + + float coeff; +} p; + +shared lfp sdata[256]; + +const float FLT_MAX_VALUE = 3.402823466e+38; +const float FLT_MIN_VALUE = 1.175494351e-38; + +afp init_acc() +{ + if (op == 6) return afp(1.f); + if (op == 4) return afp(-FLT_MAX_VALUE); + if (op == 5) return afp(FLT_MAX_VALUE); + return afp(0.f); +} + +afp element_accum(afp acc, afp v) +{ + if (op == 0 || op == 3 || op == 9) return acc + v; + if (op == 1 || op == 7) return acc + abs(v); + if (op == 2 || op == 8) return acc + v * v; + if (op == 4) return max(acc, v); + if (op == 5) return min(acc, v); + if (op == 6) return acc * v; + if (op == 10) return acc + exp(v); + return acc + v; +} + +afp combine_accum(afp a, afp b) +{ + if (op == 4) return max(a, b); + if (op == 5) return min(a, b); + if (op == 6) return a * b; + return a + b; +} + +#if ncnn_subgroup_arithmetic +afp subgroup_reduce_acc(afp v) +{ + if (op == 4) return subgroupMax(v); + if (op == 5) return subgroupMin(v); + if (op == 6) return subgroupMul(v); + return subgroupAdd(v); +} +#endif + +void decode_out(int out_index, out int ow, out int oh, out int od, out int oc) +{ + ow = 0; + oh = 0; + od = 0; + oc = 0; + + if (p.out_dims == 1) + { + ow = out_index; + } + else if (p.out_dims == 2) + { + ow = out_index % p.out_w; + oh = out_index / p.out_w; + } + else if (p.out_dims == 3) + 
{ + ow = out_index % p.out_w; + int t = out_index / p.out_w; + oh = t % p.out_h; + oc = t / p.out_h; + } + else + { + ow = out_index % p.out_w; + int t0 = out_index / p.out_w; + oh = t0 % p.out_h; + int t1 = t0 / p.out_h; + od = t1 % p.out_d; + oc = t1 / p.out_d; + } +} + +int in_index(int iw, int ih, int id, int ic) +{ + if (p.in_dims == 1) return iw; + if (p.in_dims == 2) return ih * p.in_w + iw; + if (p.in_dims == 3) return ic * p.in_cstep + ih * p.in_w + iw; + + int in_dstep = p.in_w * p.in_h; + if (p.in_d > 1) in_dstep = p.in_cstep / p.in_d; + return ic * p.in_cstep + id * in_dstep + ih * p.in_w + iw; +} + +int out_offset(int ow, int oh, int od, int oc) +{ + if (p.out_dims == 1) return ow; + if (p.out_dims == 2) return oh * p.out_w + ow; + if (p.out_dims == 3) return oc * p.out_cstep + oh * p.out_w + ow; + + int out_dstep = p.out_w * p.out_h; + if (p.out_d > 1) out_dstep = p.out_cstep / p.out_d; + return oc * p.out_cstep + od * out_dstep + oh * p.out_w + ow; +} + +afp finalize_out(afp outv) +{ + if (op == 9 || op == 10) outv = log(outv); + if (op == 8) outv = sqrt(outv < afp(FLT_MIN_VALUE) ? afp(0.f) : outv); + if (p.coeff != 1.f) outv = outv * afp(p.coeff); + return outv; +} + +void main() +{ + const int out_index = int(gl_WorkGroupID.y); + const int tid = int(gl_LocalInvocationID.x); + + int ow, oh, od, oc; + decode_out(out_index, ow, oh, od, oc); + + int in_coord[4] = int[4](0, 0, 0, 0); + if (p.map_out_w >= 0) in_coord[p.map_out_w] = ow; + if (p.map_out_h >= 0) in_coord[p.map_out_h] = oh; + if (p.map_out_d >= 0) in_coord[p.map_out_d] = od; + if (p.map_out_c >= 0) in_coord[p.map_out_c] = oc; + + int in_w = p.in_w; + int in_h = (p.in_h > 0) ? p.in_h : 1; + int in_d = (p.in_d > 0) ? p.in_d : 1; + int in_c = (p.in_c > 0) ? 
p.in_c : 1; + + int reduce_size = 1; + if (p.reduce_w != 0) reduce_size *= in_w; + if (p.reduce_h != 0) reduce_size *= in_h; + if (p.in_dims == 4 && p.reduce_d != 0) reduce_size *= in_d; + if (p.in_dims >= 3 && p.reduce_c != 0) reduce_size *= in_c; + + afp acc = init_acc(); + + for (int t = tid; t < reduce_size; t += 256) + { + int tt = t; + + int iw = in_coord[3]; + int ih = in_coord[2]; + int id = in_coord[1]; + int ic = in_coord[0]; + + if (p.reduce_w != 0) + { + iw = tt % in_w; + tt /= in_w; + } + if (p.reduce_h != 0) + { + ih = tt % in_h; + tt /= in_h; + } + if (p.in_dims == 4 && p.reduce_d != 0) + { + id = tt % in_d; + tt /= in_d; + } + if (p.in_dims >= 3 && p.reduce_c != 0) { ic = tt % in_c; } + + afp v = buffer_ld1(bottom_blob_data, in_index(iw, ih, id, ic)); + acc = element_accum(acc, v); + } + +#if ncnn_subgroup_arithmetic + afp sg = subgroup_reduce_acc(acc); + + if (subgroupElect()) + { + sdata[int(gl_SubgroupID)] = sfp2lfp(sg); + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + const int num_sg = int(gl_NumSubgroups); + + afp v = lane < num_sg ? lfp2afp(sdata[lane]) : init_acc(); + afp outv = subgroup_reduce_acc(v); + + if (subgroupElect()) + { + outv = finalize_out(outv); + buffer_st1(top_blob_data, out_offset(ow, oh, od, oc), outv); + } + } +#else + sdata[tid] = sfp2lfp(acc); + barrier(); + + for (int stride = 128; stride > 0; stride >>= 1) + { + if (tid < stride) + { + afp a = lfp2afp(sdata[tid]); + afp b = lfp2afp(sdata[tid + stride]); + sdata[tid] = sfp2lfp(combine_accum(a, b)); + } + barrier(); + } + + if (tid == 0) + { + afp outv = lfp2afp(sdata[0]); + outv = finalize_out(outv); + buffer_st1(top_blob_data, out_offset(ow, oh, od, oc), outv); + } +#endif +}