diff --git a/src/layer/vulkan/reduction_vulkan.cpp b/src/layer/vulkan/reduction_vulkan.cpp new file mode 100644 index 000000000000..91caaad76e22 --- /dev/null +++ b/src/layer/vulkan/reduction_vulkan.cpp @@ -0,0 +1,371 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#include "reduction_vulkan.h" + +#include <vector> + +#include "layer_shader_type.h" + +namespace ncnn { + +Reduction_vulkan::Reduction_vulkan() +{ + support_vulkan = true; + support_vulkan_packing = false; + support_vulkan_any_packing = false; + + pipeline_reduction = 0; +} + +static inline int axis_size_from_vkmat(int axis, int dims, const VkMat& m) +{ + if (dims == 1) + return axis == 3 ? m.w : 1; + + if (dims == 2) + { + if (axis == 2) return m.h; + if (axis == 3) return m.w; + return 1; + } + + if (dims == 3) + { + if (axis == 0) return m.c; + if (axis == 2) return m.h; + if (axis == 3) return m.w; + return 1; + } + + if (axis == 0) return m.c; + if (axis == 1) return m.d; + if (axis == 2) return m.h; + if (axis == 3) return m.w; + return 1; +} + +static inline void resolve_reduce_flags(int dims, int reduce_all, const Mat& axes, + bool& reduce_w, bool& reduce_h, bool& reduce_d, bool& reduce_c) +{ + reduce_w = false; + reduce_h = false; + reduce_d = false; + reduce_c = false; + + if (reduce_all) + { + reduce_w = true; + reduce_h = true; + reduce_d = true; + reduce_c = true; + return; + } + + int axes_flag[4] = {0, 0, 0, 0}; + const int* axes_ptr = axes; + const int axes_count = axes.w; + + for (int i = 0; i < axes_count; i++) + { + int axis = axes_ptr[i]; + if (axis < 0) axis += dims; + if (axis >= 0 && axis < 4) axes_flag[axis] = 1; + } + + if (dims == 1) + { + reduce_w = true; + } + else if (dims == 2) + { + if (axes_flag[0]) reduce_h = true; + if (axes_flag[1]) reduce_w = true; + } + else if (dims == 3) + { + if (axes_flag[0]) reduce_c = true; + if (axes_flag[1]) reduce_h = true; + if (axes_flag[2]) reduce_w = true; + } + else + { + if (axes_flag[0]) reduce_c = true; + if 
(axes_flag[1]) reduce_d = true; + if (axes_flag[2]) reduce_h = true; + if (axes_flag[3]) reduce_w = true; + } +} + +static inline void resolve_output_shape_and_mapping(const VkMat& bottom_blob, + bool reduce_w, bool reduce_h, bool reduce_d, bool reduce_c, + int keepdims, + int& outdims, int& out_w, int& out_h, int& out_d, int& out_c, + int& map_out_w, int& map_out_h, int& map_out_d, int& map_out_c) +{ + const int dims = bottom_blob.dims; + + outdims = 1; + out_w = 1; + out_h = 1; + out_d = 1; + out_c = 1; + + map_out_w = -1; + map_out_h = -1; + map_out_d = -1; + map_out_c = -1; + + auto is_reduced_axis = [&](int axis) -> bool { + if (axis == 0) return reduce_c; + if (axis == 1) return reduce_d; + if (axis == 2) return reduce_h; + if (axis == 3) return reduce_w; + return false; + }; + + int in_axes[4]; + int in_axes_count = 0; + if (dims == 1) + { + in_axes[0] = 3; + in_axes_count = 1; + } + else if (dims == 2) + { + in_axes[0] = 2; + in_axes[1] = 3; + in_axes_count = 2; + } + else if (dims == 3) + { + in_axes[0] = 0; + in_axes[1] = 2; + in_axes[2] = 3; + in_axes_count = 3; + } + else + { + in_axes[0] = 0; + in_axes[1] = 1; + in_axes[2] = 2; + in_axes[3] = 3; + in_axes_count = 4; + } + + if (keepdims) + { + outdims = dims; + + if (dims == 1) + { + out_w = reduce_w ? 1 : bottom_blob.w; + map_out_w = 3; + } + else if (dims == 2) + { + out_h = reduce_h ? 1 : bottom_blob.h; + out_w = reduce_w ? 1 : bottom_blob.w; + map_out_h = 2; + map_out_w = 3; + } + else if (dims == 3) + { + out_c = reduce_c ? 1 : bottom_blob.c; + out_h = reduce_h ? 1 : bottom_blob.h; + out_w = reduce_w ? 1 : bottom_blob.w; + map_out_c = 0; + map_out_h = 2; + map_out_w = 3; + } + else + { + out_c = reduce_c ? 1 : bottom_blob.c; + out_d = reduce_d ? 1 : bottom_blob.d; + out_h = reduce_h ? 1 : bottom_blob.h; + out_w = reduce_w ? 
1 : bottom_blob.w; + map_out_c = 0; + map_out_d = 1; + map_out_h = 2; + map_out_w = 3; + } + + return; + } + + int keep_axes[4]; + int keep_count = 0; + for (int i = 0; i < in_axes_count; i++) + { + if (!is_reduced_axis(in_axes[i])) + keep_axes[keep_count++] = in_axes[i]; + } + + if (keep_count == 0) + { + outdims = 1; + out_w = 1; + return; + } + + outdims = keep_count; + + if (outdims == 1) + { + map_out_w = keep_axes[0]; + out_w = axis_size_from_vkmat(map_out_w, dims, bottom_blob); + } + else if (outdims == 2) + { + map_out_h = keep_axes[0]; + map_out_w = keep_axes[1]; + out_h = axis_size_from_vkmat(map_out_h, dims, bottom_blob); + out_w = axis_size_from_vkmat(map_out_w, dims, bottom_blob); + } + else if (outdims == 3) + { + map_out_c = keep_axes[0]; + map_out_h = keep_axes[1]; + map_out_w = keep_axes[2]; + out_c = axis_size_from_vkmat(map_out_c, dims, bottom_blob); + out_h = axis_size_from_vkmat(map_out_h, dims, bottom_blob); + out_w = axis_size_from_vkmat(map_out_w, dims, bottom_blob); + } + else + { + map_out_c = keep_axes[0]; + map_out_d = keep_axes[1]; + map_out_h = keep_axes[2]; + map_out_w = keep_axes[3]; + out_c = axis_size_from_vkmat(map_out_c, dims, bottom_blob); + out_d = axis_size_from_vkmat(map_out_d, dims, bottom_blob); + out_h = axis_size_from_vkmat(map_out_h, dims, bottom_blob); + out_w = axis_size_from_vkmat(map_out_w, dims, bottom_blob); + } +} + +static inline float compute_coeff2_for_mean(const VkMat& bottom_blob, + bool reduce_w, bool reduce_h, bool reduce_d, bool reduce_c, + float coeff) +{ + int scale = 1; + const int dims = bottom_blob.dims; + + if (dims == 1) + { + scale = bottom_blob.w; + } + else if (dims == 2) + { + if (reduce_w) scale *= bottom_blob.w; + if (reduce_h) scale *= bottom_blob.h; + } + else if (dims == 3) + { + if (reduce_w) scale *= bottom_blob.w; + if (reduce_h) scale *= bottom_blob.h; + if (reduce_c) scale *= bottom_blob.c; + } + else + { + if (reduce_w) scale *= bottom_blob.w; + if (reduce_h) scale *= bottom_blob.h; + 
if (reduce_d) scale *= bottom_blob.d; + if (reduce_c) scale *= bottom_blob.c; + } + + return coeff / scale; +} + +int Reduction_vulkan::create_pipeline(const Option& opt) +{ + pipeline_reduction = new Pipeline(vkdev); + pipeline_reduction->set_local_size_xyz(256, 1, 1); + + std::vector<vk_specialization_type> specializations(1); + specializations[0].i = operation; + + pipeline_reduction->create(LayerShaderType::reduction, opt, specializations); + return 0; +} + +int Reduction_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_reduction; + pipeline_reduction = 0; + return 0; +} + +int Reduction_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + if (bottom_blob.empty()) + return -100; + + bool reduce_w, reduce_h, reduce_d, reduce_c; + resolve_reduce_flags(bottom_blob.dims, reduce_all, axes, reduce_w, reduce_h, reduce_d, reduce_c); + + int outdims, out_w, out_h, out_d, out_c; + int map_out_w, map_out_h, map_out_d, map_out_c; + resolve_output_shape_and_mapping(bottom_blob, reduce_w, reduce_h, reduce_d, reduce_c, keepdims, + outdims, out_w, out_h, out_d, out_c, + map_out_w, map_out_h, map_out_d, map_out_c); + + const size_t elemsize = bottom_blob.elemsize; + + if (outdims == 1) + top_blob.create(out_w, elemsize, opt.blob_vkallocator); + else if (outdims == 2) + top_blob.create(out_w, out_h, elemsize, opt.blob_vkallocator); + else if (outdims == 3) + top_blob.create(out_w, out_h, out_c, elemsize, opt.blob_vkallocator); + else + top_blob.create(out_w, out_h, out_d, out_c, elemsize, opt.blob_vkallocator); + + if (top_blob.empty()) + return -100; + + float coeff2 = coeff; + if (operation == ReductionOp_MEAN) + coeff2 = compute_coeff2_for_mean(bottom_blob, reduce_w, reduce_h, reduce_d, reduce_c, coeff); + + std::vector<VkMat> bindings(2); + bindings[0] = top_blob; + bindings[1] = bottom_blob; + + std::vector<vk_constant_type> constants(21); + constants[0].i = bottom_blob.w; + constants[1].i = bottom_blob.h; + constants[2].i = bottom_blob.d; + 
constants[3].i = bottom_blob.c; + constants[4].i = (int)bottom_blob.cstep; + constants[5].i = bottom_blob.dims; + + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.d; + constants[9].i = top_blob.c; + constants[10].i = top_blob.dims; + constants[11].i = (int)top_blob.cstep; + + constants[12].i = reduce_w ? 1 : 0; + constants[13].i = reduce_h ? 1 : 0; + constants[14].i = reduce_d ? 1 : 0; + constants[15].i = reduce_c ? 1 : 0; + + constants[16].i = map_out_w; + constants[17].i = map_out_h; + constants[18].i = map_out_d; + constants[19].i = map_out_c; + + constants[20].f = coeff2; + + VkMat dispatcher; + dispatcher.w = 256; + dispatcher.h = (int)top_blob.total(); + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_reduction, bindings, constants, dispatcher); + return 0; +} + +} // namespace ncnn diff --git a/src/layer/vulkan/reduction_vulkan.h b/src/layer/vulkan/reduction_vulkan.h new file mode 100644 index 000000000000..b570873efa5d --- /dev/null +++ b/src/layer/vulkan/reduction_vulkan.h @@ -0,0 +1,28 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef LAYER_REDUCTION_VULKAN_H +#define LAYER_REDUCTION_VULKAN_H + +#include "reduction.h" + +namespace ncnn { + +class Reduction_vulkan : public Reduction +{ +public: + Reduction_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + using Reduction::forward; + virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_reduction; +}; + +} // namespace ncnn + +#endif // LAYER_REDUCTION_VULKAN_H diff --git a/src/layer/vulkan/shader/reduction.comp b/src/layer/vulkan/shader/reduction.comp new file mode 100644 index 000000000000..d0f732220a5b --- /dev/null +++ b/src/layer/vulkan/shader/reduction.comp @@ -0,0 +1,255 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if 
ncnn_subgroup_arithmetic +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable +#endif + +layout(constant_id = 0) const int op = 0; + +layout(binding = 0) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout(binding = 1) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int in_w; + int in_h; + int in_d; + int in_c; + int in_cstep; + int in_dims; + + int out_w; + int out_h; + int out_d; + int out_c; + int out_dims; + int out_cstep; + + int reduce_w; + int reduce_h; + int reduce_d; + int reduce_c; + + int map_out_w; + int map_out_h; + int map_out_d; + int map_out_c; + + float coeff; +} p; + +shared lfp sdata[256]; + +const float FLT_MAX_VALUE = 3.402823466e+38; +const float FLT_MIN_VALUE = 1.175494351e-38; + +afp init_acc() +{ + if (op == 6) return afp(1.f); + if (op == 4) return afp(-FLT_MAX_VALUE); + if (op == 5) return afp(FLT_MAX_VALUE); + return afp(0.f); +} + +afp element_accum(afp acc, afp v) +{ + if (op == 0 || op == 3 || op == 9) return acc + v; + if (op == 1 || op == 7) return acc + abs(v); + if (op == 2 || op == 8) return acc + v * v; + if (op == 4) return max(acc, v); + if (op == 5) return min(acc, v); + if (op == 6) return acc * v; + if (op == 10) return acc + exp(v); + return acc + v; +} + +afp combine_accum(afp a, afp b) +{ + if (op == 4) return max(a, b); + if (op == 5) return min(a, b); + if (op == 6) return a * b; + return a + b; +} + +#if ncnn_subgroup_arithmetic +afp subgroup_reduce_acc(afp v) +{ + if (op == 4) return subgroupMax(v); + if (op == 5) return subgroupMin(v); + if (op == 6) return subgroupMul(v); + return subgroupAdd(v); +} +#endif + +void decode_out(int out_index, out int ow, out int oh, out int od, out int oc) +{ + ow = 0; + oh = 0; + od = 0; + oc = 0; + + if (p.out_dims == 1) + { + ow = out_index; + } + else if (p.out_dims == 2) + { + ow = out_index % p.out_w; + oh = out_index / p.out_w; + } + else if (p.out_dims == 3) + 
{ + ow = out_index % p.out_w; + int t = out_index / p.out_w; + oh = t % p.out_h; + oc = t / p.out_h; + } + else + { + ow = out_index % p.out_w; + int t0 = out_index / p.out_w; + oh = t0 % p.out_h; + int t1 = t0 / p.out_h; + od = t1 % p.out_d; + oc = t1 / p.out_d; + } +} + +int in_index(int iw, int ih, int id, int ic) +{ + if (p.in_dims == 1) return iw; + if (p.in_dims == 2) return ih * p.in_w + iw; + if (p.in_dims == 3) return ic * p.in_cstep + ih * p.in_w + iw; + + int in_dstep = p.in_w * p.in_h; + if (p.in_d > 1) in_dstep = p.in_cstep / p.in_d; + return ic * p.in_cstep + id * in_dstep + ih * p.in_w + iw; +} + +int out_offset(int ow, int oh, int od, int oc) +{ + if (p.out_dims == 1) return ow; + if (p.out_dims == 2) return oh * p.out_w + ow; + if (p.out_dims == 3) return oc * p.out_cstep + oh * p.out_w + ow; + + int out_dstep = p.out_w * p.out_h; + if (p.out_d > 1) out_dstep = p.out_cstep / p.out_d; + return oc * p.out_cstep + od * out_dstep + oh * p.out_w + ow; +} + +afp finalize_out(afp outv) +{ + if (op == 9 || op == 10) outv = log(outv); + if (op == 8) outv = sqrt(outv < afp(FLT_MIN_VALUE) ? afp(0.f) : outv); + if (p.coeff != 1.f) outv = outv * afp(p.coeff); + return outv; +} + +void main() +{ + const int out_index = int(gl_WorkGroupID.y); + const int tid = int(gl_LocalInvocationID.x); + + int ow, oh, od, oc; + decode_out(out_index, ow, oh, od, oc); + + int in_coord[4] = int[4](0, 0, 0, 0); + if (p.map_out_w >= 0) in_coord[p.map_out_w] = ow; + if (p.map_out_h >= 0) in_coord[p.map_out_h] = oh; + if (p.map_out_d >= 0) in_coord[p.map_out_d] = od; + if (p.map_out_c >= 0) in_coord[p.map_out_c] = oc; + + int in_w = p.in_w; + int in_h = (p.in_h > 0) ? p.in_h : 1; + int in_d = (p.in_d > 0) ? p.in_d : 1; + int in_c = (p.in_c > 0) ? 
p.in_c : 1; + + int reduce_size = 1; + if (p.reduce_w != 0) reduce_size *= in_w; + if (p.reduce_h != 0) reduce_size *= in_h; + if (p.in_dims == 4 && p.reduce_d != 0) reduce_size *= in_d; + if (p.in_dims >= 3 && p.reduce_c != 0) reduce_size *= in_c; + + afp acc = init_acc(); + + for (int t = tid; t < reduce_size; t += 256) + { + int tt = t; + + int iw = in_coord[3]; + int ih = in_coord[2]; + int id = in_coord[1]; + int ic = in_coord[0]; + + if (p.reduce_w != 0) + { + iw = tt % in_w; + tt /= in_w; + } + if (p.reduce_h != 0) + { + ih = tt % in_h; + tt /= in_h; + } + if (p.in_dims == 4 && p.reduce_d != 0) + { + id = tt % in_d; + tt /= in_d; + } + if (p.in_dims >= 3 && p.reduce_c != 0) { ic = tt % in_c; } + + afp v = buffer_ld1(bottom_blob_data, in_index(iw, ih, id, ic)); + acc = element_accum(acc, v); + } + +#if ncnn_subgroup_arithmetic + afp sg = subgroup_reduce_acc(acc); + + if (subgroupElect()) + { + sdata[int(gl_SubgroupID)] = sfp2lfp(sg); + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + const int num_sg = int(gl_NumSubgroups); + + afp v = lane < num_sg ? lfp2afp(sdata[lane]) : init_acc(); + afp outv = subgroup_reduce_acc(v); + + if (subgroupElect()) + { + outv = finalize_out(outv); + buffer_st1(top_blob_data, out_offset(ow, oh, od, oc), outv); + } + } +#else + sdata[tid] = sfp2lfp(acc); + barrier(); + + for (int stride = 128; stride > 0; stride >>= 1) + { + if (tid < stride) + { + afp a = lfp2afp(sdata[tid]); + afp b = lfp2afp(sdata[tid + stride]); + sdata[tid] = sfp2lfp(combine_accum(a, b)); + } + barrier(); + } + + if (tid == 0) + { + afp outv = lfp2afp(sdata[0]); + outv = finalize_out(outv); + buffer_st1(top_blob_data, out_offset(ow, oh, od, oc), outv); + } +#endif +}