wip

ggerganov · ggerganov · commit 4d75c05e09b3 · 2025-11-18T18:06:16.000+02:00
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
@@ -7888,7 +7888,7 @@ static void ggml_compute_forward_top_k_f32(
 
     const int64_t nr = ggml_nrows(src0);
 
-    const int k = ggml_get_op_params_i32(dst, 0);
+    const int top_k = ne0;
 
     int32_t * tmp = (int32_t *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
@@ -7899,11 +7899,11 @@ static void ggml_compute_forward_top_k_f32(
             tmp[j] = j;
         }
 
-        std::partial_sort(tmp, tmp + k, tmp + ne00, cmp_top_k{src_data});
+        std::partial_sort(tmp, tmp + top_k, tmp + ne00, cmp_top_k{src_data});
 
         int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
 
-        std::copy(tmp, tmp + k, dst_data);
+        std::copy(tmp, tmp + top_k, dst_data);
     }
 }
 
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -905,6 +905,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
         case GGML_OP_LEAKY_RELU:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_ARGSORT:
+        case GGML_OP_TOP_K:
         case GGML_OP_ARANGE:
             return true;
         case GGML_OP_FLASH_ATTN_EXT:
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -840,6 +840,7 @@ typedef struct {
     uint64_t nb01;
     uint64_t nb02;
     uint64_t nb03;
+    int32_t  top_k;
 } ggml_metal_kargs_argsort;
 
 typedef struct {
@@ -851,6 +852,7 @@ typedef struct {
     uint64_t nb01;
     uint64_t nb02;
     uint64_t nb03;
+    int32_t  top_k;
     int32_t  len;
 } ggml_metal_kargs_argsort_merge;
 
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -405,6 +405,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = ggml_metal_op_argsort(ctx, idx);
             } break;
+        case GGML_OP_TOP_K:
+            {
+                n_fuse = ggml_metal_op_top_k(ctx, idx);
+            } break;
         case GGML_OP_LEAKY_RELU:
             {
                 n_fuse = ggml_metal_op_leaky_relu(ctx, idx);
@@ -3677,14 +3681,15 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
     }
 
     ggml_metal_kargs_argsort args = {
-        /*.ne00 =*/ ne00,
-        /*.ne01 =*/ ne01,
-        /*.ne02 =*/ ne02,
-        /*.ne03 =*/ ne03,
-        /*.nb00 =*/ nb00,
-        /*.nb01 =*/ nb01,
-        /*.nb02 =*/ nb02,
-        /*.nb03 =*/ nb03,
+        /*.ne00  =*/ ne00,
+        /*.ne01  =*/ ne01,
+        /*.ne02  =*/ ne02,
+        /*.ne03  =*/ ne03,
+        /*.nb00  =*/ nb00,
+        /*.nb01  =*/ nb01,
+        /*.nb02  =*/ nb02,
+        /*.nb03  =*/ nb03,
+        /*.top_k =*/ ne00,
     };
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
@@ -3704,15 +3709,117 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
         ggml_metal_op_concurrency_reset(ctx);
 
         ggml_metal_kargs_argsort_merge args_merge = {
-            .ne00 = ne00,
-            .ne01 = ne01,
-            .ne02 = ne02,
-            .ne03 = ne03,
-            .nb00 = nb00,
-            .nb01 = nb01,
-            .nb02 = nb02,
-            .nb03 = nb03,
-            .len  = len,
+            /*.ne00  =*/ ne00,
+            /*.ne01  =*/ ne01,
+            /*.ne02  =*/ ne02,
+            /*.ne03  =*/ ne03,
+            /*.nb00  =*/ nb00,
+            /*.nb01  =*/ nb01,
+            /*.nb02  =*/ nb02,
+            /*.nb03  =*/ nb03,
+            /*.top_k =*/ ne00,
+            /*.len   =*/ len,
+        };
+
+        // merges per row
+        const int nm = (ne00 + 2*len - 1) / (2*len);
+
+        const int nth = std::min(512, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_merge));
+
+        ggml_metal_encoder_set_pipeline(enc, pipeline_merge);
+        ggml_metal_encoder_set_bytes   (enc, &args_merge, sizeof(args_merge), 0);
+        ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+        ggml_metal_encoder_set_buffer  (enc, bid_dst,  2);
+        ggml_metal_encoder_set_buffer  (enc, bid_tmp,  3);
+
+        ggml_metal_encoder_dispatch_threadgroups(enc, nm*ne01, ne02, ne03, nth, 1, 1);
+
+        std::swap(bid_dst, bid_tmp);
+
+        len <<= 1;
+    }
+
+    return 1;
+}
+
+int ggml_metal_op_top_k(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
+
+    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+
+    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argsort(lib, op);
+
+    // bitonic sort requires the number of elements to be power of 2
+    int nth = 1;
+    while (nth < ne00 && 2*nth <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+        nth *= 2;
+    }
+
+    const int npr = (ne00 + nth - 1)/nth;
+
+    // Metal kernels require the buffer size to be multiple of 16 bytes
+    // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
+    const size_t smem = GGML_PAD(nth*sizeof(int32_t), 16);
+
+    ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
+    ggml_metal_buffer_id bid_dst  = ggml_metal_get_buffer_id(op);
+
+    ggml_metal_buffer_id bid_tmp = bid_dst;
+    bid_tmp.offs += sizeof(int32_t)*ggml_nelements(op->src[0]);
+
+    if ((int) ceil(std::log(npr) / std::log(2)) % 2 == 1) {
+        std::swap(bid_dst, bid_tmp);
+    }
+
+    const int top_k = ne0;
+
+    ggml_metal_kargs_argsort args = {
+        /*.ne00  =*/ ne00,
+        /*.ne01  =*/ ne01,
+        /*.ne02  =*/ ne02,
+        /*.ne03  =*/ ne03,
+        /*.nb00  =*/ nb00,
+        /*.nb01  =*/ nb01,
+        /*.nb02  =*/ nb02,
+        /*.nb03  =*/ nb03,
+        /*.top_k =*/ nth < ne00 ? ne00 : top_k,
+    };
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+    ggml_metal_encoder_set_buffer  (enc, bid_dst,  2);
+
+    ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1);
+
+    ggml_metal_pipeline_t pipeline_merge = ggml_metal_library_get_pipeline_argsort_merge(lib, op);
+
+    int len = nth;
+
+    while (len < ne00) {
+        ggml_metal_op_concurrency_reset(ctx);
+
+        ggml_metal_kargs_argsort_merge args_merge = {
+            /*.ne00  =*/ ne00,
+            /*.ne01  =*/ ne01,
+            /*.ne02  =*/ ne02,
+            /*.ne03  =*/ ne03,
+            /*.nb00  =*/ nb00,
+            /*.nb01  =*/ nb01,
+            /*.nb02  =*/ nb02,
+            /*.nb03  =*/ nb03,
+            /*.top_k =*/ 2*len >= ne00 ? top_k : ne00,
+            /*.len   =*/ len,
         };
 
         // merges per row
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h
@@ -81,6 +81,7 @@ int ggml_metal_op_arange            (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_argmax            (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_argsort           (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_top_k             (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_leaky_relu        (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_opt_step_adamw    (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_opt_step_sgd      (ggml_metal_op_t ctx, int idx);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -202,6 +202,10 @@ static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_
             {
                 res *= 2;
             } break;
+        case GGML_OP_TOP_K:
+            {
+                res = 2*sizeof(int32_t)*ggml_nelements(tensor->src[0]);
+            } break;
         default:
             break;
     }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4711,8 +4711,8 @@ kernel void kernel_argsort_f32_i32(
     }
 
     // copy the result to dst without the padding
-    if (i00 + col < args.ne00) {
-        dst += i00 + args.ne00*i01 + args.ne00*args.ne01*i02 + args.ne00*args.ne01*args.ne02*i03;
+    if (i00 + col < args.ne00 && col < args.top_k) {
+        dst += i00 + args.top_k*i01 + args.top_k*args.ne01*i02 + args.top_k*args.ne01*args.ne02*i03;
 
         dst[col] = shmem_i32[col];
     }
@@ -4760,9 +4760,9 @@ kernel void kernel_argsort_merge_f32_i32(
     device const int32_t * tmp1 = tmp0 + args.len;
 
     dst += start
-        + i01*args.ne00
-        + i02*args.ne00*args.ne01
-        + i03*args.ne00*args.ne01*args.ne02;
+        + i01*args.top_k
+        + i02*args.top_k*args.ne01
+        + i03*args.top_k*args.ne01*args.ne02;
 
     device const float * src0_row = (device const float *)(src0
         + args.nb01*i01
@@ -4827,7 +4827,7 @@ kernel void kernel_argsort_merge_f32_i32(
         val1 = src0_row[idx1];
     }
 
-    for (int k = k0; k < k1; ++k) {
+    for (int k = k0; k < k1 && k < args.top_k; ++k) {
         int32_t out_idx;
 
         if (i >= len0) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -5158,7 +5158,8 @@ struct ggml_tensor * ggml_top_k(
 
     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_I32, k, a->ne[1], a->ne[2], a->ne[3]);
 
-    ggml_set_op_params_i32(result, 0, (int32_t) k);
+    // TODO: tmp
+    ggml_set_op_params_i32(result, 0, (int32_t) GGML_SORT_ORDER_DESC);
 
     result->op     = GGML_OP_TOP_K;
     result->src[0] = a;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -4933,7 +4933,49 @@ struct test_argsort : public test_case {
     }
 };
 
-struct test_topk_moe: public test_case {
+// GGML_OP_TOP_K: test case whose graph is a single ggml_top_k(a, k) node over a tensor of the given type and shape
+struct test_top_k : public test_case {
+    const ggml_type type;  // element type of the input tensor "a"
+    const std::array<int64_t, 4> ne;  // 4-D shape of the input tensor "a"
+    const int k;  // k argument forwarded to ggml_top_k
+
+    std::string vars() override {  // string form of this case's parameters (type, ne, k)
+        return VARS_TO_STR3(type, ne, k);
+    }
+
+    test_top_k(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {16, 10, 10, 10},
+            int k = 4)
+        : type(type), ne(ne), k(k) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {  // creates input "a", applies ggml_top_k, returns the result named "out"
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_top_k(ctx, a, k);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {  // fills every tensor in ctx row by row with a shuffled 0..ne[0]-1 sequence
+        std::random_device rd;  // NOTE(review): seed comes from std::random_device, so the shuffles are presumably not reproducible across runs
+        std::default_random_engine rng(rd());
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            // initialize with unique values to avoid ties
+            for (int64_t r = 0; r < ggml_nrows(t); r++) {
+                std::vector<float> data(t->ne[0]);  // one row's worth of values, always stored as float
+                for (int i = 0; i < t->ne[0]; i++) {
+                    data[i] = i;  // distinct values 0..ne[0]-1
+                }
+                std::shuffle(data.begin(), data.end(), rng);
+                ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));  // writes ne[0] floats at byte offset r*nb[1]; NOTE(review): float data is written regardless of `type` - confirm only F32 is intended
+            }
+        }
+    }
+};
+
+struct test_topk_moe : public test_case {
     const std::array<int64_t, 4> ne;
     const int n_expert_used;
     const bool with_norm;
@@ -7514,6 +7556,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2, 8, 8192, 1}, order)); // bailingmoe2 (group selection)
     }
 
+    for (int k : {1, 2, 3, 7, 15}) {
+        test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {16, 10, 10, 10}, k));
+        test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {60, 10, 10, 10}, k));
+        test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {1023, 2, 1, 3}, k));
+        test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {1024, 2, 1, 3}, k));
+        test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {1025, 2, 1, 3}, k));
+        test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {16384, 1, 1, 1}, k));
+        test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {2047, 2, 1, 3}, k));
+        test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {2048, 2, 1, 3}, k));
+        test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {2049, 2, 1, 3}, k));
+    }
+
     for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC}) {
         test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode));
         test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true));
@@ -7886,6 +7940,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     }
 
     test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {65000, 16, 1, 1}));
+    test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {65000, 16, 1, 1}, 40));
 
     return test_cases;
 }

Original file line number	Diff line number	Diff line change
`@@ -7888,7 +7888,7 @@ static void ggml_compute_forward_top_k_f32(`
`7888`	`7888`
`7889`	`7889`	`const int64_t nr = ggml_nrows(src0);`
`7890`	`7890`
`7891`		`- const int k = ggml_get_op_params_i32(dst, 0);`
	`7891`	`+ const int top_k = ne0;`
`7892`	`7892`
`7893`	`7893`	`int32_t * tmp = (int32_t *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;`
`7894`	`7894`
`@@ -7899,11 +7899,11 @@ static void ggml_compute_forward_top_k_f32(`
`7899`	`7899`	`tmp[j] = j;`
`7900`	`7900`	`}`
`7901`	`7901`
`7902`		`- std::partial_sort(tmp, tmp + k, tmp + ne00, cmp_top_k{src_data});`
	`7902`	`+ std::partial_sort(tmp, tmp + top_k, tmp + ne00, cmp_top_k{src_data});`
`7903`	`7903`
`7904`	`7904`	`int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);`
`7905`	`7905`
`7906`		`- std::copy(tmp, tmp + k, dst_data);`
	`7906`	`+ std::copy(tmp, tmp + top_k, dst_data);`
`7907`	`7907`	`}`
`7908`	`7908`	`}`
`7909`	`7909`
Original file line number	Diff line number	Diff line change
`@@ -202,6 +202,10 @@ static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_`
`202`	`202`	`{`
`203`	`203`	`res *= 2;`
`204`	`204`	`} break;`
	`205`	`+ case GGML_OP_TOP_K:`
	`206`	`+ {`
	`207`	`+ res = 2*sizeof(int32_t)*ggml_nelements(tensor->src[0]);`
	`208`	`+ } break;`
`205`	`209`	`default:`
`206`	`210`	`break;`
`207`	`211`	`}`