From 58f0c22f0408b139bf74ee5f87973a451e6381b0 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Tue, 29 Oct 2024 11:07:54 +0100
Subject: [PATCH] ggml: add llama-att-softmax

This commit adds an example of how the attention mask is applied to the
QK attention scores in llama.cpp's attention softmax.
---
 fundamentals/ggml/src/llama-att-softmax.c | 78 +++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 fundamentals/ggml/src/llama-att-softmax.c

diff --git a/fundamentals/ggml/src/llama-att-softmax.c b/fundamentals/ggml/src/llama-att-softmax.c
new file mode 100644
index 0000000..a29b95a
--- /dev/null
+++ b/fundamentals/ggml/src/llama-att-softmax.c
@@ -0,0 +1,78 @@
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+/*
+ * This example is intended to give a feel/understanding for how the attention
+ * mask is applied to the QK attention scores in llama.cpp's attention softmax.
+ */
+int main(int argc, char **argv) {
+  printf("GGML llama attention softmax example\n");
+
+  struct ggml_init_params params = {
+    .mem_size = 16*1024*1024,
+    .mem_buffer = NULL,
+  };
+  struct ggml_context* ctx = ggml_init(params);
+
+  struct ggml_tensor* logits = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 32, 1, 1);
+  ggml_set_name(logits, "logits");
+
+  float tensor_data[32] = {
+     0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,
+    10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0,
+    20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0,
+    30.0, 31.0
+  };
+  memcpy((char *)logits->data, tensor_data, ggml_nbytes(logits));
+  for (int i = 0; i < ggml_nelements(logits); i++) {
+    float value = *(float *) ((char *) logits->data + i * logits->nb[0]);
+    printf("%.4f ", value);
+  }
+  printf("\n");
+
+  struct ggml_tensor* mask = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 32);
+  ggml_set_name(mask, "mask");
+  float mask_data[32] = {
+    -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, 0.0f, 0.0f, 0.0f, 0.0f,
+    0.0f, 0.0f, 0.0f, 0.0f, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY,
+    -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY,
+    -INFINITY, -INFINITY,
+  };
+  memcpy((char *)mask->data, mask_data, ggml_nbytes(mask));
+  for (int i = 0; i < ggml_nelements(mask); i++) {
+    float value = *(float *) ((char *) mask->data + i * mask->nb[0]);
+    printf("%.4f ", value);
+  }
+  printf("\n");
+
+  struct ggml_tensor* result = ggml_soft_max_ext(ctx, logits, mask, 1.0f, 0.0f);
+  ggml_set_name(result, "result");
+
+  struct ggml_cgraph* c_graph = ggml_new_graph(ctx);
+  ggml_build_forward_expand(c_graph, result);
+  int n_threads = 1;
+  enum ggml_status st = ggml_graph_compute_with_ctx(ctx, c_graph, n_threads);
+  if (st != GGML_STATUS_SUCCESS) {
+    printf("could not compute graph\n");
+    return 1;
+  }
+
+  printf("result tensor type: %s\n", ggml_type_name(result->type));
+  printf("result dim: %d\n", ggml_n_dims(result));
+  printf("result dim[0]: %ld\n", result->ne[0]);
+  float sum = 0.0f;
+  for (int i = 0; i < ggml_nelements(result); i++) {
+    float value = *(float *) ((char *) result->data + i * result->nb[0]);
+    printf("%.4f ", value);
+    sum += value;
+  }
+  printf("\nsum: %.4f\n", sum);
+
+  ggml_free(ctx);
+  return 0;
+}
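
Note (not part of the patch): as I understand it, ggml_soft_max_ext(ctx, logits, mask, 1.0f, 0.0f) computes a row-wise softmax over logits*scale + mask, so the -INFINITY mask entries come out as 0 in the result and only the unmasked positions share the probability mass. A minimal standalone sketch of that behaviour, under that assumption; the helper name ref_masked_softmax is made up for illustration:

#include <stdio.h>
#include <math.h>

/* Reference masked softmax: out[i] = softmax(logits[i]*scale + mask[i]).
 * Masked positions (mask[i] == -INFINITY) end up as exactly 0 because
 * expf(-INFINITY) == 0, and the remaining probabilities sum to 1. */
static void ref_masked_softmax(const float* logits, const float* mask,
                               float* out, int n, float scale) {
    float max_val = -INFINITY;
    for (int i = 0; i < n; i++) {
        out[i] = logits[i] * scale + mask[i];
        if (out[i] > max_val) max_val = out[i];
    }
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        out[i] = expf(out[i] - max_val);  /* subtract max for numerical stability */
        sum += out[i];
    }
    for (int i = 0; i < n; i++) {
        out[i] /= sum;
    }
}

int main(void) {
    const float logits[4] = {0.0f, 1.0f, 2.0f, 3.0f};
    const float mask[4]   = {-INFINITY, 0.0f, 0.0f, -INFINITY};
    float out[4];
    ref_masked_softmax(logits, mask, out, 4, 1.0f);
    for (int i = 0; i < 4; i++) {
        printf("%.4f ", out[i]);  /* prints 0.0000 0.2689 0.7311 0.0000 */
    }
    printf("\n");
    return 0;
}

Masking with -INFINITY before the softmax, rather than zeroing probabilities afterwards, keeps the unmasked entries normalized, which is why the example in the patch prints a sum of 1.0.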