-
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Daniel Bevenius <[email protected]>
- Loading branch information
Showing
1 changed file
with
132 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
#include <stdio.h> | ||
#include <math.h> | ||
|
||
#include "ggml.h" | ||
#include "ggml-alloc.h" | ||
#include "ggml-backend.h" | ||
|
||
double calculate_n_rot(double x, double base, int max_pos_emb, int n_dims) { | ||
const double pi = M_PI; | ||
// Calculate the exponent | ||
double exponent = (2.0 * max_pos_emb) / n_dims; | ||
// Calculate base raised to the power of the exponent | ||
double base_to_power = pow(base, exponent); | ||
// Calculate the final result | ||
double n_rot = 2 * pi * x * base_to_power; | ||
return n_rot; | ||
} | ||
|
||
// Minimal example of applying RoPE (rotary position embedding) to a tensor
// using the ggml library: build a query tensor, fill it with made-up values,
// run ggml_rope_ext through a compute graph, and print a few embeddings
// before and after the rotation.
int main(int argc, char **argv) {
    printf("GGML RoPE example\n");

    // 20 MB arena for all tensors and the compute graph; ggml allocates
    // everything out of this context.
    struct ggml_init_params params = {
        .mem_size = 20000000,
        .mem_buffer = NULL,
    };
    struct ggml_context* ctx = ggml_init(params);

    // Simulate a sequence of 6 tokens, each with an embedding size of 4096
    // (= 32 heads * 128 dims per head). Keep in mind that these tensors are
    // created to be used in the computation graph.
    int n_ctx_orig = 4096;  // also reused below as the rope n_ctx_orig argument
    int embd_dim = 128;     // per-head embedding dimension
    int n_head = 32;        // number of attention heads
    int n_tokens = 6;       // tokens in this simulated batch

    // The Query matrix: 6 tokens, each with a dimension of 4096.
    struct ggml_tensor* query = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ctx_orig, n_tokens);

    // We reshape the query matrix embedding dimensions to account for the number
    // of heads (32), each of which will have a dimension of 128 (128 * 32 = 4096).
    struct ggml_tensor* a = ggml_reshape_3d(ctx, query, embd_dim, n_head, n_tokens);
    ggml_set_name(a, "a");

    // These are the token positions (the b tensor parameter to ggml_rope_ext).
    struct ggml_tensor* pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
    ggml_set_name(pos, "pos");

    // Set some made-up values for the tensor to be rotated.
    // First loop over the number of tokens (6) (skipping an actual batch
    // loop here).
    for (int i = 0; i < a->ne[2]; i++) {
        // Loop over the embedding heads (32)
        for (int j = 0; j < a->ne[1]; j++) {
            // Loop over the embedding dimensions (128)
            for (int k = 0; k < a->ne[0]; k++) {
                // TODO: make the value a random value.
                //float value = 1.0f + k;
                float value = 8.3f + k;
                ggml_set_f32_nd(a, k, j, i, 0, value);
            }
        }
    }

    // Print a few of the first dimensions so we can see that there is a rotation
    // being performed. In this case we are printing the first 10 embedding
    // dimensions for the 5th token (index 4).
    for (int i = 0; i < 10; i++) {
        printf("embedding for token 4, embedding dim %d: %f\n", i, ggml_get_f32_nd(a, i, 0, 4, 0));
    }

    // Set the positions manually: token i sits at position i.
    for (int i = 0; i < pos->ne[0]; i++) {
        ggml_set_i32_1d(pos, i, i);
    }

    int mode = 0; // rope type 0 = Normal
    // The RoPE base frequency
    //           ↓
    //  (10000^(-2j/d).
    float freq_base = 10000.0f;
    // The RoPE frequency scale.
    float freq_scale = 1.0f;
    // TODO: What is this? It looks like this is mscale (magnitude scale)
    float attn_factor = 1.0f;
    // Extrapolation factor. If this is 0.0 then beta_fast and beta_slow
    // are not used.
    float ext_factor = 1.0f;
    // This is a YaRN parameter which I think is named α in the YaRN paper.
    float beta_fast = 32.0f;
    // This is a YaRN parameter which I think is named β in the YaRN paper.
    float beta_slow = 1.0f;
    // RoPE frequency factors are used with certain models like PHI.
    struct ggml_tensor* freq_factors = NULL;

    // Build the RoPE node: rotates `a` according to the positions in `pos`.
    struct ggml_tensor* s = ggml_rope_ext(ctx,
                                          a,
                                          pos,
                                          freq_factors,
                                          embd_dim,
                                          mode,
                                          n_ctx_orig,
                                          freq_base,
                                          freq_scale,
                                          ext_factor,
                                          attn_factor,
                                          beta_fast,
                                          beta_slow);

    struct ggml_cgraph* c_graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(c_graph, s);

    // Run the forward pass on the CPU.
    int n_threads = 4;
    enum ggml_status status = ggml_graph_compute_with_ctx(ctx, c_graph, n_threads);
    if (status != GGML_STATUS_SUCCESS) {
        printf("Error: %s\n", ggml_status_to_string(status));
        return 1;
    }

    // NOTE(review): `r` is computed but never used below — presumably left
    // over from an earlier version of the example; verify before removing.
    struct ggml_tensor* r = ggml_reshape_2d(ctx, s, n_ctx_orig, n_tokens);

    printf("embedding after rotation:\n");
    //printf("Rotation: %f\n", *(float *)((char *) s->data + 73728));

    // Same 10 dimensions of token index 4 as above, now rotated.
    for (int i = 0; i < 10; i++) {
        printf("embedding for token 4, embedding dim %d = %f\n", i, ggml_get_f32_nd(s, i, 0, 4, 0));
    }

    ggml_free(ctx);
    return 0;
}