src: add initial kv-cache exploration code
Signed-off-by: Daniel Bevenius <[email protected]>
Showing 2 changed files with 50 additions and 0 deletions.
@@ -0,0 +1,47 @@
#include "llama.h" | ||
|
||
#include <cstdio> | ||
#include <string> | ||
#include <cstdlib> | ||
#include <vector> | ||
|
||
int main(int argc, char** argv) { | ||
fprintf(stdout, "llama.cpp KV-Cache exploration\n"); | ||
llama_model_params model_params = llama_model_default_params(); | ||
|
||
model_params.main_gpu = 0; | ||
model_params.n_gpu_layers = 0; | ||
std::string model_path = "models/llama-2-13b-chat.Q4_0.gguf"; | ||
fprintf(stdout, "llama.cpp example using model: %s\n", model_path.c_str()); | ||
|
||
llama_backend_init(); | ||
|
||
llama_model* model = llama_load_model_from_file(model_path.c_str(), model_params); | ||
if (model == NULL) { | ||
fprintf(stderr , "%s: error: failed to to load model %s\n" , __func__, model_path.c_str()); | ||
return 1; | ||
} | ||
|
||
llama_context_params ctx_params = llama_context_default_params(); | ||
ctx_params.seed = 1234; | ||
ctx_params.n_ctx = 1024; | ||
ctx_params.n_threads = 4; | ||
ctx_params.n_threads_batch = 4; | ||
ctx_params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; | ||
|
||
llama_context * ctx = llama_new_context_with_model(model, ctx_params); | ||
if (ctx == NULL) { | ||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); | ||
return 1; | ||
} | ||
|
||
struct llama_kv_cache_view kv_view = llama_kv_cache_view_init(ctx, 1); | ||
printf("kv_view n_cells: %d\n", kv_view.n_cells); | ||
printf("kv_view n_max_seq: %d\n", kv_view.n_seq_max); | ||
|
||
llama_free(ctx); | ||
llama_free_model(model); | ||
llama_backend_free(); | ||
|
||
return 0; | ||
} |
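
The view returned by llama_kv_cache_view_init is only a container; its occupancy counters are not populated until it is refreshed against the live cache. A minimal sketch of a possible next step (not part of this commit), assuming the llama_kv_cache_view_update and llama_kv_cache_view_free functions and the token_count/used_cells fields declared next to llama_kv_cache_view in this vintage of llama.h:

    // Sketch (not in this commit): refresh the view after tokens have been
    // decoded, print occupancy counters, then release the view's buffers.
    llama_kv_cache_view_update(ctx, &kv_view);                 // re-read current cache state
    printf("kv_view token_count: %d\n", kv_view.token_count);  // tokens currently stored
    printf("kv_view used_cells: %d\n", kv_view.used_cells);    // occupied cache cells
    llama_kv_cache_view_free(&kv_view);                        // frees cells/cells_sequences

Until llama_decode has placed tokens in the cache, both counters would be expected to print 0.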