Skip to content


src: stashing code examples
Browse files Browse the repository at this point in the history
Just saving these in case I want them later, but I'm not sure.
  • Loading branch information
danbev committed Aug 21, 2024
1 parent 5619c81 commit 2c58695
Show file tree
Hide file tree
Showing 2 changed files with 198 additions and 0 deletions.
31 changes: 31 additions & 0 deletions attention/src/
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.set_printoptions(sci_mode=False, precision=4)

gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")

gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")


def print_first_head_attention(prompt):
    """Run GPT-2 on `prompt` and print the first layer's first-head attention.

    Args:
        prompt: Text to tokenize and feed to the model. No special tokens are
            added, so the attention matrix has one row/column per prompt token.
    """
    inputs = gpt2_tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    attentions = gpt2(inputs.input_ids, output_attentions=True).attentions

    # get the attention scores computed by the first layer
    # for the first input sequence in the batch
    first_layer_attentions = attentions[0][0]

    # print attention scores from the first head
    print("GPT2 Attention Scores (Head 1):")
    print(first_layer_attentions[0])


# Inference only: no_grad avoids building the autograd graph.
with torch.no_grad():
    # The second prompt extends the first by one token, so the two printed
    # matrices can be compared side by side.
    print_first_head_attention("Dan loves icecream")
    print_first_head_attention("Dan loves icecream but")
167 changes: 167 additions & 0 deletions fundamentals/ggml/src/simple-backend.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#include <cstdio>
#include <vector>

// Bundles everything the example needs to keep alive between loading the
// input matrices and running the computation.
struct simple_model {
    // input tensors of the matrix multiplication
    struct ggml_tensor* a = nullptr;
    struct ggml_tensor* b = nullptr;

    // the backend to perform the computation (CPU)
    ggml_backend_t backend = nullptr;

    // the backend buffer to store the tensors data of a and b
    ggml_backend_buffer_t buffer = nullptr;

    // the context to define the tensor information (dimensions, size, memory address)
    struct ggml_context* ctx = nullptr;
};

// Initializes the CPU backend, creates the two input tensors and copies the
// caller's matrix data into backend memory.
//
// model          receives the backend, context, buffer and tensors
// a, b           host arrays holding rows_a*cols_a and rows_b*cols_b floats
// rows_*, cols_* dimensions of the two matrices
void load_model(simple_model& model,
                float* a,
                float* b,
                int rows_a,
                int cols_a,
                int rows_b,
                int cols_b) {
    model.backend = ggml_backend_cpu_init();

    int num_tensors = 2;

    // no_alloc = true: the context only holds tensor metadata, so it needs
    // just the per-tensor overhead; the data lives in the backend buffer.
    struct ggml_init_params params {
        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    };

    // create context
    model.ctx = ggml_init(params);

    // create tensors (the column count is passed first, then the row count)
    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_a, rows_a);
    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_b, rows_b);

    // create a backend buffer (backend memory) and alloc the tensors from the context
    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);

    // load data from cpu memory to backend buffer
    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
    ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
}

// build the compute graph to perform a matrix multiplication
struct ggml_cgraph* build_graph(const simple_model& model) {
static size_t buf_size = ggml_tensor_overhead() *GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);

struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/,
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()

// create a temporary context to build the graph.
struct ggml_context* ctx = ggml_init(params);

struct ggml_cgraph* gf = ggml_new_graph(ctx);

// result = a*b^T (notice the implicit transpose of b)
struct ggml_tensor* result = ggml_mul_mat(ctx, model.a, model.b);

// build operations nodes
ggml_build_forward_expand(gf, result);

// delete the temporary context used to build the graph

return gf;

// compute with backend
//
// Builds the graph for `model`, lets `allocr` allocate the graph's tensors,
// runs the graph on model.backend and returns the output tensor.
struct ggml_tensor* compute(const simple_model& model, ggml_gallocr_t allocr) {
    struct ggml_cgraph* gf = build_graph(model);

    // allocate tensors
    ggml_gallocr_alloc_graph(allocr, gf);

    int n_threads = 1; // number of threads to perform some operations with multi-threading

    if (ggml_backend_is_cpu(model.backend)) {
        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
    }

    ggml_backend_graph_compute(model.backend, gf);

    // in this case, the output tensor is the last one in the graph
    return gf->nodes[gf->n_nodes - 1];
}

int main(void) {

const int rows_a = 2, cols_a = 2;

float matrix_A[rows_a * cols_a] = {
1, 2,
3, 4

const int rows_b = 3, cols_b = 2;
float matrix_B[rows_b * cols_b] = {
5, 8,
6, 9,
7, 10

simple_model model;
load_model(model, matrix_A, matrix_B, rows_a, cols_a, rows_b, cols_b);

// calculate the temporaly memory required to compute
ggml_gallocr_t allocr = nullptr;
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));

// create the worst case graph for memory usage estimation
struct ggml_cgraph * gf = build_graph(model);
ggml_gallocr_reserve(allocr, gf);
size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);

fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);

// perform computation
struct ggml_tensor * result = compute(model, allocr);

// create a array to print result
std::vector<float> out_data(ggml_nelements(result));

// bring the data from the backend memory
ggml_backend_tensor_get(result,, 0, ggml_nbytes(result));

printf("mul mat (%d x %d) :\n[", (int) result->ne[0], (int) result->ne[1]);
for (int j = 0; j < result->ne[0] /* rows */; j++) {
if (j > 0) {

for (int i = 0; i < result->ne[1] /* cols */; i++) {
printf(" %.2f", out_data[i * result->ne[0] + j]);
printf(" ]\n");

// release backend memory used for computation

// free memory

// release backend memory and free backend
return 0;

0 comments on commit 2c58695

Please sign in to comment.