llama : arch (cont)
ggml-ci
ggerganov committed Dec 22, 2024
1 parent 7ab08d5 commit 9ba2959
Showing 7 changed files with 1,513 additions and 1,479 deletions.
7 changes: 1 addition & 6 deletions src/llama-adapter.h
@@ -68,12 +68,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
-            [&](ggml_context * ctx) {
-                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-                return ggml_add(ctx, cur, layer_dir);
-            });
+        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
1,392 changes: 1,392 additions & 0 deletions src/llama-arch.cpp

Large diffs are not rendered by default.

1,323 changes: 58 additions & 1,265 deletions src/llama-arch.h

Large diffs are not rendered by default.
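Neither llama-arch diff is rendered here, but the counts make the direction of the change clear: roughly 1,300 lines leave llama-arch.h and a similar amount lands in the new llama-arch.cpp, i.e. the architecture name tables presumably move out of the header and callers go through small accessors instead. A minimal sketch of the kind of declaration the header would keep, inferred from the llm_arch_name() call sites in llama-context.cpp below; the exact return type is an assumption:

    // llama-arch.h (sketch, assumed): the name maps now live in llama-arch.cpp,
    // and callers use an accessor rather than the map itself.
    const char * llm_arch_name(llm_arch arch);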

5 changes: 3 additions & 2 deletions src/llama-context.cpp
@@ -40,7 +40,7 @@ struct llama_data_write {
     }
 
     void write_model_info(const struct llama_context * ctx) {
-        std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+        const std::string arch_str = llm_arch_name(ctx->model.arch);
         write_string(arch_str);
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
@@ -263,7 +263,8 @@ struct llama_data_read {
 
     // validate model information
     void read_model_info(const struct llama_context * ctx) {
-        std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+        const std::string cur_arch_str = llm_arch_name(ctx->model.arch);
+
         std::string arch_str;
         read_string(arch_str);
         if (cur_arch_str != arch_str) {
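Both hunks above swap the direct LLM_ARCH_NAMES.at(...) map lookup for the new llm_arch_name() accessor that this commit introduces. Since the llama-arch.cpp diff is not rendered on this page, the following is only a minimal sketch of what such an accessor could look like, assuming LLM_ARCH_NAMES remains a map from llm_arch values to C strings; the exact signature and the fallback value are assumptions:

    // Sketch (assumed implementation), not copied from the unrendered diff:
    // look up the architecture name, falling back to "unknown" when missing.
    const char * llm_arch_name(llm_arch arch) {
        auto it = LLM_ARCH_NAMES.find(arch);
        if (it == LLM_ARCH_NAMES.end()) {
            return "unknown";
        }
        return it->second;
    }

Either a const char * or a std::string return type keeps the call sites above compiling, since the result is immediately copied into a std::string.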
48 changes: 48 additions & 0 deletions src/llama-model.cpp
@@ -1,5 +1,7 @@
#include "llama-model.h"

#include "llama-impl.h"

std::string llama_model_ftype_name(llama_ftype ftype) {
if (ftype & LLAMA_FTYPE_GUESSED) {
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
@@ -42,3 +44,49 @@ std::string llama_model_ftype_name(llama_ftype ftype) {
         default: return "unknown, may not work";
     }
 }
+
+template<typename F>
+static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+    ggml_init_params params = {
+        /*.mem_size =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc =*/ true,
+    };
+    ggml_context_ptr ctx { ggml_init(params) };
+    if (!ctx) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
+
+    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+    ggml_tensor * op_tensor = fn(ctx.get());
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op_tensor->src[i] != nullptr) {
+            assert(op_tensor->src[i]->buffer == nullptr);
+            op_tensor->src[i]->buffer = buf.get();
+        }
+    }
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+    return op_supported;
+}
+
+template<typename F>
+static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (buft_supported(cur_buft, cur_dev, fn)) {
+            return cur_buft;
+        }
+    }
+    throw std::runtime_error(format("no suitable buffer type found"));
+}
+
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
+    return select_buft(*model.dev_layer.at(il).buft_list,
+        [&](ggml_context * ctx) {
+            ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+            ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+            return ggml_add(ctx, cur, layer_dir);
+        });
+}
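The helpers moved into llama-model.cpp probe backend support by building a throwaway op in a no_alloc ggml context, pointing the op's sources at a zero-size buffer of the candidate type, and asking the device via ggml_backend_dev_supports_op(); select_buft() then returns the first device/buffer-type pair in the list whose device accepts the op. Since select_buft() is now file-static, any new probe op has to live in this translation unit. The sketch below is a hypothetical example, not part of this commit, that reuses the same machinery to pick a buffer type able to run an F32 mat-mul for layer il (the helper name is made up):

    // Hypothetical sketch, not in the commit: probe for F32 mat-mul support
    // on layer il using the same select_buft() machinery as above.
    static ggml_backend_buffer_type_t select_buft_for_matmul(const llama_model & model, int il) {
        return select_buft(*model.dev_layer.at(il).buft_list,
            [&](ggml_context * ctx) {
                ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, model.hparams.n_embd, model.hparams.n_embd);
                ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
                return ggml_mul_mat(ctx, w, x); // probe op: W*x
            });
    }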
40 changes: 1 addition & 39 deletions src/llama-model.h
@@ -5,8 +5,6 @@
 #include "llama-vocab.h"
 #include "llama-mmap.h"
 
-#include "llama-impl.h"
-
 #include "ggml-cpp.h"
 
 #include <array>
@@ -613,42 +611,6 @@ struct llama_model {
     }
 };
 
-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
-    ggml_init_params params = {
-        /*.mem_size =*/ ggml_tensor_overhead()*8,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc =*/ true,
-    };
-    ggml_context_ptr ctx { ggml_init(params) };
-    if (!ctx) {
-        throw std::runtime_error(format("failed to create ggml context"));
-    }
-
-    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
-    ggml_tensor * op_tensor = fn(ctx.get());
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op_tensor->src[i] != nullptr) {
-            assert(op_tensor->src[i]->buffer == nullptr);
-            op_tensor->src[i]->buffer = buf.get();
-        }
-    }
-    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
-    return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
-    for (const auto & cur : buft_list) {
-        ggml_backend_dev_t cur_dev = cur.first;
-        ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (buft_supported(cur_buft, cur_dev, fn)) {
-            return cur_buft;
-        }
-    }
-    throw std::runtime_error(format("no suitable buffer type found"));
-}
-
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
 
 std::string llama_model_ftype_name(llama_ftype ftype);
