llama : arch (cont)
ggml-ci
ggerganov committed Dec 22, 2024
1 parent 7ab08d5 commit e1fc07a
Showing 7 changed files with 1,349 additions and 1,325 deletions.
7 changes: 1 addition & 6 deletions src/llama-adapter.h
@@ -68,12 +68,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
-            [&](ggml_context * ctx) {
-                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
-                return ggml_add(ctx, cur, layer_dir);
-            });
+        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);

1,247 changes: 1,247 additions & 0 deletions src/llama-arch.cpp

Large diffs are not rendered by default.

1,309 changes: 43 additions & 1,266 deletions src/llama-arch.h

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions src/llama-context.cpp
@@ -40,7 +40,7 @@ struct llama_data_write {
     }
 
     void write_model_info(const struct llama_context * ctx) {
-        std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+        const std::string arch_str = llm_arch_name(ctx->model.arch);
         write_string(arch_str);
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
@@ -263,7 +263,8 @@ struct llama_data_read {
 
     // validate model information
     void read_model_info(const struct llama_context * ctx) {
-        std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+        const std::string cur_arch_str = llm_arch_name(ctx->model.arch);
+
         std::string arch_str;
         read_string(arch_str);
         if (cur_arch_str != arch_str) {

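Note: llm_arch_name() replaces the inline LLM_ARCH_NAMES.at(...) lookups above. Its definition is not shown on this page (it presumably lives in the new src/llama-arch.cpp, whose large diff is not rendered). A minimal sketch of what such a helper could look like, assuming it simply wraps the existing LLM_ARCH_NAMES map and returns a placeholder instead of throwing for unknown architectures:

    // Hypothetical sketch, not the actual commit contents: the same map lookup
    // the call sites above used to do inline, with an "unknown" fallback.
    const char * llm_arch_name(llm_arch arch) {
        auto it = LLM_ARCH_NAMES.find(arch);
        if (it == LLM_ARCH_NAMES.end()) {
            return "unknown";
        }
        return it->second;
    }

Either way, the call sites only rely on it producing a printable architecture name for ctx->model.arch.
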
48 changes: 48 additions & 0 deletions src/llama-model.cpp
@@ -1,5 +1,7 @@
#include "llama-model.h"

#include "llama-impl.h"

std::string llama_model_ftype_name(llama_ftype ftype) {
if (ftype & LLAMA_FTYPE_GUESSED) {
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
@@ -42,3 +44,49 @@ std::string llama_model_ftype_name(llama_ftype ftype) {
         default: return "unknown, may not work";
     }
 }
+
+template<typename F>
+static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+    ggml_init_params params = {
+        /*.mem_size =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc =*/ true,
+    };
+    ggml_context_ptr ctx { ggml_init(params) };
+    if (!ctx) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
+
+    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+    ggml_tensor * op_tensor = fn(ctx.get());
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op_tensor->src[i] != nullptr) {
+            assert(op_tensor->src[i]->buffer == nullptr);
+            op_tensor->src[i]->buffer = buf.get();
+        }
+    }
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+    return op_supported;
+}
+
+template<typename F>
+static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (buft_supported(cur_buft, cur_dev, fn)) {
+            return cur_buft;
+        }
+    }
+    throw std::runtime_error(format("no suitable buffer type found"));
+}
+
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
+    return select_buft(*model.dev_layer.at(il).buft_list,
+        [&](ggml_context * ctx) {
+            ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+            ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+            return ggml_add(ctx, cur, layer_dir);
+        });
+}
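
The new llama_model_select_buft() walks layer il's device/buffer-type list and returns the first buffer type whose backend supports a probe op built by the lambda (here an F32 add of two n_embd vectors, matching the control-vector update it serves in llama-adapter.h). For illustration only, a hypothetical caller inside llama-model.cpp could probe with a different op by passing another lambda to the file-local select_buft(), e.g. a matmul of the embedding size:

    // Hypothetical sketch, not part of this commit: select a buffer type for
    // layer il that can run an n_embd x n_embd matmul.
    ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
        [&](ggml_context * ctx) {
            ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, model.hparams.n_embd, model.hparams.n_embd);
            ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
            return ggml_mul_mat(ctx, w, x);
        });
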
40 changes: 1 addition & 39 deletions src/llama-model.h
@@ -5,8 +5,6 @@
 #include "llama-vocab.h"
 #include "llama-mmap.h"
 
-#include "llama-impl.h"
-
 #include "ggml-cpp.h"
 
 #include <array>
@@ -613,42 +611,6 @@ struct llama_model {
     }
 };
 
-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
-    ggml_init_params params = {
-        /*.mem_size =*/ ggml_tensor_overhead()*8,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc =*/ true,
-    };
-    ggml_context_ptr ctx { ggml_init(params) };
-    if (!ctx) {
-        throw std::runtime_error(format("failed to create ggml context"));
-    }
-
-    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
-    ggml_tensor * op_tensor = fn(ctx.get());
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op_tensor->src[i] != nullptr) {
-            assert(op_tensor->src[i]->buffer == nullptr);
-            op_tensor->src[i]->buffer = buf.get();
-        }
-    }
-    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
-    return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
-    for (const auto & cur : buft_list) {
-        ggml_backend_dev_t cur_dev = cur.first;
-        ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (buft_supported(cur_buft, cur_dev, fn)) {
-            return cur_buft;
-        }
-    }
-    throw std::runtime_error(format("no suitable buffer type found"));
-}
-
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
 
 std::string llama_model_ftype_name(llama_ftype ftype);

18 changes: 6 additions & 12 deletions src/llama.cpp
@@ -2962,7 +2962,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
     // hparams
     LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, llm_arch_name(model.arch));
     LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
     LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
@@ -17042,9 +17042,12 @@ int32_t llama_detokenize(
 //
 
 static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
-    if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
-        return LLM_CHAT_TEMPLATES.at(tmpl);
+    try {
+        return llm_chat_template_from_str(tmpl);
+    } catch (const std::out_of_range &) {
+        // ignore
     }
+
     auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
         return tmpl.find(haystack) != std::string::npos;
     };
@@ -17535,15 +17538,6 @@ int32_t llama_chat_apply_template(
     return res;
 }
 
-int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
-    auto it = LLM_CHAT_TEMPLATES.begin();
-    for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
-        output[i] = it->first.c_str();
-        std::advance(it, 1);
-    }
-    return (int32_t) LLM_CHAT_TEMPLATES.size();
-}
-
 //
 // sampling
 //

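Note on the chat-template change: the new try/catch in llama_chat_detect_template() relies on llm_chat_template_from_str() throwing std::out_of_range for unknown template names, consistent with the map::at() lookup it replaces; on a miss the function falls back to the substring-based detection via tmpl_contains. A minimal sketch of that contract, assuming the helper simply wraps the LLM_CHAT_TEMPLATES map that the removed code used directly:

    // Hypothetical sketch, not the actual commit contents: an unknown template
    // name throws std::out_of_range, which the caller catches and ignores.
    llm_chat_template llm_chat_template_from_str(const std::string & name) {
        return LLM_CHAT_TEMPLATES.at(name);
    }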