tokenize_file.cc
#include "train.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

/*
 This is a standalone example of using the llama.cpp training function
 tokenize_file. It is only intended to verify that the format of the
 training data is correct.

 To inspect the samples, uncomment the following line in train.cpp:
     printf("sample: '%s'\n", buf_sample.data());
*/
int main() {
    std::string training_data = "data/assistent-training.txt";
    std::string model = "models/llama-2-7b-chat.gguf";
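    // Output parameters filled in by tokenize_file: the tokenized training
    // data plus the begin offset and size of each sample found in it.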
    std::vector<llama_token> train_tokens;
    std::vector<size_t> train_samples_begin;
    std::vector<size_t> train_samples_size;

    struct llama_model_params llama_mparams = llama_model_default_params();
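    // NOTE: 27 GPU layers is an arbitrary choice for this example and only
    // has an effect when llama.cpp is built with GPU support; tokenization
    // itself runs on the CPU.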
    llama_mparams.n_gpu_layers = 27;
    llama_mparams.vocab_only = false;

    struct llama_model* lmodel = llama_load_model_from_file(model.c_str(), llama_mparams);
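    // Loading can fail (e.g. a wrong path), so guard before using the model.
    if (lmodel == NULL) {
        fprintf(stderr, "%s: failed to load model from %s\n", __func__, model.c_str());
        return 1;
    }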
    struct llama_context_params llama_cparams = llama_context_default_params();
    struct llama_context* lctx = llama_new_context_with_model(lmodel, llama_cparams);
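    // Context creation can also fail, for example when running out of memory.
    if (lctx == NULL) {
        fprintf(stderr, "%s: failed to create a llama_context\n", __func__);
        llama_free_model(lmodel);
        return 1;
    }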
    size_t ret = tokenize_file(lctx,
                               training_data.c_str(),
                               "<s>",  // sample start text
                               true,   // include sample start
                               false,  // overlapping samples
                               70,     // context length
                               train_tokens,
                               train_samples_begin,
                               train_samples_size);
    printf("%s: tokenize_file returned %zu\n", __func__, ret);
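
    // Release the context and model acquired above.
    llama_free(lctx);
    llama_free_model(lmodel);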
    return 0;
}