-
Notifications
You must be signed in to change notification settings - Fork 75
/
Copy pathexample.cc
127 lines (100 loc) · 3.86 KB
/
example.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#include <tokenizers_cpp.h>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>
using tokenizers::Tokenizer;
// Read the whole file at `path` into a string of raw bytes (opened in binary mode).
//
// path: location of the file to read.
// Returns the file content; an empty file yields an empty string.
//
// Any failure (cannot open, cannot determine the size, short read) prints a
// message to std::cerr and terminates the process with exit code 1.
std::string LoadBytesFromFile(const std::string& path) {
  std::ifstream fs(path, std::ios::in | std::ios::binary);
  if (fs.fail()) {
    std::cerr << "Cannot open " << path << std::endl;
    std::exit(1);
  }

  // Seek to the end to learn the size. tellg() yields -1 when the position
  // cannot be determined; casting that to size_t would request a huge buffer.
  fs.seekg(0, std::ios::end);
  const std::streamoff end_pos = fs.tellg();
  if (end_pos < 0) {
    std::cerr << "Cannot determine size of " << path << std::endl;
    std::exit(1);
  }
  fs.seekg(0, std::ios::beg);

  std::string data(static_cast<size_t>(end_pos), '\0');
  if (!data.empty()) {
    fs.read(data.data(), static_cast<std::streamsize>(data.size()));
    // A short read sets failbit; do not hand back a partially filled buffer.
    if (!fs) {
      std::cerr << "Cannot read " << path << std::endl;
      std::exit(1);
    }
  }
  return data;
}
// Write the token ids to std::cout on one line, formatted as
// `tokens=[a, b, c]`, followed by a newline and a flush.
void PrintEncodeResult(const std::vector<int>& ids) {
  std::cout << "tokens=[";
  // Empty before the first element, ", " before every later one.
  const char* separator = "";
  for (const int token_id : ids) {
    std::cout << separator << token_id;
    separator = ", ";
  }
  std::cout << "]" << std::endl;
}
// Exercise a tokenizer with three checks and print what each one observes.
//
// tok:           tokenizer under test; this function takes ownership of it.
// print_vocab:   accepted but not consulted by this function.
// check_id_back: when true, assert that mapping an id to its token and back
//                gives the same id.
void TestTokenizer(std::unique_ptr<Tokenizer> tok, bool print_vocab = false,
                   bool check_id_back = true) {
  // Check #1. A fixed prompt must survive an Encode/Decode round trip.
  std::string input_text = "What is the capital of Canada?";
  std::vector<int> encoded = tok->Encode(input_text);
  std::string round_trip = tok->Decode(encoded);
  PrintEncodeResult(encoded);
  std::cout << "decode=\"" << round_trip << "\"" << std::endl;
  assert(round_trip == input_text);

  // Check #2. IdToToken followed by TokenToId on a handful of sample ids.
  const int32_t probe_ids[] = {0, 1, 2, 3, 32, 33, 34, 130, 131, 1000};
  for (const int32_t probe_id : probe_ids) {
    auto token_text = tok->IdToToken(probe_id);
    auto recovered_id = tok->TokenToId(token_text);
    std::cout << "id=" << probe_id << ", token=\"" << token_text << "\", id_new=" << recovered_id
              << std::endl;
    if (check_id_back) {
      assert(probe_id == recovered_id);
    }
  }

  // Check #3. Report the vocabulary size, then a blank separator line.
  auto vocab_size = tok->GetVocabSize();
  std::cout << "vocab_size=" << vocab_size << std::endl << std::endl;
}
// SentencePiece tokenizer example
// - model file: dist/tokenizer.model
//
// Loads the model, prints how long loading took, then runs TestTokenizer
// with the id round-trip check enabled.
void SentencePieceTokenizerExample() {
  using Clock = std::chrono::high_resolution_clock;
  std::cout << "Tokenizer: SentencePiece" << std::endl;

  // The timed region covers both reading the file and building the tokenizer.
  const Clock::time_point load_begin = Clock::now();
  // The factory takes an in-memory blob, which leaves the caller free to
  // decide how the bytes are obtained.
  auto model_bytes = LoadBytesFromFile("dist/tokenizer.model");
  auto tokenizer = Tokenizer::FromBlobSentencePiece(model_bytes);
  const Clock::time_point load_end = Clock::now();

  const auto elapsed =
      std::chrono::duration_cast<std::chrono::milliseconds>(load_end - load_begin);
  std::cout << "Load time: " << elapsed.count() << " ms" << std::endl;

  TestTokenizer(std::move(tokenizer), /*print_vocab=*/false, /*check_id_back=*/true);
}
// HuggingFace tokenizer example
// - tokenizer definition: dist/tokenizer.json
//
// Loads the JSON, prints how long loading took, then runs TestTokenizer
// with the id round-trip check enabled.
void HuggingFaceTokenizerExample() {
  using Clock = std::chrono::high_resolution_clock;
  std::cout << "Tokenizer: Huggingface" << std::endl;

  // The timed region covers both reading the file and building the tokenizer.
  const Clock::time_point load_begin = Clock::now();
  // The factory takes an in-memory blob, which leaves the caller free to
  // decide how the bytes are obtained.
  auto json_bytes = LoadBytesFromFile("dist/tokenizer.json");
  auto tokenizer = Tokenizer::FromBlobJSON(json_bytes);
  const Clock::time_point load_end = Clock::now();

  const auto elapsed =
      std::chrono::duration_cast<std::chrono::milliseconds>(load_end - load_begin);
  std::cout << "Load time: " << elapsed.count() << " ms" << std::endl;

  TestTokenizer(std::move(tokenizer), /*print_vocab=*/false, /*check_id_back=*/true);
}
// RWKV world tokenizer example
// - model file: dist/tokenizer_model
//
// Builds the tokenizer, prints how long that took, then runs TestTokenizer
// with the id round-trip check disabled.
void RWKVWorldTokenizerExample() {
  using Clock = std::chrono::high_resolution_clock;
  std::cout << "Tokenizer: RWKVWorld" << std::endl;

  const Clock::time_point load_begin = Clock::now();
  // NOTE(review): unlike the other examples, the factory is handed the literal
  // "dist/tokenizer_model" rather than bytes read via LoadBytesFromFile.
  // Confirm this matches what FromBlobRWKVWorld expects.
  auto tokenizer = Tokenizer::FromBlobRWKVWorld("dist/tokenizer_model");
  const Clock::time_point load_end = Clock::now();

  const auto elapsed =
      std::chrono::duration_cast<std::chrono::milliseconds>(load_end - load_begin);
  std::cout << "Load time: " << elapsed.count() << " ms" << std::endl;

  // The id -> token -> id check cannot be applied to RWKVWorldTokenizer yet.
  TestTokenizer(std::move(tokenizer), /*print_vocab=*/false, /*check_id_back=*/false);
}
// Entry point: runs each tokenizer example once, in a fixed order.
int main(int argc, char* argv[]) {
  // Command-line arguments are not consulted.
  (void)argc;
  (void)argv;

  using ExampleFn = void (*)();
  const ExampleFn examples[] = {
      SentencePieceTokenizerExample,
      HuggingFaceTokenizerExample,
      RWKVWorldTokenizerExample,
  };
  for (const ExampleFn run_example : examples) {
    run_example();
  }
  return 0;
}