Commit 2d02a68
Optimize the tokenizer for efficiency (microsoft#797)
* optimize the tokenizer for efficiency

* fix the unit test failures

* fix the API test case failures

* remove unused code

* more test case fixes

* one more fix

* fix macOS build issues

* refine the test

* add more diagnostic info

* fix unit tests in Linux CI

* fix the pp_api test failure
wenbingl authored Aug 28, 2024
1 parent 2d044ad commit 2d02a68
Showing 11 changed files with 91 additions and 78 deletions.
4 changes: 2 additions & 2 deletions .pipelines/ci.yml
@@ -198,14 +198,14 @@ stages:
     - bash: |
         set -e -x -u
         ./build.sh -DOCOS_ENABLE_C_API=ON
-        cd out/Linux
+        cd out/Linux/RelWithDebInfo
         ctest -C RelWithDebInfo --output-on-failure
       displayName: Build ort-extensions with API enabled and run tests
     - bash: |
         set -e -x -u
         ./build.sh -DOCOS_BUILD_PRESET=token_api_only -DOCOS_BUILD_SHARED_LIB=OFF
-        cd out/Linux
+        cd out/Linux/RelWithDebInfo
         ctest -C RelWithDebInfo --output-on-failure
       displayName: Build ort-extensions with tokenizer API only enabled and run tests
2 changes: 1 addition & 1 deletion docs/development.md
@@ -16,7 +16,7 @@ The package contains all custom operators and some Python scripts to manipulate
 - no-azure: disable AzureOp kernel build in Python package.
 - no-opencv: disable operators based on OpenCV in build.
 - cc-debug: generate debug info for extensions binaries and disable C/C++ compiler optimization.
-- pp_api: enable pre-processing C ABI Python wrapper, `from onnxruntime_extensions.pp_api import *`
+- pp-api: enable pre-processing C ABI Python wrapper, `from onnxruntime_extensions.pp_api import *`
 - cuda-archs: specify the CUDA architectures (like 70, 85, etc.); multiple values can be combined with semicolons. The default value is the nvidia-smi output for GPU-0.
 - ort\_pkg\_dir: specify the ONNXRuntime package directory the extension project depends on. This is helpful if you want to use a recent ONNXRuntime function that has not yet been included in an official build.
5 changes: 3 additions & 2 deletions onnxruntime_extensions/_hf_cvt.py
@@ -48,8 +48,9 @@ def convert_json_vocab(hf_tokenizer):
             model_dir = hf_tokenizer.name_or_path
         else:
             model_dir = os.path.dirname(vocab_file)
-        tokenizer_json = json.load(
-            open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8"))
+        f = open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8")
+        tokenizer_json = json.load(f)
+        f.close()
         # get vocab object from json file
         vocab = tokenizer_json.get("model", {}).get("vocab", {})
         sorted_merges = tokenizer_json.get("model", {}).get("merges", [])
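The replacement above closes the file handle explicitly; the old one-liner left an anonymous handle open until garbage collection. A `with` block is the more idiomatic way to get the same deterministic cleanup. A minimal sketch of that alternative (the function name and arguments are illustrative, not part of the diff):

```python
import json
import os

def load_tokenizer_json(model_dir: str, tokenizer_file: str) -> dict:
    # "with" closes the handle even if json.load raises,
    # which an explicit open()/close() pair does not guarantee.
    with open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8") as f:
        return json.load(f)
```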
6 changes: 4 additions & 2 deletions onnxruntime_extensions/pp_api.py
@@ -7,7 +7,7 @@
 from . import _extensions_pydll as _C
 if not hasattr(_C, "delete_object"):
     raise ImportError(
-        "onnxruntime_extensions is not built with pre-processing C API"
+        "onnxruntime_extensions is not built with pre-processing C API\n"
         "To enable it, please build the package with --ortx-user-option=pp_api")

 create_processor = _C.create_processor
@@ -24,6 +24,7 @@

 class Tokenizer:
     def __init__(self, tokenizer_dir):
+        self.tokenizer = None
         if os.path.isdir(tokenizer_dir):
             self.tokenizer = create_tokenizer(tokenizer_dir)
         else:
@@ -41,7 +42,8 @@ def __init__(self, tokenizer_dir):
                     f"Downloaded HF file '{resolved_full_file}' cannot be found")
             if (os.path.dirname(resolved_full_file) != os.path.dirname(resolved_config_file)):
                 raise FileNotFoundError(
-                    f"Downloaded HF files '{resolved_full_file}' and '{resolved_config_file}' are not in the same directory")
+                    f"Downloaded HF files '{resolved_full_file}' "
+                    f"and '{resolved_config_file}' are not in the same directory")

             tokenizer_dir = os.path.dirname(resolved_full_file)
             self.tokenizer = create_tokenizer(tokenizer_dir)
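For orientation, the constructor above accepts either a local tokenizer directory or a Hugging Face model id and resolves both to a directory before calling create_tokenizer. A hedged usage sketch; the tokenize/detokenize method names are assumptions, since this diff only shows __init__:

```python
# Assumes a build with --ortx-user-option=pp_api, and assumes the Tokenizer
# class exposes tokenize()/detokenize() wrappers (not shown in this diff).
from onnxruntime_extensions.pp_api import Tokenizer

tok = Tokenizer("gpt2")            # local directory or Hugging Face model id
ids = tok.tokenize("hello world")  # hypothetical: text -> token ids
text = tok.detokenize(ids)         # hypothetical: token ids -> text
```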
102 changes: 62 additions & 40 deletions operators/tokenizer/bpe_kernels.cc
@@ -106,6 +106,7 @@ ustring RemoveConsecutiveSpaces(const ustring& input) {
 KernelBpeTokenizer::KernelBpeTokenizer(const BpeModelConf& conf)
     : bpe_conf_(conf) {
   model_name_ = conf.name_ == nullptr ? "" : conf.name_;
+  CreateUnicodeByteEncoder();
 };

 OrtStatusPtr KernelBpeTokenizer::OnModelAttach(const OrtApi& api, const OrtKernelInfo& info) {
@@ -175,12 +176,28 @@ uint32_t KernelBpeTokenizer::GetTokenId(const std::string& token) const {
   return bbpe_tokenizer_->GetTokenId(token);
 }

+/*
+Read more here: https://github.com/huggingface/transformers/blob/60bb571e993b7d73257fb64044726b569fef9403/src/transformers/convert_slow_tokenizer.py#L1454
+Note: this is similar to the BPE CreateByteEncoder, however for decoding the .tiktoken bytes
+we need to store the strings rather than their IDs, and thereby need a separate map.
+*/
+void KernelBpeTokenizer::CreateUnicodeByteEncoder() {
+  char32_t index = 256;
+  for (char32_t i = 0; i < 256; ++i) {
+    if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
+      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(index++);
+    } else {
+      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(i);
+    }
+  }
+}
+
 std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
                                                   int64_t max_length,
                                                   bool compute_offset_mapping,
                                                   std::list<OffsetMappingType>& offset_map) const {
   std::vector<int64_t> res;
-  std::list<std::pair<uint32_t, uint32_t>> byte_list;

   bool clean_up_spaces = false;
   if (ModelName() == kModel_CLIP) {
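CreateUnicodeByteEncoder builds the standard GPT-2 byte-to-unicode table: printable bytes map to themselves, while the 68 problematic bytes (0-32, 127-160, and 173) are shifted to code points 256 and up, so every byte becomes a visible, vocab-safe character. A small Python sketch of the same mapping, mirroring the Hugging Face bytes_to_unicode helper linked in the comment:

```python
def bytes_to_unicode() -> dict:
    # Bytes kept as-is: printable ASCII plus most of Latin-1.
    keep = set(range(33, 127)) | set(range(161, 173)) | set(range(174, 256))
    mapping, shifted = {}, 256
    for b in range(256):
        if b in keep:
            mapping[b] = chr(b)
        else:
            # Control/whitespace-like bytes are shifted above the byte range.
            mapping[b] = chr(shifted)
            shifted += 1
    return mapping

table = bytes_to_unicode()
assert table[ord("A")] == "A" and table[0] == "\u0100"  # byte 0 -> U+0100
```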
@@ -191,10 +208,10 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
       text = text.strip()
     */
     ustring str = RemoveConsecutiveSpaces(input);
-    if (IsUnicodeSpace(str.front())) {
+    if (!str.empty() && IsUnicodeSpace(str.front())) {
       str.erase(str.begin());
     }
-    if (IsUnicodeSpace(str.back())) {
+    if (!str.empty() && IsUnicodeSpace(str.back())) {
       str.pop_back();
     }
     // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
@@ -274,24 +291,43 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
         }
       }

       // Get byte encodings prior to performing BPE
-      byte_list.clear();
+      std::list<std::pair<uint32_t, uint32_t>> byte_list;
+      std::string token_bytes;
+      token_bytes.reserve(utf8_token.size() * 2);
+      size_t token_len = utf8_token.length();
+      size_t end_diff = 0;
       if (clean_up_spaces) {
         // Whitespace clean
         utf8_token.erase(std::remove(utf8_token.begin(), utf8_token.end(), U' '), utf8_token.end());
+        token_len = utf8_token.length() - 1;
       }

-      for (int i = 0; i < utf8_token.length(); i++) {
-        if (i == utf8_token.length() - 1) {
-          std::string boundary(1, utf8_token[i]);
-          byte_list.push_back(std::make_pair(bbpe_tokenizer_->GetTokenId(boundary + "</w>"), 1));
-        } else {
-          byte_list.push_back(std::make_pair(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(utf8_token[i])], 1));
-        }
+      for (size_t i = 0; i < token_len; i++) {
+        token_bytes += unicode_byte_encoder_[static_cast<unsigned char>(utf8_token[i])];
       }

+      if (clean_up_spaces) {
+        end_diff = token_bytes.length();
+        if (!utf8_token.empty()) {
+          token_bytes += unicode_byte_encoder_[static_cast<unsigned char>(utf8_token.back())];
+          token_bytes += "</w>";
+        }
+        end_diff = token_bytes.length() - end_diff;
+      }
+
+      auto id = bbpe_tokenizer_->GetTokenId(token_bytes);
+      if (id != bpe::kInvalidTokenId) {
+        byte_list.push_back(std::make_pair(id, ort_extensions::narrow<uint32_t>(utf8_token.size())));
       } else {
-        for (char& cp : utf8_token) {
-          byte_list.push_back(std::make_pair(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(cp)], 1));
+        token_len = token_bytes.length();
+        for (size_t i = 0; i < token_len - end_diff; /* i++ */) {
+          size_t j = ustring::UTF8Len(token_bytes[i]);
+          byte_list.push_back(std::make_pair(bbpe_tokenizer_->GetTokenId(token_bytes.substr(i, j)), ort_extensions::narrow<uint32_t>(j)));
+          i += j;
+        }
+        if (end_diff > 0) {
+          byte_list.push_back(std::make_pair(
+              bbpe_tokenizer_->GetTokenId(token_bytes.substr(token_len - end_diff, end_diff)), ort_extensions::narrow<uint32_t>(end_diff)));
+        }
       }

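The restructured loop is where the speedup comes from: each token is first re-encoded through the byte table and looked up once as a whole string, and only a vocabulary miss falls back to seeding the BPE merge loop piece by piece (the old code always seeded byte by byte). A schematic Python sketch of that lookup-then-fallback flow, with a plain dict standing in for BpeModel and per-character units instead of the UTF8Len walk:

```python
INVALID = -1  # stands in for bpe::kInvalidTokenId

def seed_byte_list(token_bytes: str, vocab: dict) -> list:
    # Fast path: the whole token is already in the vocabulary,
    # so BPE merging can be skipped for it entirely.
    whole = vocab.get(token_bytes, INVALID)
    if whole != INVALID:
        return [(whole, len(token_bytes))]
    # Slow path: seed the BPE merge loop with one entry per unit.
    return [(vocab.get(ch, INVALID), 1) for ch in token_bytes]

vocab = {"low": 7, "l": 1, "o": 2, "w": 3}
assert seed_byte_list("low", vocab) == [(7, 3)]         # one lookup, no merges
assert seed_byte_list("lo", vocab) == [(1, 1), (2, 1)]  # falls back to BPE
```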
@@ -343,7 +379,6 @@ std::vector<int64_t> KernelBpeTokenizer::SpmTokenize(ustring& input,
                                                      bool compute_offset_mapping,
                                                      std::list<OffsetMappingType>& offset_map) const {
   std::vector<int64_t> res;
-  std::list<std::pair<uint32_t, uint32_t>> byte_list;

   // Add BOS token to result
   res.push_back(bos_token_id_);
@@ -379,7 +414,7 @@ std::vector<int64_t> KernelBpeTokenizer::SpmTokenize(ustring& input,
     }

     // Get byte encodings prior to performing BPE
-    byte_list.clear();
+    std::list<std::pair<uint32_t, uint32_t>> byte_list;

     while (res.size() < max_length && char_pos < ustr.length()) {
       auto chr = ustr[char_pos];
@@ -559,23 +594,6 @@ SpmTokenizer::SpmTokenizer()

 JsonFastTokenizer::JsonFastTokenizer() : KernelBpeTokenizer(kGPT2Configuration) {}

-/*
-Read more here: https://github.com/huggingface/transformers/blob/60bb571e993b7d73257fb64044726b569fef9403/src/transformers/convert_slow_tokenizer.py#L1454
-Note: this is similar to the BPE CreateByteEncoder, however for decoding the .tiktoken bytes
-we need to store the strings rather than their IDs, and thereby need a separate map.
-*/
-void JsonFastTokenizer::CreateUnicodeByteEncoder() {
-  char32_t index = 256;
-  for (char32_t i = 0; i < 256; ++i) {
-    if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
-      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(index++);
-    } else {
-      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(i);
-    }
-  }
-}
-
 std::string JsonFastTokenizer::TokenBytesToString(std::vector<uint8_t>& bytes) {
   std::string result;
   for (auto c : bytes) {
@@ -647,7 +665,6 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
   std::vector<std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>> byte_merges;

   bbpe_tokenizer_ = std::make_unique<BpeModel>();
-  JsonFastTokenizer::CreateUnicodeByteEncoder();

   for (const auto& item : bpe_ranks) {
     std::vector<uint8_t> token = item.first;
@@ -714,13 +731,19 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
     module_ifs >> tok_json;
   } else {
     ifs >> tok_json;
     // doesn't work for json with nested objects
     // auto decoders_node = tok_json.find("/decoder/decoders"_json_pointer);
-    auto decoders_node = tok_json.find("decoder");
-    if (decoders_node != tok_json.end()) {
-      decoders_node = decoders_node->find("decoders");
+    bool has_decoders_node = false;
+    auto decoders_node = tok_json.end();
+    auto decoder_node = tok_json.find("decoder");
+    if (decoder_node != tok_json.end()) {
+      decoders_node = decoder_node->find("decoders");
+      if (decoders_node != decoder_node->end()) {
+        has_decoders_node = true;
+      }
+    }

-    if (decoders_node->is_array()) {
+    if (has_decoders_node && decoders_node->is_array()) {
       for(auto step = decoders_node->begin(); step != decoders_node->end(); ++step) {
         std::string type = step->value("type", "");
         if (type == "Replace") {
@@ -742,7 +765,6 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
         bpe_conf_.get().GetSpecialTokens().c_str(),
         bpe_conf_.get().spm_model_);
   }
-

   auto added_tokens = tok_json.find("added_tokens");
   if (added_tokens != tok_json.end()) {
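The decoder-parsing fix above guards each lookup: the old chain called `->is_array()` on the result of a failed `find()`, i.e. it dereferenced `tok_json.end()` when tokenizer.json had no decoder section, which is undefined behavior with nlohmann::json. The same defensive pattern, sketched in Python against a plain dict:

```python
import json

def find_decoders(tok_json: dict) -> list:
    # Mirrors the C++ fix: verify each level exists before descending,
    # instead of dereferencing the result of a failed find().
    decoder = tok_json.get("decoder")
    decoders = decoder.get("decoders") if isinstance(decoder, dict) else None
    return decoders if isinstance(decoders, list) else []

cfg = json.loads('{"decoder": {"type": "ByteLevel"}}')  # no "decoders" array
assert find_decoders(cfg) == []
```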
5 changes: 3 additions & 2 deletions operators/tokenizer/bpe_kernels.h
@@ -48,6 +48,8 @@ struct KernelBpeTokenizer {
                                  bool compute_offset_mapping,
                                  std::list<OffsetMappingType>& offset_map) const;

+  void CreateUnicodeByteEncoder();
+
  protected:
   std::reference_wrapper<BpeModelConf const> bpe_conf_;
   std::string model_name_;
@@ -60,6 +62,7 @@ struct KernelBpeTokenizer {

   std::optional<bool> add_bos_token_;
   std::optional<bool> add_eos_token_;
+  std::string unicode_byte_encoder_[256] = {};
 };

 struct GPT2Tokenizer : KernelBpeTokenizer {
@@ -122,10 +125,8 @@ class JsonFastTokenizer : public KernelBpeTokenizer {
   bool tiktoken_ = false;

  private:
-  void CreateUnicodeByteEncoder();
   std::string TokenBytesToString(std::vector<uint8_t>& bytes);

   BpeModelConf json_conf_;
   std::vector<ort_extensions::bpe::AddedToken> added_tokens_;
-  std::string unicode_byte_encoder_[256] = {};
 };
1 change: 1 addition & 0 deletions operators/tokenizer/bpe_streaming.hpp
@@ -225,6 +225,7 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
     ptrdiff_t z = ustring::ValidateUTF8(text);
     if (z <= 0) {
       text = text.substr(0, -z);
+      text += "\ufffd";  // bad utf-8 string
     }

     decoded_strings.emplace_back(std::move(text));
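ValidateUTF8 returns a non-positive value here whose magnitude is the length of the longest valid prefix; after truncating to that prefix, the new line appends U+FFFD (the Unicode replacement character) so callers get an explicit marker instead of a silently shortened string. Python's UTF-8 codec applies the same convention:

```python
bad = b"example \xf0\x9f"                     # truncated multi-byte sequence
text = bad.decode("utf-8", errors="replace")  # codec inserts the marker
assert text.endswith("\ufffd")                # U+FFFD flags the bad tail
```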
25 changes: 0 additions & 25 deletions operators/tokenizer/bpe_tokenizer.hpp
@@ -61,8 +61,6 @@ class BpeModel {

     if (spm_converted) {
       UpdateSpmByteToken(vocab_map_);
-    } else {
-      CreateByteEncoder();
     }

     uint32_t index = 0;
@@ -142,8 +140,6 @@ class BpeModel {

     if (spm_converted) {
       UpdateSpmByteToken(vocab_map_);
-    } else {
-      CreateByteEncoder();
     }

     uint32_t index = 0;
@@ -196,8 +192,6 @@ class BpeModel {

     if (spm_converted) {
       UpdateSpmByteToken(vocab_map_);
-    } else {
-      CreateByteEncoder();
     }

     uint32_t index = 0;
@@ -336,8 +330,6 @@ class BpeModel {
     }
   }

-  const auto& ByteEncoder() const { return byte_encoder_; }
-
   uint32_t GetTokenId(const std::string& key) const {
     auto it = vocab_map_.find(key);
     if (it != vocab_map_.end()) {
@@ -370,27 +362,10 @@ class BpeModel {
     return (static_cast<uint64_t>(i1) << 32) | (i0 & 0xFFFFFFFFLL);
   }

-  void CreateByteEncoder() {
-    char32_t index = 256;
-    for (char32_t i = 0; i < 256; ++i) {
-      /*
-      bs = (
-      list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-      )
-      */
-      if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
-        byte_encoder_[i] = GetTokenId(ustring::EncodeUTF8Char(index++));
-      } else {
-        byte_encoder_[i] = GetTokenId(ustring::EncodeUTF8Char(i));
-      }
-    }
-  }
-
  private:
   std::string end_of_word_suffix_;
   std::map<uint64_t, BpeNode> bpe_rank_;

-  uint32_t byte_encoder_[256] = {};
   std::unordered_map<std::string, uint32_t> vocab_map_;
   std::vector<std::string> id2token_map_;
2 changes: 1 addition & 1 deletion shared/api/tokenizer_impl.cc
@@ -72,7 +72,7 @@ OrtxStatus TokenizerImpl::BatchDecode(const std::vector<span<extTokenId_t const>
     if (!status.IsOk()) {
       return status;
     }
-    t_text.emplace_back(ts_output.AsScalar());
+    t_text.push_back(ts_output.AsScalar());
   }
   return {};
 }
10 changes: 10 additions & 0 deletions test/pp_api_test/test_tokenizer.cc
@@ -290,6 +290,16 @@ TEST(OrtxTokenizerTest, CodeGenTokenizer) {
   EXPECT_TRUE(status.IsOk());
   // std::cout << out_text[0] << std::endl;
   EXPECT_EQ(out_text[0], input[0]);
+
+  // 252 and the following ids cannot be decoded as a valid utf-8 string
+  std::vector<extTokenId_t> invalid_token_ids_span = {14675, 8466, 705, 252, 538, 5374, 82, 329, 4554};
+  std::vector<std::string> out_text1;
+  status = tokenizer->Detokenize({ort_extensions::span<const extTokenId_t>(invalid_token_ids_span)}, out_text1);
+  EXPECT_TRUE(status.IsOk());
+  EXPECT_EQ(out_text1.size(), 1);
+  std::string out_text_ref = out_text1.back();
+  std::cout << out_text_ref << std::endl;
+  EXPECT_EQ(out_text_ref.substr(out_text_ref.length() - 3, 3), "\ufffd");
 }

 TEST(OrtxTokenizerStreamTest, CodeGenTokenizer) {