From 2d02a687beb1ba10319dc381b3907c91ab370995 Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Tue, 27 Aug 2024 18:57:50 -0700
Subject: [PATCH] Optimize the tokenizer for efficiency (#797)

* optimize the tokenizer for efficiency
* fix the unit test failures.
* fix the api test case failures
* removed the unused code.
* More test cases fixings
* One more fixing
* fix macOS build issues
* refine the test
* add more diagnosis info.
* fix unit test in CI Linux
* fix the pp_api test failure
---
 .pipelines/ci.yml                     |   4 +-
 docs/development.md                   |   2 +-
 onnxruntime_extensions/_hf_cvt.py     |   5 +-
 onnxruntime_extensions/pp_api.py      |   6 +-
 operators/tokenizer/bpe_kernels.cc    | 102 ++++++++++++++++----------
 operators/tokenizer/bpe_kernels.h     |   5 +-
 operators/tokenizer/bpe_streaming.hpp |   1 +
 operators/tokenizer/bpe_tokenizer.hpp |  25 -------
 shared/api/tokenizer_impl.cc          |   2 +-
 test/pp_api_test/test_tokenizer.cc    |  10 +++
 test/test_fast_tokenizer.py           |   7 +-
 11 files changed, 91 insertions(+), 78 deletions(-)

diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index f3fb9ed4b..e39ef96bd 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -198,14 +198,14 @@ stages:
   - bash: |
       set -e -x -u
       ./build.sh -DOCOS_ENABLE_C_API=ON
-      cd out/Linux
+      cd out/Linux/RelWithDebInfo
       ctest -C RelWithDebInfo --output-on-failure
     displayName: Build ort-extensions with API enabled and run tests

   - bash: |
       set -e -x -u
       ./build.sh -DOCOS_BUILD_PRESET=token_api_only -DOCOS_BUILD_SHARED_LIB=OFF
-      cd out/Linux
+      cd out/Linux/RelWithDebInfo
       ctest -C RelWithDebInfo --output-on-failure
     displayName: Build ort-extensions with tokenizer API only enabled and run tests

diff --git a/docs/development.md b/docs/development.md
index 2ffb76895..cb9709e9c 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -16,7 +16,7 @@ The package contains all custom operators and some Python scripts to manipulate
 - no-azure: disable AzureOp kernel build in Python package.
 - no-opencv: disable operators based on OpenCV in build.
 - cc-debug: generate debug info for extensions binaries and disable C/C++ compiler optimization.
-- pp_api: enable pre-processing C ABI Python wrapper, `from onnxruntime_extensions.pp_api import *`
+- pp-api: enable pre-processing C ABI Python wrapper, `from onnxruntime_extensions.pp_api import *`
 - cuda-archs: specify the CUDA architectures(like 70, 85, etc.), and the multiple values can be combined with semicolon.
   The default value is nvidia-smi util output of GPU-0
 - ort\_pkg\_dir: specify ONNXRuntime package directory the extension project is depending on.
   This is helpful if you want to use some ONNXRuntime latest function which has not been involved in the official build
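For context on the pp_api option documented above: once the package is built with that option, the wrapper can be exercised roughly as below. This is a usage sketch, not part of the patch; the tokenize/detokenize method names and the local tokenizer path are assumptions.

    # Usage sketch (assumption): exercise the pp_api wrapper after building
    # with --ortx-user-option=pp_api; the tokenizer path is a placeholder.
    from onnxruntime_extensions.pp_api import Tokenizer

    tok = Tokenizer("/path/to/hf_tokenizer_dir")   # hypothetical local folder
    ids = tok.tokenize("what are you?")
    print(ids)
    print(tok.detokenize(ids))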
diff --git a/onnxruntime_extensions/_hf_cvt.py b/onnxruntime_extensions/_hf_cvt.py
index 3f9bcfeb4..c1aa2e6e7 100644
--- a/onnxruntime_extensions/_hf_cvt.py
+++ b/onnxruntime_extensions/_hf_cvt.py
@@ -48,8 +48,9 @@ def convert_json_vocab(hf_tokenizer):
             model_dir = hf_tokenizer.name_or_path
         else:
             model_dir = os.path.dirname(vocab_file)
-        tokenizer_json = json.load(
-            open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8"))
+        f = open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8")
+        tokenizer_json = json.load(f)
+        f.close()
         # get vocab object from json file
         vocab = tokenizer_json.get("model", {}).get("vocab", {})
         sorted_merges = tokenizer_json.get("model", {}).get("merges", [])

diff --git a/onnxruntime_extensions/pp_api.py b/onnxruntime_extensions/pp_api.py
index f0e127716..f30b742fd 100644
--- a/onnxruntime_extensions/pp_api.py
+++ b/onnxruntime_extensions/pp_api.py
@@ -7,7 +7,7 @@ from . import _extensions_pydll as _C
 if not hasattr(_C, "delete_object"):
     raise ImportError(
-        "onnxruntime_extensions is not built with pre-processing C API"
+        "onnxruntime_extensions is not built with pre-processing C API\n"
         "To enable it, please build the package with --ortx-user-option=pp_api")

 create_processor = _C.create_processor
@@ -24,6 +24,7 @@ class Tokenizer:
     def __init__(self, tokenizer_dir):
+        self.tokenizer = None
         if os.path.isdir(tokenizer_dir):
             self.tokenizer = create_tokenizer(tokenizer_dir)
         else:
@@ -41,7 +42,8 @@ def __init__(self, tokenizer_dir):
                     f"Downloaded HF file '{resolved_full_file}' cannot be found")
             if (os.path.dirname(resolved_full_file) != os.path.dirname(resolved_config_file)):
                 raise FileNotFoundError(
-                    f"Downloaded HF files '{resolved_full_file}' and '{resolved_config_file}' are not in the same directory")
+                    f"Downloaded HF files '{resolved_full_file}' "
+                    f"and '{resolved_config_file}' are not in the same directory")

             tokenizer_dir = os.path.dirname(resolved_full_file)
         self.tokenizer = create_tokenizer(tokenizer_dir)
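The _hf_cvt.py hunk above closes the tokenizer.json handle explicitly instead of leaking it; a `with` block gives the same guarantee even when json.load raises. A sketch of that alternative, not the code in this patch:

    import json
    import os

    def load_tokenizer_json(model_dir, tokenizer_file="tokenizer.json"):
        # The context manager closes the handle on success and on error alike.
        with open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8") as f:
            return json.load(f)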
diff --git a/operators/tokenizer/bpe_kernels.cc b/operators/tokenizer/bpe_kernels.cc
index 7c34c34bf..09f973771 100644
--- a/operators/tokenizer/bpe_kernels.cc
+++ b/operators/tokenizer/bpe_kernels.cc
@@ -106,6 +106,7 @@ ustring RemoveConsecutiveSpaces(const ustring& input) {
 KernelBpeTokenizer::KernelBpeTokenizer(const BpeModelConf& conf)
     : bpe_conf_(conf) {
   model_name_ = conf.name_ == nullptr ? "" : conf.name_;
+  CreateUnicodeByteEncoder();
 };

 OrtStatusPtr KernelBpeTokenizer::OnModelAttach(const OrtApi& api, const OrtKernelInfo& info) {
@@ -175,12 +176,28 @@ uint32_t KernelBpeTokenizer::GetTokenId(const std::string& token) const {
   return bbpe_tokenizer_->GetTokenId(token);
 }

+/*
+Read more here: https://github.com/huggingface/transformers/blob/60bb571e993b7d73257fb64044726b569fef9403/src/transformers/convert_slow_tokenizer.py#L1454
+
+Note: this is similar to the BPE CreateByteEncoder, however for decoding the .tiktoken bytes
+we need to store the strings rather than their IDs, and thereby need a separate map.
+*/
+void KernelBpeTokenizer::CreateUnicodeByteEncoder() {
+  char32_t index = 256;
+  for (char32_t i = 0; i < 256; ++i) {
+    if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
+      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(index++);
+    } else {
+      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(i);
+    }
+  }
+}
+
 std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
                                                   int64_t max_length,
                                                   bool compute_offset_mapping,
                                                   std::list<OffsetMappingType>& offset_map) const {
   std::vector<int64_t> res;
-  std::list<std::pair<uint32_t, uint32_t>> byte_list;

   bool clean_up_spaces = false;
   if (ModelName() == kModel_CLIP) {
@@ -191,10 +208,10 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
       text = text.strip()
     */
     ustring str = RemoveConsecutiveSpaces(input);
-    if (IsUnicodeSpace(str.front())) {
+    if (!str.empty() && IsUnicodeSpace(str.front())) {
       str.erase(str.begin());
     }
-    if (IsUnicodeSpace(str.back())) {
+    if (!str.empty() && IsUnicodeSpace(str.back())) {
       str.pop_back();
     }
     // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
@@ -274,24 +291,43 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
         }
       }

-      // Get byte encodings prior to performing BPE
-      byte_list.clear();
-
+      std::list<std::pair<uint32_t, uint32_t>> byte_list;
+      std::string token_bytes;
+      token_bytes.reserve(utf8_token.size() * 2);
+      size_t token_len = utf8_token.length();
+      size_t end_diff = 0;
       if (clean_up_spaces) {
         // Whitespace clean
         utf8_token.erase(std::remove(utf8_token.begin(), utf8_token.end(), U' '), utf8_token.end());
+        token_len = utf8_token.length() - 1;
+      }

-        for (int i = 0; i < utf8_token.length(); i++) {
-          if (i == utf8_token.length() - 1) {
-            std::string boundary(1, utf8_token[i]);
-            byte_list.push_back(std::make_pair(bbpe_tokenizer_->GetTokenId(boundary + "</w>"), 1));
-          } else {
-            byte_list.push_back(std::make_pair(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(utf8_token[i])], 1));
-          }
+      for (size_t i = 0; i < token_len; i++) {
+        token_bytes += unicode_byte_encoder_[static_cast<unsigned char>(utf8_token[i])];
+      }
+
+      if (clean_up_spaces) {
+        end_diff = token_bytes.length();
+        if (!utf8_token.empty()) {
+          token_bytes += unicode_byte_encoder_[static_cast<unsigned char>(utf8_token.back())];
+          token_bytes += "</w>";
         }
+        end_diff = token_bytes.length() - end_diff;
+      }
+
+      auto id = bbpe_tokenizer_->GetTokenId(token_bytes);
+      if (id != bpe::kInvalidTokenId) {
+        byte_list.push_back(std::make_pair(id, ort_extensions::narrow<uint32_t>(utf8_token.size())));
       } else {
-        for (char& cp : utf8_token) {
-          byte_list.push_back(std::make_pair(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(cp)], 1));
+        token_len = token_bytes.length();
+        for (size_t i = 0; i < token_len - end_diff; /* i++ */) {
+          size_t j = ustring::UTF8Len(token_bytes[i]);
+          byte_list.push_back(std::make_pair(bbpe_tokenizer_->GetTokenId(token_bytes.substr(i, j)), ort_extensions::narrow<uint32_t>(j)));
+          i += j;
+        }
+        if (end_diff > 0) {
+          byte_list.push_back(std::make_pair(
+              bbpe_tokenizer_->GetTokenId(token_bytes.substr(token_len - end_diff, end_diff)), ort_extensions::narrow<uint32_t>(end_diff)));
         }
       }
@@ -343,7 +379,6 @@ std::vector<int64_t> KernelBpeTokenizer::SpmTokenize(ustring& input,
                                                      bool compute_offset_mapping,
                                                      std::list<OffsetMappingType>& offset_map) const {
   std::vector<int64_t> res;
-  std::list<std::pair<uint32_t, uint32_t>> byte_list;

   // Add BOS token to result
   res.push_back(bos_token_id_);
@@ -379,7 +414,7 @@ std::vector<int64_t> KernelBpeTokenizer::SpmTokenize(ustring& input,
   }

   // Get byte encodings prior to performing BPE
-  byte_list.clear();
+  std::list<std::pair<uint32_t, uint32_t>> byte_list;

   while (res.size() < max_length && char_pos < ustr.length()) {
     auto chr = ustr[char_pos];
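The CreateUnicodeByteEncoder added above builds the GPT-2 style byte-to-printable-unicode table referenced in the code comment: bytes that are not printable on their own are shifted to code points 256, 257, and so on. A rough Python equivalent of the same mapping, for reference only:

    def bytes_to_unicode():
        # Printable bytes keep their own code point; controls, space, DEL and
        # the 0xA0..0xA0+ range are remapped to 256, 257, ... exactly as in
        # the C++ loop above (ranges [0, 33), [127, 161) and byte 173).
        table = {}
        shifted = 256
        for b in range(256):
            if b < 33 or 127 <= b < 161 or b == 173:
                table[b] = chr(shifted)
                shifted += 1
            else:
                table[b] = chr(b)
        return table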
@@ -559,23 +594,6 @@ SpmTokenizer::SpmTokenizer()
 JsonFastTokenizer::JsonFastTokenizer() : KernelBpeTokenizer(kGPT2Configuration) {}

-/*
-Read more here: https://github.com/huggingface/transformers/blob/60bb571e993b7d73257fb64044726b569fef9403/src/transformers/convert_slow_tokenizer.py#L1454
-
-Note: this is similar to the BPE CreateByteEncoder, however for decoding the .tiktoken bytes
-we need to store the strings rather than their IDs, and thereby need a separate map.
-*/
-void JsonFastTokenizer::CreateUnicodeByteEncoder() {
-  char32_t index = 256;
-  for (char32_t i = 0; i < 256; ++i) {
-    if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
-      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(index++);
-    } else {
-      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(i);
-    }
-  }
-}
-
 std::string JsonFastTokenizer::TokenBytesToString(std::vector<uint8_t>& bytes) {
   std::string result;
   for (auto c : bytes) {
@@ -647,7 +665,6 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
   std::vector<std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>> byte_merges;

   bbpe_tokenizer_ = std::make_unique<BpeModel>();
-  JsonFastTokenizer::CreateUnicodeByteEncoder();

   for (const auto& item : bpe_ranks) {
     std::vector<uint8_t> token = item.first;
@@ -714,13 +731,19 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
     module_ifs >> tok_json;
   } else {
     ifs >> tok_json;
+    // doesn't work for json with nested objects
     // auto decoders_node = tok_json.find("/decoder/decoders"_json_pointer);
-    auto decoders_node = tok_json.find("decoder");
-    if (decoders_node != tok_json.end()) {
-      decoders_node = decoders_node->find("decoders");
+    bool has_decoders_node = false;
+    auto decoders_node = tok_json.end();
+    auto decoder_node = tok_json.find("decoder");
+    if (decoder_node != tok_json.end()) {
+      decoders_node = decoder_node->find("decoders");
+      if (decoders_node != decoder_node->end()) {
+        has_decoders_node = true;
+      }
     }

-    if (decoders_node->is_array()) {
+    if (has_decoders_node && decoders_node->is_array()) {
       for(auto step = decoders_node->begin(); step != decoders_node->end(); ++step) {
         std::string type = step->value("type", "");
         if (type == "Replace") {
@@ -742,7 +765,6 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
                             bpe_conf_.get().GetSpecialTokens().c_str(),
                             bpe_conf_.get().spm_model_);
   }
-
   auto added_tokens = tok_json.find("added_tokens");
   if (added_tokens != tok_json.end()) {

diff --git a/operators/tokenizer/bpe_kernels.h b/operators/tokenizer/bpe_kernels.h
index af56661f6..2bac5fb3d 100644
--- a/operators/tokenizer/bpe_kernels.h
+++ b/operators/tokenizer/bpe_kernels.h
@@ -48,6 +48,8 @@ struct KernelBpeTokenizer {
                                   bool compute_offset_mapping,
                                   std::list<OffsetMappingType>& offset_map) const;

+  void CreateUnicodeByteEncoder();
+
  protected:
   std::reference_wrapper<BpeModelConf const> bpe_conf_;
   std::string model_name_;
@@ -60,6 +62,7 @@ struct KernelBpeTokenizer {

   std::optional<bool> add_bos_token_;
   std::optional<bool> add_eos_token_;
+  std::string unicode_byte_encoder_[256] = {};
 };

 struct GPT2Tokenizer : KernelBpeTokenizer {
@@ -122,10 +125,8 @@ class JsonFastTokenizer : public KernelBpeTokenizer {
   bool tiktoken_ = false;

  private:
-  void CreateUnicodeByteEncoder();
   std::string TokenBytesToString(std::vector<uint8_t>& bytes);

   BpeModelConf json_conf_;
   std::vector<ort_extensions::bpe::AddedToken> added_tokens_;
-  std::string unicode_byte_encoder_[256] = {};
 };
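The efficiency gain in the Tokenize hunk earlier in this patch comes from trying the whole byte-mapped pre-token as a single vocabulary lookup and only falling back to per-character pieces (which then go through BPE merges) when the lookup misses. A simplified Python sketch of that fast path; `vocab` and `byte_to_unicode` stand in for the C++ members, and the CLIP end-of-word handling is omitted:

    def encode_pretoken(token_bytes: bytes, vocab: dict, byte_to_unicode: dict):
        # Map raw bytes to their printable stand-ins, as the kernel does.
        mapped = "".join(byte_to_unicode[b] for b in token_bytes)
        token_id = vocab.get(mapped)
        if token_id is not None:
            # Fast path: the whole pre-token is already in the vocabulary,
            # so no byte list has to be built and no merges have to run.
            return [token_id]
        # Slow path: one piece per mapped character; the real kernel then
        # applies the BPE merge loop to this list.
        return [vocab[ch] for ch in mapped]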
diff --git a/operators/tokenizer/bpe_streaming.hpp b/operators/tokenizer/bpe_streaming.hpp
index 2d736f8b2..042b24e12 100644
--- a/operators/tokenizer/bpe_streaming.hpp
+++ b/operators/tokenizer/bpe_streaming.hpp
@@ -225,6 +225,7 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
       ptrdiff_t z = ustring::ValidateUTF8(text);
       if (z <= 0) {
         text = text.substr(0, -z);
+        text += "\ufffd";  // bad utf-8 string
       }

       decoded_strings.emplace_back(std::move(text));

diff --git a/operators/tokenizer/bpe_tokenizer.hpp b/operators/tokenizer/bpe_tokenizer.hpp
index ff5282a35..733d87049 100644
--- a/operators/tokenizer/bpe_tokenizer.hpp
+++ b/operators/tokenizer/bpe_tokenizer.hpp
@@ -61,8 +61,6 @@ class BpeModel {

     if (spm_converted) {
       UpdateSpmByteToken(vocab_map_);
-    } else {
-      CreateByteEncoder();
     }

     uint32_t index = 0;
@@ -142,8 +140,6 @@ class BpeModel {

     if (spm_converted) {
       UpdateSpmByteToken(vocab_map_);
-    } else {
-      CreateByteEncoder();
     }

     uint32_t index = 0;
@@ -196,8 +192,6 @@ class BpeModel {

     if (spm_converted) {
       UpdateSpmByteToken(vocab_map_);
-    } else {
-      CreateByteEncoder();
     }

     uint32_t index = 0;
@@ -336,8 +330,6 @@ class BpeModel {
      }
    }

-  const auto& ByteEncoder() const { return byte_encoder_; }
-
   uint32_t GetTokenId(const std::string& key) const {
     auto it = vocab_map_.find(key);
     if (it != vocab_map_.end()) {
@@ -370,27 +362,10 @@ class BpeModel {
     return (static_cast<uint64_t>(i1) << 32) | (i0 & 0xFFFFFFFFLL);
   }

-  void CreateByteEncoder() {
-    char32_t index = 256;
-    for (char32_t i = 0; i < 256; ++i) {
-      /*
-      bs = (
-      list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-      )
-      */
-      if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
-        byte_encoder_[i] = GetTokenId(ustring::EncodeUTF8Char(index++));
-      } else {
-        byte_encoder_[i] = GetTokenId(ustring::EncodeUTF8Char(i));
-      }
-    }
-  }
-
  private:
   std::string end_of_word_suffix_;
   std::map<uint64_t, BpeNode> bpe_rank_;
-  uint32_t byte_encoder_[256] = {};
   std::unordered_map<std::string, uint32_t> vocab_map_;
   std::vector<std::string> id2token_map_;

diff --git a/shared/api/tokenizer_impl.cc b/shared/api/tokenizer_impl.cc
index 7a0cbff5c..e24ab34b3 100644
--- a/shared/api/tokenizer_impl.cc
+++ b/shared/api/tokenizer_impl.cc
@@ -72,7 +72,7 @@ OrtxStatus TokenizerImpl::BatchDecode(const std::vector
     if (!status.IsOk()) {
       return status;
     }
-    t_text.emplace_back(ts_output.AsScalar());
+    t_text.push_back(ts_output.AsScalar());
   }
   return {};
 }

diff --git a/test/pp_api_test/test_tokenizer.cc b/test/pp_api_test/test_tokenizer.cc
index 3c2f64cfd..76d70a462 100644
--- a/test/pp_api_test/test_tokenizer.cc
+++ b/test/pp_api_test/test_tokenizer.cc
@@ -290,6 +290,16 @@ TEST(OrtxTokenizerTest, CodeGenTokenizer) {
   EXPECT_TRUE(status.IsOk());
   // std::cout << out_text[0] << std::endl;
   EXPECT_EQ(out_text[0], input[0]);
+
+  // 252 and the following ids cannot be decoded as a valid utf-8 string
+  std::vector<extTokenId_t> invalid_token_ids_span = {14675, 8466, 705, 252, 538, 5374, 82, 329, 4554};
+  std::vector<std::string> out_text1;
+  status = tokenizer->Detokenize({ort_extensions::span(invalid_token_ids_span)}, out_text1);
+  EXPECT_TRUE(status.IsOk());
+  EXPECT_EQ(out_text1.size(), 1);
+  std::string out_text_ref = out_text1.back();
+  std::cout << out_text_ref << std::endl;
+  EXPECT_EQ(out_text_ref.substr(out_text_ref.length() - 3, 3), "\ufffd");
 }

 TEST(OrtxTokenizerStreamTest, CodeGenTokenizer) {
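The bpe_streaming.hpp change above keeps only the valid UTF-8 prefix of a decoded string and appends a single U+FFFD replacement character, which is exactly what the new CodeGen test asserts on. The same idea in Python, as a sketch; the C++ side relies on ustring::ValidateUTF8 rather than an exception:

    def clean_decoded(raw: bytes) -> str:
        # Keep the valid prefix and mark the bad tail with one replacement
        # character, mirroring the ValidateUTF8 / "\ufffd" handling above.
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError as e:
            return raw[:e.start].decode("utf-8") + "\ufffd"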
["what are you? \n 给 weiss ich, über was los ist \n", "@? \n was los ist \n", "Qué dijiste? \n über 给 ば was los ist im Mannschaft ц \n", @@ -44,9 +44,10 @@ def test_phi_3_mini(self): expected_ids = tokenizer.encode(text[n], return_tensors="np") try: np.testing.assert_array_equal( - expected_ids[0], actual_ids[n][1:expected_ids.shape[1] + 1]) + # skip the padding tokens in the ort output + expected_ids[0], actual_ids[n][:expected_ids.shape[1]]) except AssertionError: - print("index is ", n) + print("the failed sentence index is ", n) raise