From 048d34564b9bd73636ec82dca0f76931603a2ebe Mon Sep 17 00:00:00 2001
From: dixyes
Date: Sun, 22 Dec 2024 14:50:59 +0800
Subject: [PATCH] Fix tokenizer_clean_spaces for megrez

---
 src/llama.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 926cd6dff0e38..e13bf89ed80f6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6604,8 +6604,7 @@ static void llm_load_vocab(
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
                 vocab.tokenizer_clean_spaces = false;
             } else if (
-                tokenizer_pre == "qwen2" ||
-                tokenizer_pre == "megrez") {
+                tokenizer_pre == "qwen2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                 vocab.tokenizer_clean_spaces = false;
             } else if (
@@ -6665,6 +6664,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "minerva-7b") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+            } else if (
+                tokenizer_pre == "megrez") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }