Skip to content

Commit

Permalink
Fix tokenizer_clean_spaces for megrez
Browse files (browse the repository at this point in the history)
  • Loading branch information
dixyes committed Dec 22, 2024
1 parent a02c63d commit 048d345
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6604,8 +6604,7 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "qwen2" ||
tokenizer_pre == "megrez") {
tokenizer_pre == "qwen2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
vocab.tokenizer_clean_spaces = false;
} else if (
Expand Down Expand Up @@ -6665,6 +6664,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else if (
tokenizer_pre == "megrez") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
Expand Down

0 comments on commit 048d345

Please sign in to comment.