Skip to content

Commit

Permalink
added qwen tokenizer test
Browse files Browse the repository at this point in the history
  • Loading branch information
mmoffatt2 committed Dec 9, 2024
1 parent 25e397c commit 82ebac1
Showing 1 changed file with 17 additions and 0 deletions.
17 changes: 17 additions & 0 deletions data/template/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
CustomTokenizer,
CharTokenizer,
CustomCharTokenizerWithByteFallback,
Qwen2Tokenizer
)
from argparse import Namespace
from rich.console import Console
Expand Down Expand Up @@ -208,6 +209,22 @@ def test_custom_char_tokenizer_with_byte_fallback(self):
if os.path.exists(args.custom_chars_file):
os.remove(args.custom_chars_file)

def test_qwen2_tokenizer(self):
    """Check that Qwen2Tokenizer round-trips the sample text.

    Builds the tokenizer from a Namespace with ``qwen2_model="qwen2_0p5b"``,
    tokenizes ``self.sample_text``, detokenizes the resulting ids, prints
    both strings through the console, and asserts they are equal.
    """
    tokenizer_args = Namespace(qwen2_model="qwen2_0p5b")
    qwen_tokenizer = Qwen2Tokenizer(tokenizer_args)

    # Encode the sample text, then decode the ids straight back.
    token_ids = qwen_tokenizer.tokenize(self.sample_text)
    round_tripped = qwen_tokenizer.detokenize(token_ids)

    # Print the input and the round-tripped output, each under its header.
    report = (
        ("[input]Input:[/input]", self.sample_text, "input"),
        ("[output]Detokenized Output:[/output]", round_tripped, "output"),
    )
    for header, text, style_name in report:
        console.print(header)
        console.print(text, style=style_name)

    # The round trip must reproduce the original text exactly.
    self.assertEqual(self.sample_text, round_tripped)

# Script entry point: when this file is executed directly (not imported),
# call run_tests(), which is defined elsewhere in this file.
if __name__ == '__main__':
run_tests()

0 comments on commit 82ebac1

Please sign in to comment.