Commit

changed to chunked tokenization
mmoffatt2 committed Dec 10, 2024
1 parent 1b99d58 commit 1ac1ef9
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion data/template/tokenizer_options.py
@@ -326,7 +326,13 @@ def __init__(self, args):
         self.special_tokens = self.tokenizer.special_tokens_map

     def tokenize(self, data):
-        ids = self.tokenizer.encode(data, add_special_tokens=True)
+        print(f"Tokenizing data of size: {len(data)}")
+        chunk_size = 1024
+        ids = []
+        for i in range(0, len(data), chunk_size):
+            chunk = data[i:i + chunk_size]
+            ids.extend(self.tokenizer.encode(chunk, add_special_tokens=True))
+        print(f"Generated {len(ids)} token IDs.")
         meta = {
             "vocab_size": self.vocab_size,
             "tokenizer": "qwen2",
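For context, here is a minimal, self-contained sketch of the chunked-tokenization pattern this commit adopts. The class name ChunkedTokenizer, the model name, the return value, and the usage lines are assumptions added for illustration; the diff itself only shows that self.tokenizer is a Hugging Face tokenizer and that the surrounding meta block names "qwen2".

from transformers import AutoTokenizer

class ChunkedTokenizer:
    """Hypothetical stand-in for the class edited in this commit."""

    def __init__(self, model_name="Qwen/Qwen2-0.5B"):  # model name is an assumption
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.vocab_size = self.tokenizer.vocab_size
        self.special_tokens = self.tokenizer.special_tokens_map

    def tokenize(self, data):
        print(f"Tokenizing data of size: {len(data)}")
        chunk_size = 1024  # characters per chunk, as in the commit
        ids = []
        for i in range(0, len(data), chunk_size):
            chunk = data[i:i + chunk_size]
            # Encoding chunk by chunk bounds the size of each encode() call,
            # which avoids passing one very large string to the tokenizer.
            # Caveat: add_special_tokens=True may insert special tokens at
            # every chunk boundary, and a fixed character stride can split a
            # word (hence a token) across two chunks.
            ids.extend(self.tokenizer.encode(chunk, add_special_tokens=True))
        print(f"Generated {len(ids)} token IDs.")
        return ids

# Usage (hypothetical file path):
# tok = ChunkedTokenizer()
# ids = tok.tokenize(open("data/template/input.txt").read())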
