Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize tokenizer initialization in LazySupervisedDataset for QWEN a… #288

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions llava/train/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,8 +562,6 @@ def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_im
roles = {"human": "user", "gpt": "assistant"}

# Add image tokens to the tokenizer as special tokens
# Use a deepcopy of the tokenizer so that we don't modify the original tokenizer
tokenizer = copy.deepcopy(tokenizer)
# When there is actually an image, we add the image tokens as a special token
if has_image:
tokenizer.add_tokens(["<image>"], special_tokens=True)
Expand Down Expand Up @@ -644,8 +642,6 @@ def preprocess_llama3(
roles = {"human": "user", "gpt": "assistant"}

# Add image tokens to the tokenizer as special tokens
# Use a deepcopy of the tokenizer so that we don't modify the original tokenizer
tokenizer = copy.deepcopy(tokenizer)
# When there is actually an image, we add the image tokens as a special token
if has_image:
tokenizer.add_tokens(["<image>"], special_tokens=True)
Expand Down Expand Up @@ -1031,7 +1027,7 @@ def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer,

rank0_print(f"Loaded {len(self.list_data_dict)} samples from {data_path}")
rank0_print("Formatting inputs...Skip in lazy mode")
self.tokenizer = tokenizer
self.tokenizer = copy.deepcopy(tokenizer) if conversation_lib.default_conversation.version in ["qwen","llama_v3"] else tokenizer
self.data_args = data_args

def __len__(self):
Expand Down