Skip to content

Commit

Permalink
revert data changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Jackmin801 committed Sep 28, 2024
1 parent fa8d3dd commit 73800d9
Showing 1 changed file with 1 addition and 2 deletions.
3 changes: 1 addition & 2 deletions src/zeroband/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def get_dataloader(
ds = load_dataset("allenai/c4", "en", streaming=True)

def tokenize_function(data):
- outputs = tokenizer(data["text"], truncation=True, max_length=seq_length, padding="max_length")
+ outputs = tokenizer(data["text"], truncation=True, max_length=seq_length)
return outputs

tokenized_datasets = ds.map(
Expand All @@ -78,7 +78,6 @@ def tokenize_function(data):
train_dataset = split_dataset_by_node(tokenized_datasets, world_size=world_size, rank=rank)

data_collator = collate_causal_mask(max_seq_length=seq_length, pad_id=tokenizer.pad_token_id, ignore_index=-100)
- print(train_dataset, flush=True)

return DataLoader(
train_dataset,
Expand Down

0 comments on commit 73800d9

Please sign in to comment.