Commit c8b2664

add data rank split
samsja committed Dec 4, 2024
1 parent 92fa657 commit c8b2664
Showing 3 changed files with 21 additions and 3 deletions.
2 changes: 2 additions & 0 deletions configs/10B/H100.toml
@@ -25,6 +25,8 @@ dataset_name_or_paths = "/data/datasets/fineweb-edu,/data/datasets/fineweb,/data
 dataset_ratio = "55:10:20:10:5"
 num_workers = 4
 reverse_data_files = true
+split_by_data_rank = false # the 10B training assumes the data was already split by data rank; kept for backward compatibility
+
 
 [diloco]
 inner_steps = 100
1 change: 1 addition & 0 deletions configs/10B/H100_cooldown.toml
@@ -26,6 +26,7 @@ dataset_name_or_paths = "/data/datasets/fineweb-edu,/data/datasets/fineweb,/data
 dataset_ratio = "80:10:10"
 num_workers = 4
 reverse_data_files = false
+split_by_data_rank = false # the 10B training assumes the data was already split by data rank; kept for backward compatibility
 
 [diloco]
 inner_steps = 100
21 changes: 18 additions & 3 deletions src/zeroband/data.py
@@ -36,6 +36,7 @@ class DataConfig(BaseConfig):
     data_rank: Optional[int] = None
     data_world_size: Optional[int] = None
     reverse_data_files: bool = False
+    split_by_data_rank: bool = True
 
 
 class FakeTokenizedDataset(IterableDataset):
@@ -393,14 +394,28 @@ def _get_probabilities(data_config: DataConfig) -> Optional[List[float]]:
 
 
 def load_all_datasets(
-    data_config: DataConfig, split: str, tokenizer: PreTrainedTokenizer, rank: int, world_size: int
+    data_config: DataConfig,
+    split: str,
+    tokenizer: PreTrainedTokenizer,
+    rank: int,
+    world_size: int,
 ) -> InterleaveDataset:
     """Load all datasets and interleave them"""
 
+    if data_config.split_by_data_rank and (
+        data_config.data_rank is not None and data_config.data_world_size is not None
+    ):
+        split_rank = data_config.data_rank * world_size + rank
+        split_world_size = data_config.data_world_size * world_size
+    else:
+        split_rank = rank
+        split_world_size = world_size
+
     ds = _load_datasets(
         dataset_names=data_config.dataset_name_or_paths,
         split=split,
-        data_rank=rank,
-        data_world_size=world_size,
+        data_rank=split_rank,
+        data_world_size=split_world_size,
         probabilities=_get_probabilities(data_config),
         reverse_data_files=data_config.reverse_data_files,
         tokenizer=tokenizer,
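For reference, a minimal standalone sketch of the sharding arithmetic introduced above (the helper name effective_shard and the 2x8 setup are hypothetical examples, not part of the repository): with split_by_data_rank enabled, each (data rank, local rank) pair reads a distinct shard out of data_world_size * world_size; with it disabled, as in the 10B configs, sharding falls back to the local ranks only, assuming the data was pre-split per data rank.

# Hypothetical standalone example of the new split logic; not actual zeroband code.
def effective_shard(split_by_data_rank, data_rank, data_world_size, rank, world_size):
    """Return (split_rank, split_world_size) following the load_all_datasets change."""
    if split_by_data_rank and data_rank is not None and data_world_size is not None:
        # Every (data rank, local rank) pair maps to a unique shard index.
        return data_rank * world_size + rank, data_world_size * world_size
    # Backward-compatible path (split_by_data_rank = false in the 10B configs):
    # shard only across local ranks, assuming the data was pre-split per data rank.
    return rank, world_size

# Hypothetical setup: 2 data ranks (e.g. DiLoCo workers), 8 GPUs each.
for data_rank in range(2):
    for rank in range(8):
        print(effective_shard(True, data_rank, 2, rank, 8))
# data_rank 0 reads shards 0-7 and data_rank 1 reads shards 8-15, out of 16 total.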
