
add data rank split #167


Merged
merged 3 commits on Dec 4, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
.vscode/*
logs/*
wandb/*
datasets/*

# Byte-compiled / optimized / DLL files
__pycache__/
29 changes: 15 additions & 14 deletions README.md
@@ -38,6 +38,15 @@ A research paper about the framework and our INTELLECT-1 10B experiment is coming

## Getting Started

For an easy install that also downloads the data:

```bash
curl -sSL https://raw.githubusercontent.com/PrimeIntellect-ai/prime/scripts/install/install.sh | bash
```

Or step by step:


1. Clone:

```bash
@@ -67,19 +76,11 @@ git submodule update --init --recursive
huggingface-cli login
```

5. Download the data

```bash
mkdir -p datasets
uv run python scripts/subset_data.py --dataset_name PrimeIntellect/fineweb-edu --data_world_size 1 --data_rank 0 --max_shards 32
mv fineweb-edu/ datasets/fineweb-edu/
```
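Here `--data_world_size` and `--data_rank` control which slice of the dataset this node downloads, so several nodes can fetch disjoint shard sets. As a rough, hypothetical sketch of a strided shard assignment (illustrative only; the actual selection logic lives in `scripts/subset_data.py` and may differ):

```python
# Hypothetical sketch of per-rank shard assignment; the real selection
# logic lives in scripts/subset_data.py and may differ.
def shards_for_rank(num_shards: int, data_rank: int, data_world_size: int) -> list[int]:
    # Each data rank takes every data_world_size-th shard, offset by its
    # rank, so the ranks cover disjoint sets that together span the dataset.
    return list(range(data_rank, num_shards, data_world_size))

# With --data_world_size 2: rank 0 gets even shards, rank 1 gets odd ones.
assert shards_for_rank(8, 0, 2) == [0, 2, 4, 6]
assert shards_for_rank(8, 1, 2) == [1, 3, 5, 7]
```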


@@ -88,14 +89,14 @@
Verify your setup:

```bash
ZERO_BAND_LOG_LEVEL=DEBUG torchrun --nproc_per_node=2 src/zeroband/train.py @configs/debug/normal.toml
GLOO_SOCKET_IFNAME=lo GLOBAL_ADDR=localhost GLOBAL_RANK=0 GLOBAL_UNIQUE_ID=0 GLOBAL_WORLD_SIZE=1 GLOBAL_PORT=8989 uv run torchrun --nproc_per_node=2 src/zeroband/train.py @configs/debug/diloco.toml
```

## Usage

### Running DiLoCo

To test DiLoCo locally you can use the helper script `scripts/simulate_multi_node.sh`

```bash
# Using 4 GPUs
```
2 changes: 2 additions & 0 deletions configs/10B/H100.toml
@@ -25,6 +25,8 @@ dataset_name_or_paths = "/data/datasets/fineweb-edu,/data/datasets/fineweb,/data
dataset_ratio = "55:10:20:10:5"
num_workers = 4
reverse_data_files = true
split_by_data_rank = false # the 10B training assumes the data was already split by data rank; kept for backward compatibility


[diloco]
inner_steps = 100
1 change: 1 addition & 0 deletions configs/10B/H100_cooldown.toml
@@ -26,6 +26,7 @@ dataset_name_or_paths = "/data/datasets/fineweb-edu,/data/datasets/fineweb,/data
dataset_ratio = "80:10:10"
num_workers = 4
reverse_data_files = false
split_by_data_rank = false # the 10B training assumes the data was already split by data rank; kept for backward compatibility

[diloco]
inner_steps = 100
64 changes: 64 additions & 0 deletions scripts/install/install.sh
@@ -0,0 +1,64 @@
#!/usr/bin/env bash

set -e

# Colors for output
GREEN='\033[0;32m'
NC='\033[0m' # No Color

log_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

main() {
    # Check if sudo is installed
    if ! command -v sudo &> /dev/null; then
        apt update
        apt install sudo -y
    fi

    log_info "Updating apt..."
    sudo apt update

    log_info "Installing cmake python3-dev..."
    sudo apt install python3-dev cmake -y

    log_info "Installing iperf..."
    sudo apt install iperf -y

    log_info "Cloning repository..."
    git clone https://github.com/PrimeIntellect-ai/prime.git

    log_info "Entering project directory..."
    cd prime

    log_info "Installing uv..."
    curl -LsSf https://astral.sh/uv/install.sh | sh

    log_info "Sourcing uv environment..."
    source "$HOME/.local/bin/env"

    log_info "Creating virtual environment..."
    uv venv

    log_info "Activating virtual environment..."
    source .venv/bin/activate

    log_info "Installing dependencies..."
    uv sync --extra all

    log_info "Installing flash-attn..."
    uv pip install flash-attn --no-build-isolation

    log_info "Updating git submodules..."
    git submodule update --init --recursive

    log_info "Downloading data..."
    mkdir -p datasets
    uv run python scripts/subset_data.py --dataset_name PrimeIntellect/fineweb-edu --data_world_size 1 --data_rank 0 --max_shards 128
    mv fineweb-edu/ datasets/fineweb-edu/

    log_info "Installation completed! You can double-check that everything is installed correctly by running 'GLOO_SOCKET_IFNAME=lo GLOBAL_ADDR=localhost GLOBAL_RANK=0 GLOBAL_UNIQUE_ID=0 GLOBAL_WORLD_SIZE=1 GLOBAL_PORT=8989 uv run torchrun --nproc_per_node=2 src/zeroband/train.py @configs/debug/diloco.toml'"
}

main
23 changes: 19 additions & 4 deletions src/zeroband/data.py
@@ -25,7 +25,7 @@


class DataConfig(BaseConfig):
dataset_name_or_paths: str = "/data/datasets/fineweb-edu"
dataset_name_or_paths: str = "datasets/fineweb-edu"
val_dataset_name_or_paths: Optional[str] = None
seq_length: int = 1024
fake: bool = False
@@ -36,6 +36,7 @@ class DataConfig(BaseConfig):
data_rank: Optional[int] = None
data_world_size: Optional[int] = None
reverse_data_files: bool = False
split_by_data_rank: bool = True


class FakeTokenizedDataset(IterableDataset):
@@ -393,14 +394,28 @@ def _get_probabilities(data_config: DataConfig) -> Optional[List[float]]:


def load_all_datasets(
data_config: DataConfig, split: str, tokenizer: PreTrainedTokenizer, rank: int, world_size: int
data_config: DataConfig,
split: str,
tokenizer: PreTrainedTokenizer,
rank: int,
world_size: int,
) -> InterleaveDataset:
"""Load all datasets and interleave them"""

if data_config.split_by_data_rank and (
data_config.data_rank is not None and data_config.data_world_size is not None
):
split_rank = data_config.data_rank * world_size + rank
split_world_size = data_config.data_world_size * world_size
else:
split_rank = rank
split_world_size = world_size

ds = _load_datasets(
dataset_names=data_config.dataset_name_or_paths,
split=split,
data_rank=rank,
data_world_size=world_size,
data_rank=split_rank,
data_world_size=split_world_size,
probabilities=_get_probabilities(data_config),
reverse_data_files=data_config.reverse_data_files,
tokenizer=tokenizer,
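The core of this PR is the composition of the local torch rank with the global data rank. Below is a minimal standalone sketch of that arithmetic, mirroring `load_all_datasets` above; the function name `effective_split` and the example values are illustrative, not part of the codebase:

```python
from typing import Optional, Tuple

# Minimal sketch of the rank composition performed in load_all_datasets.
# `rank`/`world_size` are the local torch ranks; `data_rank`/`data_world_size`
# come from DataConfig. The function name is illustrative only.
def effective_split(
    rank: int,
    world_size: int,
    data_rank: Optional[int],
    data_world_size: Optional[int],
    split_by_data_rank: bool = True,
) -> Tuple[int, int]:
    if split_by_data_rank and data_rank is not None and data_world_size is not None:
        # Nest the local rank inside the global data rank: the node with
        # `data_rank` and `world_size` local workers owns the contiguous
        # slots [data_rank * world_size, (data_rank + 1) * world_size).
        return data_rank * world_size + rank, data_world_size * world_size
    # With split_by_data_rank = false (as in the 10B configs), keep the old
    # behavior and split only across local ranks.
    return rank, world_size

# Two nodes (data_world_size=2) with two GPUs each (world_size=2) yield
# four disjoint slots out of an effective world of four:
assert effective_split(0, 2, 0, 2) == (0, 4)
assert effective_split(1, 2, 0, 2) == (1, 4)
assert effective_split(0, 2, 1, 2) == (2, 4)
assert effective_split(1, 2, 1, 2) == (3, 4)
```

Setting `split_by_data_rank = false` reproduces the pre-PR behavior, which is why the 10B configs pin it for backward compatibility.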