178 changes: 83 additions & 95 deletions bench.py
@@ -7,111 +7,99 @@
 import time
 import torch
 from model import GPTConfig, GPT
+import hydra
+from config.dataclass import Config

-# -----------------------------------------------------------------------------
-batch_size = 12
-block_size = 1024
-bias = False
-real_data = True
-seed = 1337
-device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
-dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
-compile = True # use PyTorch 2.0 to compile the model to be faster
-profile = False # use pytorch profiler, or just simple benchmarking?
-exec(open('configurator.py').read()) # overrides from command line or config file
-# -----------------------------------------------------------------------------
-
-torch.manual_seed(seed)
-torch.cuda.manual_seed(seed)
-torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
-torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
-device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
-ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
-ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+@hydra.main(config_path="config", config_name="bench")
+def main(cfg: Config) -> None:
+    device_type = 'cuda' if 'cuda' in cfg.device else 'cpu' # for later use in torch.autocast
+    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+    ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

-# data loading init
-if real_data:
-    dataset = 'openwebtext'
-    data_dir = os.path.join('data', dataset)
-    train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
-    def get_batch(split):
-        data = train_data # note ignore split in benchmarking script
-        ix = torch.randint(len(data) - block_size, (batch_size,))
-        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
-        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
-        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
-        return x, y
-else:
-    # alternatively, if fixed data is desired to not care about data loading
-    x = torch.randint(50304, (batch_size, block_size), device=device)
-    y = torch.randint(50304, (batch_size, block_size), device=device)
-    get_batch = lambda split: (x, y)
+    # data loading init
+    if cfg.real_data:
+        dataset = 'openwebtext'
+        data_dir = os.path.join('data', dataset)
+        train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+        def get_batch(split):
+            data = train_data # note ignore split in benchmarking script
+            ix = torch.randint(len(data) - cfg.block_size, (cfg.batch_size,))
+            x = torch.stack([torch.from_numpy((data[i:i+cfg.block_size]).astype(np.int64)) for i in ix])
+            y = torch.stack([torch.from_numpy((data[i+1:i+1+cfg.block_size]).astype(np.int64)) for i in ix])
+            x, y = x.pin_memory().to(cfg.device, non_blocking=True), y.pin_memory().to(cfg.device, non_blocking=True)
+            return x, y
+    else:
+        # alternatively, if fixed data is desired to not care about data loading
+        x = torch.randint(50304, (cfg.batch_size, cfg.block_size), device=cfg.device)
+        y = torch.randint(50304, (cfg.batch_size, cfg.block_size), device=cfg.device)
+        get_batch = lambda split: (x, y)

-# model init
-gptconf = GPTConfig(
-    block_size = block_size, # how far back does the model look? i.e. context size
-    n_layer = 12, n_head = 12, n_embd = 768, # size of the model
-    dropout = 0, # for determinism
-    bias = bias,
-)
-model = GPT(gptconf)
-model.to(device)
+    # model init
+    gptconf = GPTConfig(
+        block_size = cfg.block_size, # how far back does the model look? i.e. context size
+        n_layer = 12, n_head = 12, n_embd = 768, # size of the model
+        dropout = 0, # for determinism
+        bias = cfg.bias,
+    )
+    model = GPT(gptconf)
+    model.to(cfg.device)

-optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
+    optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)

-if compile:
-    print("Compiling model...")
-    model = torch.compile(model) # pytorch 2.0
+    if compile:
+        print("Compiling model...")
+        model = torch.compile(model) # pytorch 2.0

-if profile:
-    # useful docs on pytorch profiler:
-    # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
-    # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile
-    wait, warmup, active = 5, 5, 5
-    num_steps = wait + warmup + active
-    with torch.profiler.profile(
-        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
-        schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
-        record_shapes=False,
-        profile_memory=False,
-        with_stack=False, # incurs an additional overhead, disable if not needed
-        with_flops=True,
-        with_modules=False, # only for torchscript models atm
-    ) as prof:
-
-        X, Y = get_batch('train')
-        for k in range(num_steps):
-            with ctx:
-                logits, loss = model(X, Y)
-            X, Y = get_batch('train')
-            optimizer.zero_grad(set_to_none=True)
-            loss.backward()
-            optimizer.step()
-            lossf = loss.item()
-            print(f"{k}/{num_steps} loss: {lossf:.4f}")
-
-            prof.step() # notify the profiler at end of each step
-
-else:
-
-    # simple benchmarking
-    torch.cuda.synchronize()
-    for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark
-        t0 = time.time()
-        X, Y = get_batch('train')
-        for k in range(num_steps):
-            with ctx:
-                logits, loss = model(X, Y)
-            X, Y = get_batch('train')
-            optimizer.zero_grad(set_to_none=True)
-            loss.backward()
-            optimizer.step()
-            lossf = loss.item()
-            print(f"{k}/{num_steps} loss: {lossf:.4f}")
-        torch.cuda.synchronize()
-        t1 = time.time()
-        dt = t1-t0
-        mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
-        if stage == 1:
-            print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
+    if cfg.profile:
+        # useful docs on pytorch profiler:
+        # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
+        # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile
+        wait, warmup, active = 5, 5, 5
+        num_steps = wait + warmup + active
+        with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
+            schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
+            on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
+            record_shapes=False,
+            profile_memory=False,
+            with_stack=False, # incurs an additional overhead, disable if not needed
+            with_flops=True,
+            with_modules=False, # only for torchscript models atm
+        ) as prof:
+
+            X, Y = get_batch('train')
+            for k in range(num_steps):
+                with ctx:
+                    logits, loss = model(X, Y)
+                X, Y = get_batch('train')
+                optimizer.zero_grad(set_to_none=True)
+                loss.backward()
+                optimizer.step()
+                lossf = loss.item()
+                print(f"{k}/{num_steps} loss: {lossf:.4f}")
+
+                prof.step() # notify the profiler at end of each step
+
+    else:
+
+        # simple benchmarking
+        torch.cuda.synchronize()
+        for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark
+            t0 = time.time()
+            X, Y = get_batch('train')
+            for k in range(num_steps):
+                with ctx:
+                    logits, loss = model(X, Y)
+                X, Y = get_batch('train')
+                optimizer.zero_grad(set_to_none=True)
+                loss.backward()
+                optimizer.step()
+                lossf = loss.item()
+                print(f"{k}/{num_steps} loss: {lossf:.4f}")
+            torch.cuda.synchronize()
+            t1 = time.time()
+            dt = t1-t0
+            mfu = model.estimate_mfu(cfg.batch_size * 1 * num_steps, dt)
+            if stage == 1:
+                print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
8 changes: 8 additions & 0 deletions config/bench.yaml
@@ -0,0 +1,8 @@
batch_size: 12
block_size: 1024
bias: false
real_data: true
seed: 1337
device: 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
compile: true # use PyTorch 2.0 to compile the model to be faster
profile: false # use pytorch profiler, or just simple bench
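
Since bench.py now starts through @hydra.main with this file as its primary config, note that, depending on the Hydra version and its job.chdir setting, the process may run inside a per-run output directory, in which case the relative data/openwebtext path used by get_batch no longer resolves. A sketch of one way to guard against that with Hydra's to_absolute_path helper (the helper is standard Hydra API; using it here is an assumption, not something this PR does):

from hydra.utils import to_absolute_path

# resolve the dataset relative to the original working directory, not Hydra's run directory
data_dir = to_absolute_path(os.path.join('data', 'openwebtext'))
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')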
60 changes: 60 additions & 0 deletions config/dataclass.py
@@ -0,0 +1,60 @@
#dataclass
from dataclasses import dataclass

@dataclass
class Config:
    # General settings
    out_dir: str = 'out'
    eval_interval: int = 2000
    log_interval: int = 1
    eval_iters: int = 200
    eval_only: bool = False # if True, script exits right after the first eval
    always_save_checkpoint: bool = True # if True, always save a checkpoint after each eval
    init_from: str = 'scratch' # 'scratch' or 'resume' or 'gpt2*'

    # wandb logging
    wandb_log: bool = False # disabled by default
    wandb_project: str = 'owt'
    wandb_run_name: str = 'gpt2' # 'run' + str(time.time())

    # Data settings
    dataset: str = 'openwebtext'
    gradient_accumulation_steps: int = 40 # used to simulate larger batch sizes (5 * 8)
    batch_size: int = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
    block_size: int = 1024

    # Model settings
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
    bias: bool = False # do we use bias inside LayerNorm and Linear layers?

    # AdamW optimizer settings
    learning_rate: float = 0.0006 # max learning rate (6e-4)
    max_iters: int = 600000 # total number of training iterations
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0 # clip gradients at this value, or disable if == 0.0

    # Learning rate decay settings
    decay_lr: bool = True # whether to decay the learning rate
    warmup_iters: int = 2000 # how many steps to warm up for
    lr_decay_iters: int = 600000 # should be ~= max_iters per Chinchilla
    min_lr: float = 0.00006 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

    # DDP settings
    ddp: str = 'torch'
    backend: str = 'nccl' # 'nccl', 'gloo', etc.

    # Sampling and benchmarking settings
    start: str = "\n" # or "" or etc. Can also specify a file, use as: "FILE:prompt.txt"
    num_samples: int = 10 # number of samples to draw
    max_new_tokens: int = 500 # number of tokens generated in each sample
    temperature: float = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k: int = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
    bias: bool = False
    real_data: bool = True
    seed: int = 1337
    compile: bool = True # use PyTorch 2.0 to compile the model to be faster
    profile: bool = False # use pytorch profiler, or just simple bench
    device: str = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
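
One caveat: the cfg: Config annotation on main() is only a type hint, so by default Hydra passes a DictConfig composed from config/bench.yaml, and the defaults and types declared here are not enforced. If validation against this dataclass is wanted, registering it with Hydra's ConfigStore is the usual route; a sketch under that assumption (the store name and the matching defaults entry are hypothetical, not part of this PR):

from hydra.core.config_store import ConfigStore
from config.dataclass import Config

# register the dataclass as a structured-config schema; a YAML such as config/bench.yaml
# could then opt in via a defaults entry (e.g. `- base_config`) to get type checking
cs = ConfigStore.instance()
cs.store(name="base_config", node=Config)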
8 changes: 0 additions & 8 deletions config/eval_gpt2.py

This file was deleted.

8 changes: 0 additions & 8 deletions config/eval_gpt2_large.py

This file was deleted.

8 changes: 0 additions & 8 deletions config/eval_gpt2_medium.py

This file was deleted.

8 changes: 0 additions & 8 deletions config/eval_gpt2_xl.py

This file was deleted.

25 changes: 0 additions & 25 deletions config/finetune_shakespeare.py

This file was deleted.

9 changes: 9 additions & 0 deletions config/sample.yaml
@@ -0,0 +1,9 @@
init_from: 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
out_dir: 'out' # ignored if init_from is not 'resume'
start: "\n" # or "" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples: 10 # number of samples to draw
max_new_tokens: 500 # number of tokens generated in each sample
temperature: 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k: 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed: 1337
device: 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
39 changes: 39 additions & 0 deletions config/train.yaml
@@ -0,0 +1,39 @@
out_dir: 'out'
eval_interval: 2000
log_interval: 1
eval_iters: 200
eval_only: false # if True, script exits right after the first eval
always_save_checkpoint: true # if True, always save a checkpoint after each eval
init_from: 'scratch' # 'scratch' or 'resume' or 'gpt2*'
# wandb logging
wandb_log: false # disabled by default
wandb_project: 'owt'
wandb_run_name: 'gpt2' # 'run' + str(time.time())
# data
dataset: 'openwebtext'
gradient_accumulation_steps: 40 # used to simulate larger batch sizes (5 * 8)
batch_size: 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
block_size: 1024
# model
n_layer: 12
n_head: 12
n_embd: 768
dropout: 0.0 # for pretraining 0 is good, for finetuning try 0.1+
bias: false # do we use bias inside LayerNorm and Linear layers?
# adamw optimizer
learning_rate: 0.0006 # max learning rate (6e-4)
max_iters: 600000 # total number of training iterations
weight_decay: 0.1
beta1: 0.9
beta2: 0.95
grad_clip: 1.0 # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr: true # whether to decay the learning rate
warmup_iters: 2000 # how many steps to warm up for
lr_decay_iters: 600000 # should be ~= max_iters per Chinchilla
min_lr: 0.00006 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
# DDP settings
ddp: 'torch'
backend: 'nccl' # 'nccl', 'gloo', etc.
# system
device: 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
10 changes: 10 additions & 0 deletions config/train_config/eval_gpt2.yaml
@@ -0,0 +1,10 @@
# evaluate the base gpt2
# n_layer=12, n_head=12, n_embd=768
# 124M parameters
defaults:
- ../train
batch_size: 8
eval_iters: 500 # use more iterations to get good estimate
eval_only: True
wandb_log: False
init_from: 'gpt2'
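
The defaults entry `- ../train` appears intended to compose the base train.yaml from the parent config directory and then layer the overrides in this file on top. If the installed Hydra version accepts that relative reference, the composition can be checked without launching a run via the compose API; a sketch assuming it is executed from the repository root (the config_path and config_name spellings are assumptions based on the layout above):

from hydra import compose, initialize

with initialize(version_base=None, config_path="config"):
    cfg = compose(config_name="train_config/eval_gpt2")
    # expect the overrides from this file layered on top of train.yaml,
    # e.g. cfg.batch_size == 8, cfg.eval_only is True, cfg.init_from == 'gpt2'
    print(cfg.batch_size, cfg.eval_only, cfg.init_from)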