From d3560a16df226bd7e3c05264025030009d1f0a1e Mon Sep 17 00:00:00 2001 From: kai-yamashita Date: Thu, 1 Aug 2024 14:12:18 +0900 Subject: [PATCH 1/2] refactor: add hydra and typing --- bench.py | 178 +++--- config/bench.yaml | 8 + config/eval_gpt2.py | 8 - config/eval_gpt2_large.py | 8 - config/eval_gpt2_medium.py | 8 - config/eval_gpt2_xl.py | 8 - config/finetune_shakespeare.py | 25 - config/sample.yaml | 9 + config/train.yaml | 39 ++ config/train_config/eval_gpt2.yaml | 10 + config/train_config/eval_gpt2_large.yaml | 10 + config/train_config/eval_gpt2_medium.yaml | 11 + config/train_config/eval_gpt2_xl.yaml | 10 + config/train_config/finetune_shakespeare.yaml | 26 + .../train_gpt2.yaml} | 26 +- .../train_config/train_shakespeare_char.yaml | 39 ++ config/train_shakespeare_char.py | 37 -- configurator.py | 47 -- model.py | 22 +- sample.py | 139 +++-- train.py | 524 +++++++++--------- utils.py | 10 + 22 files changed, 597 insertions(+), 605 deletions(-) create mode 100644 config/bench.yaml delete mode 100644 config/eval_gpt2.py delete mode 100644 config/eval_gpt2_large.py delete mode 100644 config/eval_gpt2_medium.py delete mode 100644 config/eval_gpt2_xl.py delete mode 100644 config/finetune_shakespeare.py create mode 100644 config/sample.yaml create mode 100644 config/train.yaml create mode 100644 config/train_config/eval_gpt2.yaml create mode 100644 config/train_config/eval_gpt2_large.yaml create mode 100644 config/train_config/eval_gpt2_medium.yaml create mode 100644 config/train_config/eval_gpt2_xl.yaml create mode 100644 config/train_config/finetune_shakespeare.yaml rename config/{train_gpt2.py => train_config/train_gpt2.yaml} (61%) create mode 100644 config/train_config/train_shakespeare_char.yaml delete mode 100644 config/train_shakespeare_char.py delete mode 100644 configurator.py create mode 100644 utils.py diff --git a/bench.py b/bench.py index 09d574a..7e2f995 100644 --- a/bench.py +++ b/bench.py @@ -7,111 +7,99 @@ import time import torch from model import GPTConfig, GPT +import hydra +from omegaconf import DictConfig -# ----------------------------------------------------------------------------- -batch_size = 12 -block_size = 1024 -bias = False -real_data = True -seed = 1337 -device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc. dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16' -compile = True # use PyTorch 2.0 to compile the model to be faster -profile = False # use pytorch profiler, or just simple benchmarking? 
-exec(open('configurator.py').read()) # overrides from command line or config file -# ----------------------------------------------------------------------------- -torch.manual_seed(seed) -torch.cuda.manual_seed(seed) -torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul -torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn -device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast -ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] -ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) +@hydra.main(config_path="config", config_name="bench") +def main(cfg: DictConfig) -> None: + device_type = 'cuda' if 'cuda' in cfg.device else 'cpu' # for later use in torch.autocast + ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] + ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) + # data loading init + if cfg.real_data: + dataset = 'openwebtext' + data_dir = os.path.join('data', dataset) + train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r') + def get_batch(split): + data = train_data # note ignore split in benchmarking script + ix = torch.randint(len(data) - cfg.block_size, (cfg.batch_size,)) + x = torch.stack([torch.from_numpy((data[i:i+cfg.block_size]).astype(np.int64)) for i in ix]) + y = torch.stack([torch.from_numpy((data[i+1:i+1+cfg.block_size]).astype(np.int64)) for i in ix]) + x, y = x.pin_memory().to(cfg.device, non_blocking=True), y.pin_memory().to(cfg.device, non_blocking=True) + return x, y + else: + # alternatively, if fixed data is desired to not care about data loading + x = torch.randint(50304, (cfg.batch_size, cfg.block_size), device=cfg.device) + y = torch.randint(50304, (cfg.batch_size, cfg.block_size), device=cfg.device) + get_batch = lambda split: (x, y) -# data loading init -if real_data: - dataset = 'openwebtext' - data_dir = os.path.join('data', dataset) - train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r') - def get_batch(split): - data = train_data # note ignore split in benchmarking script - ix = torch.randint(len(data) - block_size, (batch_size,)) - x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix]) - y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix]) - x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True) - return x, y -else: - # alternatively, if fixed data is desired to not care about data loading - x = torch.randint(50304, (batch_size, block_size), device=device) - y = torch.randint(50304, (batch_size, block_size), device=device) - get_batch = lambda split: (x, y) + # model init + gptconf = GPTConfig( + block_size = cfg.block_size, # how far back does the model look? i.e. context size + n_layer = 12, n_head = 12, n_embd = 768, # size of the model + dropout = 0, # for determinism + bias = cfg.bias, + ) + model = GPT(gptconf) + model.to(cfg.device) -# model init -gptconf = GPTConfig( - block_size = block_size, # how far back does the model look? i.e. 
context size - n_layer = 12, n_head = 12, n_embd = 768, # size of the model - dropout = 0, # for determinism - bias = bias, -) -model = GPT(gptconf) -model.to(device) + optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type) -optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type) + if compile: + print("Compiling model...") + model = torch.compile(model) # pytorch 2.0 -if compile: - print("Compiling model...") - model = torch.compile(model) # pytorch 2.0 + if cfg.profile: + # useful docs on pytorch profiler: + # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html + # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile + wait, warmup, active = 5, 5, 5 + num_steps = wait + warmup + active + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], + schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1), + on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'), + record_shapes=False, + profile_memory=False, + with_stack=False, # incurs an additional overhead, disable if not needed + with_flops=True, + with_modules=False, # only for torchscript models atm + ) as prof: -if profile: - # useful docs on pytorch profiler: - # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html - # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile - wait, warmup, active = 5, 5, 5 - num_steps = wait + warmup + active - with torch.profiler.profile( - activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], - schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1), - on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'), - record_shapes=False, - profile_memory=False, - with_stack=False, # incurs an additional overhead, disable if not needed - with_flops=True, - with_modules=False, # only for torchscript models atm - ) as prof: - - X, Y = get_batch('train') - for k in range(num_steps): - with ctx: - logits, loss = model(X, Y) X, Y = get_batch('train') - optimizer.zero_grad(set_to_none=True) - loss.backward() - optimizer.step() - lossf = loss.item() - print(f"{k}/{num_steps} loss: {lossf:.4f}") + for k in range(num_steps): + with ctx: + logits, loss = model(X, Y) + X, Y = get_batch('train') + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + lossf = loss.item() + print(f"{k}/{num_steps} loss: {lossf:.4f}") - prof.step() # notify the profiler at end of each step + prof.step() # notify the profiler at end of each step -else: + else: - # simple benchmarking - torch.cuda.synchronize() - for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark - t0 = time.time() - X, Y = get_batch('train') - for k in range(num_steps): - with ctx: - logits, loss = model(X, Y) - X, Y = get_batch('train') - optimizer.zero_grad(set_to_none=True) - loss.backward() - optimizer.step() - lossf = loss.item() - print(f"{k}/{num_steps} loss: {lossf:.4f}") + # simple benchmarking torch.cuda.synchronize() - t1 = time.time() - dt = t1-t0 - mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt) - if stage == 1: - print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%") + for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark + t0 = time.time() + X, Y = 
get_batch('train') + for k in range(num_steps): + with ctx: + logits, loss = model(X, Y) + X, Y = get_batch('train') + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + lossf = loss.item() + print(f"{k}/{num_steps} loss: {lossf:.4f}") + torch.cuda.synchronize() + t1 = time.time() + dt = t1-t0 + mfu = model.estimate_mfu(cfg.batch_size * 1 * num_steps, dt) + if stage == 1: + print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%") diff --git a/config/bench.yaml b/config/bench.yaml new file mode 100644 index 0000000..b31116b --- /dev/null +++ b/config/bench.yaml @@ -0,0 +1,8 @@ +batch_size: 12 +block_size: 1024 +bias: false +real_data: true +seed: 1337 +device: 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc. +compile: true # use PyTorch 2.0 to compile the model to be faster +profile: false # use pytorch profiler, or just simple bench \ No newline at end of file diff --git a/config/eval_gpt2.py b/config/eval_gpt2.py deleted file mode 100644 index 53978cb..0000000 --- a/config/eval_gpt2.py +++ /dev/null @@ -1,8 +0,0 @@ -# evaluate the base gpt2 -# n_layer=12, n_head=12, n_embd=768 -# 124M parameters -batch_size = 8 -eval_iters = 500 # use more iterations to get good estimate -eval_only = True -wandb_log = False -init_from = 'gpt2' diff --git a/config/eval_gpt2_large.py b/config/eval_gpt2_large.py deleted file mode 100644 index 4cbeaef..0000000 --- a/config/eval_gpt2_large.py +++ /dev/null @@ -1,8 +0,0 @@ -# evaluate the base gpt2 -# n_layer=36, n_head=20, n_embd=1280 -# 774M parameters -batch_size = 8 -eval_iters = 500 # use more iterations to get good estimate -eval_only = True -wandb_log = False -init_from = 'gpt2-large' diff --git a/config/eval_gpt2_medium.py b/config/eval_gpt2_medium.py deleted file mode 100644 index 9d0db11..0000000 --- a/config/eval_gpt2_medium.py +++ /dev/null @@ -1,8 +0,0 @@ -# evaluate the base gpt2 -# n_layer=24, n_head=16, n_embd=1024 -# 350M parameters -batch_size = 8 -eval_iters = 500 # use more iterations to get good estimate -eval_only = True -wandb_log = False -init_from = 'gpt2-medium' diff --git a/config/eval_gpt2_xl.py b/config/eval_gpt2_xl.py deleted file mode 100644 index 1bae34f..0000000 --- a/config/eval_gpt2_xl.py +++ /dev/null @@ -1,8 +0,0 @@ -# evaluate the base gpt2 -# n_layer=48, n_head=25, n_embd=1600 -# 1558M parameters -batch_size = 8 -eval_iters = 500 # use more iterations to get good estimate -eval_only = True -wandb_log = False -init_from = 'gpt2-xl' diff --git a/config/finetune_shakespeare.py b/config/finetune_shakespeare.py deleted file mode 100644 index 148a4c4..0000000 --- a/config/finetune_shakespeare.py +++ /dev/null @@ -1,25 +0,0 @@ -import time - -out_dir = 'out-shakespeare' -eval_interval = 5 -eval_iters = 40 -wandb_log = False # feel free to turn on -wandb_project = 'shakespeare' -wandb_run_name = 'ft-' + str(time.time()) - -dataset = 'shakespeare' -init_from = 'gpt2-xl' # this is the largest GPT-2 model - -# only save checkpoints if the validation loss improves -always_save_checkpoint = False - -# the number of examples per iter: -# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter -# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters -batch_size = 1 -gradient_accumulation_steps = 32 -max_iters = 20 - -# finetune at constant LR -learning_rate = 3e-5 -decay_lr = False diff --git a/config/sample.yaml b/config/sample.yaml new file mode 100644 index 0000000..e9c191a --- /dev/null +++ b/config/sample.yaml @@ -0,0 +1,9 @@ +init_from: 'resume' # either 'resume' 
(from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl') +out_dir: 'out' # ignored if init_from is not 'resume' +start: "\n" # or "" or etc. Can also specify a file, use as: "FILE:prompt.txt" +num_samples: 10 # number of samples to draw +max_new_tokens: 500 # number of tokens generated in each sample +temperature: 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions +top_k: 200 # retain only the top_k most likely tokens, clamp others to have 0 probability +seed: 1337 +device: 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc. \ No newline at end of file diff --git a/config/train.yaml b/config/train.yaml new file mode 100644 index 0000000..c8fbe7e --- /dev/null +++ b/config/train.yaml @@ -0,0 +1,39 @@ +out_dir: 'out' +eval_interval: 2000 +log_interval: 1 +eval_iters: 200 +eval_only: false # if True, script exits right after the first eval +always_save_checkpoint: true # if True, always save a checkpoint after each eval +init_from: 'scratch' # 'scratch' or 'resume' or 'gpt2*' +# wandb logging +wandb_log: false # disabled by default +wandb_project: 'owt' +wandb_run_name: 'gpt2' # 'run' + str(time.time()) +# data +dataset: 'openwebtext' +gradient_accumulation_steps: 40 # used to simulate larger batch sizes (5 * 8) +batch_size: 12 # if gradient_accumulation_steps > 1, this is the micro-batch size +block_size: 1024 +# model +n_layer: 12 +n_head: 12 +n_embd: 768 +dropout: 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias: false # do we use bias inside LayerNorm and Linear layers? +# adamw optimizer +learning_rate: 0.0006 # max learning rate (6e-4) +max_iters: 600000 # total number of training iterations +weight_decay: 0.1 +beta1: 0.9 +beta2: 0.95 +grad_clip: 1.0 # clip gradients at this value, or disable if == 0.0 +# learning rate decay settings +decay_lr: true # whether to decay the learning rate +warmup_iters: 2000 # how many steps to warm up for +lr_decay_iters: 600000 # should be ~= max_iters per Chinchilla +min_lr: 0.00006 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla +# DDP settings +ddp: 'torch' +backend: 'nccl' # 'nccl', 'gloo', etc. 
+# system +device: 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks \ No newline at end of file diff --git a/config/train_config/eval_gpt2.yaml b/config/train_config/eval_gpt2.yaml new file mode 100644 index 0000000..2da6e08 --- /dev/null +++ b/config/train_config/eval_gpt2.yaml @@ -0,0 +1,10 @@ +# evaluate the base gpt2 +# n_layer=12, n_head=12, n_embd=768 +# 124M parameters +defaults: + - ../train +batch_size: 8 +eval_iters: 500 # use more iterations to get good estimate +eval_only: True +wandb_log: False +init_from: 'gpt2' diff --git a/config/train_config/eval_gpt2_large.yaml b/config/train_config/eval_gpt2_large.yaml new file mode 100644 index 0000000..12ec87f --- /dev/null +++ b/config/train_config/eval_gpt2_large.yaml @@ -0,0 +1,10 @@ +# evaluate the base gpt2 +# n_layer=36, n_head=20, n_embd=1280 +# 774M parameters +defaults: + - ../train +batch_size: 8 +eval_iters: 500 # use more iterations to get good estimate +eval_only: true +wandb_log: false +init_from: 'gpt2-large' \ No newline at end of file diff --git a/config/train_config/eval_gpt2_medium.yaml b/config/train_config/eval_gpt2_medium.yaml new file mode 100644 index 0000000..16cb287 --- /dev/null +++ b/config/train_config/eval_gpt2_medium.yaml @@ -0,0 +1,11 @@ +# evaluate the base gpt2 +# n_layer=24, n_head=16, n_embd=1024 +# 350M parameters +defaults: + - ../train + +batch_size: 8 +eval_iters: 500 # use more iterations to get good estimate +eval_only: true +wandb_log: false +init_from: 'gpt2-medium' diff --git a/config/train_config/eval_gpt2_xl.yaml b/config/train_config/eval_gpt2_xl.yaml new file mode 100644 index 0000000..2eae760 --- /dev/null +++ b/config/train_config/eval_gpt2_xl.yaml @@ -0,0 +1,10 @@ +# evaluate the base gpt2 +# n_layer=48, n_head=25, n_embd=1600 +# 1558M parameters +defaults: + - ../train +batch_size: 8 +eval_iters: 500 # use more iterations to get good estimate +eval_only: true +wandb_log: false +init_from: 'gpt2-xl' diff --git a/config/train_config/finetune_shakespeare.yaml b/config/train_config/finetune_shakespeare.yaml new file mode 100644 index 0000000..1e3d6c0 --- /dev/null +++ b/config/train_config/finetune_shakespeare.yaml @@ -0,0 +1,26 @@ +defaults: + - ../train + +out_dir: 'out-shakespeare' +eval_interval: 5 +eval_iters: 40 +wandb_log: False # feel free to turn on +wandb_project: 'shakespeare' +wandb_run_name: 'ft-shakespeare' + +dataset: 'shakespeare' +init_from: 'gpt2-xl' # this is the largest GPT-2 model + +# only save checkpoints if the validation loss improves +always_save_checkpoint: false + +# the number of examples per iter: +# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter +# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters +batch_size: 1 +gradient_accumulation_steps: 32 +max_iters: 20 + +# finetune at constant LR +learning_rate: 3e-5 +decay_lr: false diff --git a/config/train_gpt2.py b/config/train_config/train_gpt2.yaml similarity index 61% rename from config/train_gpt2.py rename to config/train_config/train_gpt2.yaml index 8f19273..8b18fd5 100644 --- a/config/train_gpt2.py +++ b/config/train_config/train_gpt2.yaml @@ -1,25 +1,27 @@ # config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB # launch as the following (e.g. 
in a screen session) and wait ~5 days: # $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py +defaults: + - ../train -wandb_log = True -wandb_project = 'owt' -wandb_run_name='gpt2-124M' +wandb_log: true +wandb_project: 'owt' +wandb_run_name: 'gpt2-124M' # these make the total batch size be ~0.5M # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 -batch_size = 12 -block_size = 1024 -gradient_accumulation_steps = 5 * 8 +batch_size: 12 +block_size: 1024 +gradient_accumulation_steps: 5 * 8 # this makes total number of tokens be 300B -max_iters = 600000 -lr_decay_iters = 600000 +max_iters: 600000 +lr_decay_iters: 600000 # eval stuff -eval_interval = 1000 -eval_iters = 200 -log_interval = 10 +eval_interval: 1000 +eval_iters: 200 +log_interval: 10 # weight decay -weight_decay = 1e-1 +weight_decay: 1e-1 diff --git a/config/train_config/train_shakespeare_char.yaml b/config/train_config/train_shakespeare_char.yaml new file mode 100644 index 0000000..4523551 --- /dev/null +++ b/config/train_config/train_shakespeare_char.yaml @@ -0,0 +1,39 @@ +# train a miniature character-level shakespeare model +# good for debugging and playing on macbooks and such +defaults: + - ../train + +out_dir: 'out-shakespeare-char' +eval_interval: 250 # keep frequent because we'll overfit +eval_iters: 200 +log_interval: 10 # don't print too too often + +# we expect to overfit on this small dataset, so only save when val improves +always_save_checkpoint: false + +wandb_log: False # override via command line if you like +wandb_project: 'shakespeare-char' +wandb_run_name: 'mini-gpt' + +dataset: 'shakespeare_char' +gradient_accumulation_steps: 1 +batch_size: 64 +block_size: 256 # context of up to 256 previous characters + +# baby GPT model :) +n_layer: 6 +n_head: 6 +n_embd: 384 +dropout: 0.2 + +learning_rate: 1e-3 # with baby networks can afford to go a bit higher +max_iters: 5000 +lr_decay_iters: 5000 # make equal to max_iters usually +min_lr: 1e-4 # learning_rate / 10 usually +beta2: 0.99 # make a bit bigger because number of tokens per iter is small + +warmup_iters: 100 # not super necessary potentially + +# on macbook also add +# device = 'cpu' # run on cpu only +# compile = False # do not torch compile the model diff --git a/config/train_shakespeare_char.py b/config/train_shakespeare_char.py deleted file mode 100644 index 41c81df..0000000 --- a/config/train_shakespeare_char.py +++ /dev/null @@ -1,37 +0,0 @@ -# train a miniature character-level shakespeare model -# good for debugging and playing on macbooks and such - -out_dir = 'out-shakespeare-char' -eval_interval = 250 # keep frequent because we'll overfit -eval_iters = 200 -log_interval = 10 # don't print too too often - -# we expect to overfit on this small dataset, so only save when val improves -always_save_checkpoint = False - -wandb_log = False # override via command line if you like -wandb_project = 'shakespeare-char' -wandb_run_name = 'mini-gpt' - -dataset = 'shakespeare_char' -gradient_accumulation_steps = 1 -batch_size = 64 -block_size = 256 # context of up to 256 previous characters - -# baby GPT model :) -n_layer = 6 -n_head = 6 -n_embd = 384 -dropout = 0.2 - -learning_rate = 1e-3 # with baby networks can afford to go a bit higher -max_iters = 5000 -lr_decay_iters = 5000 # make equal to max_iters usually -min_lr = 1e-4 # learning_rate / 10 usually -beta2 = 0.99 # make a bit bigger because number of tokens per iter is small - -warmup_iters = 100 # not super necessary potentially - -# on macbook also add -# device = 'cpu' # 
run on cpu only -# compile = False # do not torch compile the model diff --git a/configurator.py b/configurator.py deleted file mode 100644 index a8bba95..0000000 --- a/configurator.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Poor Man's Configurator. Probably a terrible idea. Example usage: -$ python train.py config/override_file.py --batch_size=32 -this will first run config/override_file.py, then override batch_size to 32 - -The code in this file will be run as follows from e.g. train.py: ->>> exec(open('configurator.py').read()) - -So it's not a Python module, it's just shuttling this code away from train.py -The code in this script then overrides the globals() - -I know people are not going to love this, I just really dislike configuration -complexity and having to prepend config. to every single variable. If someone -comes up with a better simple Python solution I am all ears. -""" - -import sys -from ast import literal_eval - -for arg in sys.argv[1:]: - if '=' not in arg: - # assume it's the name of a config file - assert not arg.startswith('--') - config_file = arg - print(f"Overriding config with {config_file}:") - with open(config_file) as f: - print(f.read()) - exec(open(config_file).read()) - else: - # assume it's a --key=value argument - assert arg.startswith('--') - key, val = arg.split('=') - key = key[2:] - if key in globals(): - try: - # attempt to eval it it (e.g. if bool, number, or etc) - attempt = literal_eval(val) - except (SyntaxError, ValueError): - # if that goes wrong, just use the string - attempt = val - # ensure the types match ok - assert type(attempt) == type(globals()[key]) - # cross fingers - print(f"Overriding: {key} = {attempt}") - globals()[key] = attempt - else: - raise ValueError(f"Unknown config key: {key}") diff --git a/model.py b/model.py index c698f8b..b40d8a9 100644 --- a/model.py +++ b/model.py @@ -14,6 +14,8 @@ import torch import torch.nn as nn from torch.nn import functional as F +from typing import Dict, List, Optional +from omegaconf import DictConfig class LayerNorm(nn.Module): """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """ @@ -49,7 +51,7 @@ def __init__(self, config): self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)) .view(1, 1, config.block_size, config.block_size)) - def forward(self, x): + def forward(self, x: torch.Tensor): B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) # calculate query, key, values for all heads in batch and move head forward to be the batch dim @@ -77,14 +79,14 @@ def forward(self, x): class MLP(nn.Module): - def __init__(self, config): + def __init__(self, config: DictConfig): super().__init__() self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias) self.gelu = nn.GELU() self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias) self.dropout = nn.Dropout(config.dropout) - def forward(self, x): + def forward(self, x: torch.Tensor): x = self.c_fc(x) x = self.gelu(x) x = self.c_proj(x) @@ -147,7 +149,7 @@ def __init__(self, config): # report number of parameters print("number of parameters: %.2fM" % (self.get_num_params()/1e6,)) - def get_num_params(self, non_embedding=True): + def get_num_params(self, non_embedding: bool=True): """ Return the number of parameters in the model. For non-embedding count (default), the position embeddings get subtracted. 
@@ -159,7 +161,7 @@ def get_num_params(self, non_embedding=True): n_params -= self.transformer.wpe.weight.numel() return n_params - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): if isinstance(module, nn.Linear): torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) if module.bias is not None: @@ -167,7 +169,7 @@ def _init_weights(self, module): elif isinstance(module, nn.Embedding): torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) - def forward(self, idx, targets=None): + def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor]=None): device = idx.device b, t = idx.size() assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" @@ -192,7 +194,7 @@ def forward(self, idx, targets=None): return logits, loss - def crop_block_size(self, block_size): + def crop_block_size(self, block_size: int): # model surgery to decrease the block size if necessary # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024) # but want to use a smaller block size for some smaller, simpler model @@ -204,7 +206,7 @@ def crop_block_size(self, block_size): block.attn.bias = block.attn.bias[:,:,:block_size,:block_size] @classmethod - def from_pretrained(cls, model_type, override_args=None): + def from_pretrained(cls, model_type: str, override_args: Optional[dict]=None): assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'} override_args = override_args or {} # default to empty dict # only dropout can be overridden see more notes below @@ -260,7 +262,7 @@ def from_pretrained(cls, model_type, override_args=None): return model - def configure_optimizers(self, weight_decay, learning_rate, betas, device_type): + def configure_optimizers(self, weight_decay: float, learning_rate: float, betas: float, device_type: str): # start with all of the candidate parameters param_dict = {pn: p for pn, p in self.named_parameters()} # filter out those that do not require grad @@ -303,7 +305,7 @@ def estimate_mfu(self, fwdbwd_per_iter, dt): return mfu @torch.no_grad() - def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None): + def generate(self, idx: torch.Tensor, max_new_tokens: int, temperature: float=1.0, top_k: Optional[int]=None): """ Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete the sequence max_new_tokens times, feeding the predictions back into the model each time. diff --git a/sample.py b/sample.py index d25d6e0..8879011 100644 --- a/sample.py +++ b/sample.py @@ -7,83 +7,76 @@ import torch import tiktoken from model import GPTConfig, GPT +from utils import setup_torch +import hydra +from omegaconf import DictConfig -# ----------------------------------------------------------------------------- -init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl') -out_dir = 'out' # ignored if init_from is not 'resume' -start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt" -num_samples = 10 # number of samples to draw -max_new_tokens = 500 # number of tokens generated in each sample -temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions -top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability -seed = 1337 -device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc. 
-dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16' -compile = False # use PyTorch 2.0 to compile the model to be faster -exec(open('configurator.py').read()) # overrides from command line or config file -# ----------------------------------------------------------------------------- +@hydra.main(config_path="config", config_name="sample") +def main(cfg: DictConfig) -> None: -torch.manual_seed(seed) -torch.cuda.manual_seed(seed) -torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul -torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn -device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast -ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] -ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) + # ----------------------------------------------------------------------------- + dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16' + compile = False # use PyTorch 2.0 to compile the model to be faster + # ----------------------------------------------------------------------------- -# model -if init_from == 'resume': - # init from a model saved in a specific directory - ckpt_path = os.path.join(out_dir, 'ckpt.pt') - checkpoint = torch.load(ckpt_path, map_location=device) - gptconf = GPTConfig(**checkpoint['model_args']) - model = GPT(gptconf) - state_dict = checkpoint['model'] - unwanted_prefix = '_orig_mod.' - for k,v in list(state_dict.items()): - if k.startswith(unwanted_prefix): - state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) - model.load_state_dict(state_dict) -elif init_from.startswith('gpt2'): - # init from a given GPT-2 model - model = GPT.from_pretrained(init_from, dict(dropout=0.0)) + setup_torch(cfg.seed) + device_type = 'cuda' if 'cuda' in cfg.device else 'cpu' # for later use in torch.autocast + ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] + ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) -model.eval() -model.to(device) -if compile: - model = torch.compile(model) # requires PyTorch 2.0 (optional) + # model + if cfg.init_from == 'resume': + # init from a model saved in a specific directory + ckpt_path = os.path.join(cfg.out_dir, 'ckpt.pt') + checkpoint = torch.load(ckpt_path, map_location=cfg.device) + gptconf = GPTConfig(**checkpoint['model_args']) + model = GPT(gptconf) + state_dict = checkpoint['model'] + unwanted_prefix = '_orig_mod.' + for k,v in list(state_dict.items()): + if k.startswith(unwanted_prefix): + state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) + model.load_state_dict(state_dict) + elif cfg.init_from.startswith('gpt2'): + # init from a given GPT-2 model + model = GPT.from_pretrained(cfg.init_from, dict(dropout=0.0)) -# look for the meta pickle in case it is available in the dataset folder -load_meta = False -if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these... 
-    meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
-    load_meta = os.path.exists(meta_path)
-if load_meta:
-    print(f"Loading meta from {meta_path}...")
-    with open(meta_path, 'rb') as f:
-        meta = pickle.load(f)
-    # TODO want to make this more general to arbitrary encoder/decoder schemes
-    stoi, itos = meta['stoi'], meta['itos']
-    encode = lambda s: [stoi[c] for c in s]
-    decode = lambda l: ''.join([itos[i] for i in l])
-else:
-    # ok let's assume gpt-2 encodings by default
-    print("No meta.pkl found, assuming GPT-2 encodings...")
-    enc = tiktoken.get_encoding("gpt2")
-    encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
-    decode = lambda l: enc.decode(l)
+    model.eval()
+    model.to(cfg.device)
+    if compile:
+        model = torch.compile(model) # requires PyTorch 2.0 (optional)
 
-# encode the beginning of the prompt
-if start.startswith('FILE:'):
-    with open(start[5:], 'r', encoding='utf-8') as f:
-        start = f.read()
-start_ids = encode(start)
-x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
+    # look for the meta pickle in case it is available in the dataset folder
+    load_meta = False
+    if cfg.init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
+        meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
+        load_meta = os.path.exists(meta_path)
+    if load_meta:
+        print(f"Loading meta from {meta_path}...")
+        with open(meta_path, 'rb') as f:
+            meta = pickle.load(f)
+        # TODO want to make this more general to arbitrary encoder/decoder schemes
+        stoi, itos = meta['stoi'], meta['itos']
+        encode = lambda s: [stoi[c] for c in s]
+        decode = lambda l: ''.join([itos[i] for i in l])
+    else:
+        # ok let's assume gpt-2 encodings by default
+        print("No meta.pkl found, assuming GPT-2 encodings...")
+        enc = tiktoken.get_encoding("gpt2")
+        encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
+        decode = lambda l: enc.decode(l)
 
-# run generation
-with torch.no_grad():
-    with ctx:
-        for k in range(num_samples):
-            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
-            print(decode(y[0].tolist()))
-            print('---------------')
+    start = cfg.start # encode the beginning of the prompt
+    if start.startswith('FILE:'):
+        with open(start[5:], 'r', encoding='utf-8') as f:
+            start = f.read()
+    start_ids = encode(start)
+    x = (torch.tensor(start_ids, dtype=torch.long, device=cfg.device)[None, ...])
+
+    # run generation
+    with torch.no_grad():
+        with ctx:
+            for k in range(cfg.num_samples):
+                y = model.generate(x, cfg.max_new_tokens, temperature=cfg.temperature, top_k=cfg.top_k)
+                print(decode(y[0].tolist()))
+                print('---------------')
diff --git a/train.py b/train.py
index 951bda9..ef4f1d7 100644
--- a/train.py
+++ b/train.py
@@ -26,50 +26,19 @@
 import torch
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.distributed import init_process_group, destroy_process_group
+import hydra
+from omegaconf import DictConfig
+import omegaconf
+import deepspeed
 from model import GPTConfig, GPT
+from torch.optim.lr_scheduler import _LRScheduler
+from utils import setup_torch
+
 # -----------------------------------------------------------------------------
 # default config values designed to train a gpt2 (124M) on OpenWebText
 # I/O
-out_dir = 'out'
-eval_interval = 2000
-log_interval = 1
-eval_iters = 200
-eval_only = False # if True, script exits right after the first eval
-always_save_checkpoint = True # if True, always save a checkpoint after
each eval -init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*' -# wandb logging -wandb_log = False # disabled by default -wandb_project = 'owt' -wandb_run_name = 'gpt2' # 'run' + str(time.time()) -# data -dataset = 'openwebtext' -gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes -batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size -block_size = 1024 -# model -n_layer = 12 -n_head = 12 -n_embd = 768 -dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ -bias = False # do we use bias inside LayerNorm and Linear layers? -# adamw optimizer -learning_rate = 6e-4 # max learning rate -max_iters = 600000 # total number of training iterations -weight_decay = 1e-1 -beta1 = 0.9 -beta2 = 0.95 -grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0 -# learning rate decay settings -decay_lr = True # whether to decay the learning rate -warmup_iters = 2000 # how many steps to warm up for -lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla -min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla -# DDP settings -backend = 'nccl' # 'nccl', 'gloo', etc. -# system -device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler compile = True # use PyTorch 2.0 to compile the model to be faster # ----------------------------------------------------------------------------- @@ -78,259 +47,266 @@ config = {k: globals()[k] for k in config_keys} # will be useful for logging # ----------------------------------------------------------------------------- -# various inits, derived attributes, I/O setup -ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run? -if ddp: - init_process_group(backend=backend) - ddp_rank = int(os.environ['RANK']) - ddp_local_rank = int(os.environ['LOCAL_RANK']) - ddp_world_size = int(os.environ['WORLD_SIZE']) - device = f'cuda:{ddp_local_rank}' - torch.cuda.set_device(device) - master_process = ddp_rank == 0 # this process will do logging, checkpointing etc. 
-    seed_offset = ddp_rank # each process gets a different seed
-    # world_size number of processes will be training simultaneously, so we can scale
-    # down the desired gradient accumulation iterations per process proportionally
-    assert gradient_accumulation_steps % ddp_world_size == 0
-    gradient_accumulation_steps //= ddp_world_size
-else:
-    # if not ddp, we are running on a single gpu, and one process
-    master_process = True
-    seed_offset = 0
-    ddp_world_size = 1
-tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
-print(f"tokens per iteration will be: {tokens_per_iter:,}")
+class CustomLRScheduler(_LRScheduler):
+    def __init__(self, optimizer, warmup_iters, lr_decay_iters, min_lr, last_epoch=-1):
+        self.warmup_iters = warmup_iters
+        self.lr_decay_iters = lr_decay_iters
+        self.min_lr = min_lr
+        super(CustomLRScheduler, self).__init__(optimizer, last_epoch)
 
-if master_process:
-    os.makedirs(out_dir, exist_ok=True)
-torch.manual_seed(1337 + seed_offset)
-torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
-torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
-device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
-# note: float16 data type will automatically use a GradScaler
-ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
-ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+    def get_lr(self):
+        it = self.last_epoch
+        if it < self.warmup_iters:
+            return [base_lr * it / self.warmup_iters for base_lr in self.base_lrs]
+        if it > self.lr_decay_iters:
+            return [self.min_lr for _ in self.base_lrs]
+
+        decay_ratio = (it - self.warmup_iters) / (self.lr_decay_iters - self.warmup_iters)
+        assert 0 <= decay_ratio <= 1
+        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
+        return [self.min_lr + coeff * (base_lr - self.min_lr) for base_lr in self.base_lrs]
+
 
-# poor man's data loader
-data_dir = os.path.join('data', dataset)
-def get_batch(split):
-    # We recreate np.memmap every batch to avoid a memory leak, as per
-    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
-    if split == 'train':
-        data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
-    else:
-        data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
-    ix = torch.randint(len(data) - block_size, (batch_size,))
-    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
-    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
-    if device_type == 'cuda':
-        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
-        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+@hydra.main(config_path="config", config_name="train", version_base=None)
+def main(cfg: DictConfig) -> None:
+    setup_torch(cfg.seed)
+    # various inits, derived attributes, I/O setup
+    ddp = cfg.ddp
+    assert ddp in ["deepspeed", "torch", False]
+    if ddp == "torch":
+        init_process_group(backend=cfg.backend)
+        ddp_rank = int(os.environ['RANK'])
+        ddp_local_rank = int(os.environ['LOCAL_RANK'])
+        ddp_world_size = int(os.environ['WORLD_SIZE'])
+        device = f'cuda:{ddp_local_rank}'
+        torch.cuda.set_device(device)
+        master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
+ seed_offset = ddp_rank # each process gets a different seed + # world_size number of processes will be training simultaneously, so we can scale + # down the desired gradient accumulation iterations per process proportionally + assert cfg.gradient_accumulation_steps % ddp_world_size == 0 + cfg.gradient_accumulation_steps //= ddp_world_size else: - x, y = x.to(device), y.to(device) - return x, y + # if not ddp, we are running on a single gpu, and one process + master_process = True + seed_offset = 0 + ddp_world_size = 1 + tokens_per_iter = cfg.gradient_accumulation_steps * ddp_world_size * cfg.batch_size * cfg.block_size + print(f"tokens per iteration will be: {tokens_per_iter:,}") -# init these up here, can override if init_from='resume' (i.e. from a checkpoint) -iter_num = 0 -best_val_loss = 1e9 + if master_process: + os.makedirs(cfg.out_dir, exist_ok=True) + device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast + # note: float16 data type will automatically use a GradScaler + ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] + ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) -# attempt to derive vocab_size from the dataset -meta_path = os.path.join(data_dir, 'meta.pkl') -meta_vocab_size = None -if os.path.exists(meta_path): - with open(meta_path, 'rb') as f: - meta = pickle.load(f) - meta_vocab_size = meta['vocab_size'] - print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})") + # poor man's data loader + data_dir = os.path.join('data', cfg.dataset) + def get_batch(split): + # We recreate np.memmap every batch to avoid a memory leak, as per + # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122 + if split == 'train': + data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r') + else: + data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r') + ix = torch.randint(len(data) - cfg.block_size, (cfg.batch_size,)) + x = torch.stack([torch.from_numpy((data[i:i+cfg.block_size]).astype(np.int64)) for i in ix]) + y = torch.stack([torch.from_numpy((data[i+1:i+1+cfg.block_size]).astype(np.int64)) for i in ix]) + if device_type == 'cuda': + # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True) + x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True) + else: + x, y = x.to(device), y.to(device) + return x, y -# model init -model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size, - bias=bias, vocab_size=None, dropout=dropout) # start with model_args from command line -if init_from == 'scratch': - # init a new model from scratch - print("Initializing a new model from scratch") - # determine the vocab size we'll use for from-scratch training - if meta_vocab_size is None: - print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)") - model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304 - gptconf = GPTConfig(**model_args) - model = GPT(gptconf) -elif init_from == 'resume': - print(f"Resuming training from {out_dir}") - # resume training from a checkpoint. 
- ckpt_path = os.path.join(out_dir, 'ckpt.pt') - checkpoint = torch.load(ckpt_path, map_location=device) - checkpoint_model_args = checkpoint['model_args'] - # force these config attributes to be equal otherwise we can't even resume training - # the rest of the attributes (e.g. dropout) can stay as desired from command line - for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']: - model_args[k] = checkpoint_model_args[k] - # create the model - gptconf = GPTConfig(**model_args) - model = GPT(gptconf) - state_dict = checkpoint['model'] - # fix the keys of the state dictionary :( - # honestly no idea how checkpoints sometimes get this prefix, have to debug more - unwanted_prefix = '_orig_mod.' - for k,v in list(state_dict.items()): - if k.startswith(unwanted_prefix): - state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) - model.load_state_dict(state_dict) - iter_num = checkpoint['iter_num'] - best_val_loss = checkpoint['best_val_loss'] -elif init_from.startswith('gpt2'): - print(f"Initializing from OpenAI GPT-2 weights: {init_from}") - # initialize from OpenAI GPT-2 weights - override_args = dict(dropout=dropout) - model = GPT.from_pretrained(init_from, override_args) - # read off the created config params, so we can store them into checkpoint correctly - for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']: - model_args[k] = getattr(model.config, k) -# crop down the model block size if desired, using model surgery -if block_size < model.config.block_size: - model.crop_block_size(block_size) - model_args['block_size'] = block_size # so that the checkpoint will have the right value -model.to(device) + # init these up here, can override if init_from='resume' (i.e. from a checkpoint) + iter_num = 0 + best_val_loss = 1e9 -# initialize a GradScaler. If enabled=False scaler is a no-op -scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16')) + # attempt to derive vocab_size from the dataset + meta_path = os.path.join(data_dir, 'meta.pkl') + meta_vocab_size = None + if os.path.exists(meta_path): + with open(meta_path, 'rb') as f: + meta = pickle.load(f) + meta_vocab_size = meta['vocab_size'] + print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})") -# optimizer -optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type) -if init_from == 'resume': - optimizer.load_state_dict(checkpoint['optimizer']) -checkpoint = None # free up memory + # model init + model_args = dict(n_layer=cfg.n_layer, n_head=cfg.n_head, n_embd=cfg.n_embd, block_size=cfg.block_size, + bias=cfg.bias, vocab_size=None, dropout=cfg.dropout) # start with model_args from command line + if cfg.init_from == 'scratch': + # init a new model from scratch + print("Initializing a new model from scratch") + # determine the vocab size we'll use for from-scratch training + if meta_vocab_size is None: + print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)") + model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304 + gptconf = GPTConfig(**model_args) + model = GPT(gptconf) + elif cfg.init_from == 'resume': + print(f"Resuming training from {cfg.out_dir}") + # resume training from a checkpoint. + ckpt_path = os.path.join(cfg.out_dir, 'ckpt.pt') + checkpoint = torch.load(ckpt_path, map_location=device) + checkpoint_model_args = checkpoint['model_args'] + # force these config attributes to be equal otherwise we can't even resume training + # the rest of the attributes (e.g. 
dropout) can stay as desired from command line + for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']: + model_args[k] = checkpoint_model_args[k] + # create the model + gptconf = GPTConfig(**model_args) + model = GPT(gptconf) + state_dict = checkpoint['model'] + # fix the keys of the state dictionary :( + # honestly no idea how checkpoints sometimes get this prefix, have to debug more + unwanted_prefix = '_orig_mod.' + for k,v in list(state_dict.items()): + if k.startswith(unwanted_prefix): + state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) + model.load_state_dict(state_dict) + iter_num = checkpoint['iter_num'] + best_val_loss = checkpoint['best_val_loss'] + elif cfg.init_from.startswith('gpt2'): + print(f"Initializing from OpenAI GPT-2 weights: {cfg.init_from}") + # initialize from OpenAI GPT-2 weights + override_args = dict(dropout=cfg.dropout) + model = GPT.from_pretrained(cfg.init_from, override_args) + # read off the created config params, so we can store them into checkpoint correctly + for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']: + model_args[k] = getattr(model.config, k) + # crop down the model block size if desired, using model surgery + if cfg.block_size < model.config.block_size: + model.crop_block_size(cfg.block_size) + model_args['block_size'] = cfg.block_size # so that the checkpoint will have the right value + model.to(device) -# compile the model -if compile: - print("compiling the model... (takes a ~minute)") - unoptimized_model = model - model = torch.compile(model) # requires PyTorch 2.0 + # initialize a GradScaler. If enabled=False scaler is a no-op + scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16')) -# wrap model into DDP container -if ddp: - model = DDP(model, device_ids=[ddp_local_rank]) + # optimizer + optimizer = model.configure_optimizers(cfg.weight_decay, cfg.learning_rate, (cfg.beta1, cfg.beta2), device_type) + if cfg.init_from == 'resume': + optimizer.load_state_dict(checkpoint['optimizer']) + checkpoint = None # free up memory -# helps estimate an arbitrarily accurate loss over either split using many batches -@torch.no_grad() -def estimate_loss(): - out = {} - model.eval() - for split in ['train', 'val']: - losses = torch.zeros(eval_iters) - for k in range(eval_iters): - X, Y = get_batch(split) - with ctx: - logits, loss = model(X, Y) - losses[k] = loss.item() - out[split] = losses.mean() - model.train() - return out + # compile the model + if compile: + print("compiling the model... 
(takes a ~minute)") + unoptimized_model = model + model = torch.compile(model) # requires PyTorch 2.0 -# learning rate decay scheduler (cosine with warmup) -def get_lr(it): - # 1) linear warmup for warmup_iters steps - if it < warmup_iters: - return learning_rate * it / warmup_iters - # 2) if it > lr_decay_iters, return min learning rate - if it > lr_decay_iters: - return min_lr - # 3) in between, use cosine decay down to min learning rate - decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters) - assert 0 <= decay_ratio <= 1 - coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1 - return min_lr + coeff * (learning_rate - min_lr) + # wrap model into DDP container + if ddp == "torch": + model = DDP(model, device_ids=[ddp_local_rank]) -# logging -if wandb_log and master_process: - import wandb - wandb.init(project=wandb_project, name=wandb_run_name, config=config) + # scheduler + scheduler = CustomLRScheduler(optimizer, cfg.warmup_iters, cfg.lr_decay_iters, cfg.min_lr) -# training loop -X, Y = get_batch('train') # fetch the very first batch -t0 = time.time() -local_iter_num = 0 # number of iterations in the lifetime of this process -raw_model = model.module if ddp else model # unwrap DDP container if needed -running_mfu = -1.0 -while True: + # helps estimate an arbitrarily accurate loss over either split using many batches + @torch.no_grad() + def estimate_loss(): + out = {} + model.eval() + for split in ['train', 'val']: + losses = torch.zeros(cfg.eval_iters) + for k in range(cfg.eval_iters): + X, Y = get_batch(split) + with ctx: + logits, loss = model(X, Y) + losses[k] = loss.item() + out[split] = losses.mean() + model.train() + return out - # determine and set the learning rate for this iteration - lr = get_lr(iter_num) if decay_lr else learning_rate - for param_group in optimizer.param_groups: - param_group['lr'] = lr + # logging + if cfg.wandb_log and master_process: + import wandb + wandb.init(project=cfg.wandb_project, name=cfg.wandb_run_name, config=omegaconf.OmegaConf.to_container(cfg, resolve=True)) - # evaluate the loss on train/val sets and write checkpoints - if iter_num % eval_interval == 0 and master_process: - losses = estimate_loss() - print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}") - if wandb_log: - wandb.log({ - "iter": iter_num, - "train/loss": losses['train'], - "val/loss": losses['val'], - "lr": lr, - "mfu": running_mfu*100, # convert to percentage - }) - if losses['val'] < best_val_loss or always_save_checkpoint: - best_val_loss = losses['val'] - if iter_num > 0: - checkpoint = { - 'model': raw_model.state_dict(), - 'optimizer': optimizer.state_dict(), - 'model_args': model_args, - 'iter_num': iter_num, - 'best_val_loss': best_val_loss, - 'config': config, - } - print(f"saving checkpoint to {out_dir}") - torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt')) - if iter_num == 0 and eval_only: - break + # training loop + X, Y = get_batch('train') # fetch the very first batch + t0 = time.time() + local_iter_num = 0 # number of iterations in the lifetime of this process + raw_model = model.module if ddp else model # unwrap DDP container if needed + running_mfu = -1.0 + while True: - # forward backward update, with optional gradient accumulation to simulate larger batch size - # and using the GradScaler if data type is float16 - for micro_step in range(gradient_accumulation_steps): - if ddp: - # in DDP training we only need to sync gradients at the last micro step. 
- # the official way to do this is with model.no_sync() context manager, but - # I really dislike that this bloats the code and forces us to repeat code - # looking at the source of that context manager, it just toggles this variable - model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1) - with ctx: - logits, loss = model(X, Y) - loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation - # immediately async prefetch next batch while model is doing the forward pass on the GPU - X, Y = get_batch('train') - # backward pass, with gradient scaling if training in fp16 - scaler.scale(loss).backward() - # clip the gradient - if grad_clip != 0.0: - scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) - # step the optimizer and scaler if training in fp16 - scaler.step(optimizer) - scaler.update() - # flush the gradients as soon as we can, no need for this memory anymore - optimizer.zero_grad(set_to_none=True) + # evaluate the loss on train/val sets and write checkpoints + if iter_num % cfg.eval_interval == 0 and master_process: + losses = estimate_loss() + print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}") + if cfg.wandb_log: + wandb.log({ + "iter": iter_num, + "train/loss": losses['train'], + "val/loss": losses['val'], + "lr": lr, + "mfu": running_mfu*100, # convert to percentage + }) + if losses['val'] < best_val_loss or cfg.always_save_checkpoint: + best_val_loss = losses['val'] + if iter_num > 0: + checkpoint = { + 'model': raw_model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'model_args': model_args, + 'iter_num': iter_num, + 'best_val_loss': best_val_loss, + 'config': cfg, + } + print(f"saving checkpoint to {cfg.out_dir}") + torch.save(checkpoint, os.path.join(cfg.out_dir, 'ckpt.pt')) + if iter_num == 0 and cfg.eval_only: + break + + # forward backward update, with optional gradient accumulation to simulate larger batch size + # and using the GradScaler if data type is float16 + for micro_step in range(cfg.gradient_accumulation_steps): + if ddp == "torch": + # in DDP training we only need to sync gradients at the last micro step. + # the official way to do this is with model.no_sync() context manager, but + # I really dislike that this bloats the code and forces us to repeat code + # looking at the source of that context manager, it just toggles this variable + model.require_backward_grad_sync = (micro_step == cfg.gradient_accumulation_steps - 1) + with ctx: + logits, loss = model(X, Y) + loss = loss / cfg.gradient_accumulation_steps # scale the loss to account for gradient accumulation + # immediately async prefetch next batch while model is doing the forward pass on the GPU + X, Y = get_batch('train') + # backward pass, with gradient scaling if training in fp16 + scaler.scale(loss).backward() + # clip the gradient + if cfg.grad_clip != 0.0: + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip) + # step the optimizer and scaler if training in fp16 + scaler.step(optimizer) + scaler.update() + # step the scheduler + scheduler.step() + # flush the gradients as soon as we can, no need for this memory anymore + optimizer.zero_grad(set_to_none=True) - # timing and logging - t1 = time.time() - dt = t1 - t0 - t0 = t1 - if iter_num % log_interval == 0 and master_process: - # get loss as float. 
-        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
-        lossf = loss.item() * gradient_accumulation_steps
-        if local_iter_num >= 5: # let the training loop settle a bit
-            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
-            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
-        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
-    iter_num += 1
-    local_iter_num += 1
+        # timing and logging
+        t1 = time.time()
+        dt = t1 - t0
+        t0 = t1
+        if iter_num % cfg.log_interval == 0 and master_process:
+            # get loss as float. note: this is a CPU-GPU sync point
+            # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
+            lossf = loss.item() * cfg.gradient_accumulation_steps
+            if local_iter_num >= 5: # let the training loop settle a bit
+                mfu = raw_model.estimate_mfu(cfg.batch_size * cfg.gradient_accumulation_steps, dt)
+                running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
+            print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
+        iter_num += 1
+        local_iter_num += 1
-    # termination conditions
-    if iter_num > max_iters:
-        break
+        # termination conditions
+        if iter_num > cfg.max_iters:
+            break
-if ddp:
-    destroy_process_group()
+    if ddp == "torch":
+        destroy_process_group()
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..7c4a999
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,10 @@
+import torch
+
+def setup_torch(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
+    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
\ No newline at end of file

From cef232e2dd237aad60819ca8285a84584a286fe1 Mon Sep 17 00:00:00 2001
From: kai-yamashita
Date: Thu, 1 Aug 2024 14:44:32 +0900
Subject: [PATCH 2/2] refactor: add dataclass

---
 bench.py            |  4 +--
 config/dataclass.py | 60 +++++++++++++++++++++++++++++++++++++++++++++
 model.py            |  4 +--
 sample.py           |  4 +--
 train.py            |  4 +--
 5 files changed, 68 insertions(+), 8 deletions(-)
 create mode 100644 config/dataclass.py

diff --git a/bench.py b/bench.py
index 7e2f995..d3ac432 100644
--- a/bench.py
+++ b/bench.py
@@ -8,12 +8,12 @@
 import torch
 from model import GPTConfig, GPT
 import hydra
-from omegaconf import DictConfig
+from config.dataclass import Config
 
 dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
 
 @hydra.main(config_path="config", config_name="bench")
-def main(cfg: DictConfig) -> None:
+def main(cfg: Config) -> None:
     device_type = 'cuda' if 'cuda' in cfg.device else 'cpu' # for later use in torch.autocast
     ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
     ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
diff --git a/config/dataclass.py b/config/dataclass.py
new file mode 100644
index 0000000..1897126
--- /dev/null
+++ b/config/dataclass.py
@@ -0,0 +1,60 @@
+#dataclass
+from dataclasses import dataclass
+
+@dataclass
+class Config:
+    # General settings
+    out_dir: str = 'out'
+    eval_interval: int = 2000
+    log_interval: int = 1
+    eval_iters: int = 200
+    eval_only: bool = False # if True, script exits right after the first eval
+    always_save_checkpoint: bool = True # if True, always save a checkpoint after each eval
+    init_from: str = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
+
+    # wandb logging
+    wandb_log: bool = False # disabled by default
+    wandb_project: str = 'owt'
+    wandb_run_name: str = 'gpt2' # 'run' + str(time.time())
+
+    # Data settings
+    dataset: str = 'openwebtext'
+    gradient_accumulation_steps: int = 40 # used to simulate larger batch sizes (5 * 8)
+    batch_size: int = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
+    block_size: int = 1024
+
+    # Model settings
+    n_layer: int = 12
+    n_head: int = 12
+    n_embd: int = 768
+    dropout: float = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
+    bias: bool = False # do we use bias inside LayerNorm and Linear layers?
+
+    # AdamW optimizer settings
+    learning_rate: float = 0.0006 # max learning rate (6e-4)
+    max_iters: int = 600000 # total number of training iterations
+    weight_decay: float = 0.1
+    beta1: float = 0.9
+    beta2: float = 0.95
+    grad_clip: float = 1.0 # clip gradients at this value, or disable if == 0.0
+
+    # Learning rate decay settings
+    decay_lr: bool = True # whether to decay the learning rate
+    warmup_iters: int = 2000 # how many steps to warm up for
+    lr_decay_iters: int = 600000 # should be ~= max_iters per Chinchilla
+    min_lr: float = 0.00006 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
+
+    # DDP settings
+    ddp: str = 'torch'
+    backend: str = 'nccl' # 'nccl', 'gloo', etc.
+    start: str = "\n" # or "" or etc. Can also specify a file, use as: "FILE:prompt.txt"
+    num_samples: int = 10 # number of samples to draw
+    max_new_tokens: int = 500 # number of tokens generated in each sample
+    temperature: float = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
+    top_k: int = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
+    # Bench / runtime settings
+    real_data: bool = True
+    seed: int = 1337
+    compile: bool = True # use PyTorch 2.0 to compile the model to be faster
+    profile: bool = False # use pytorch profiler, or just simple bench
+    device: str = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
\ No newline at end of file
diff --git a/model.py b/model.py
index b40d8a9..0eaaaa3 100644
--- a/model.py
+++ b/model.py
@@ -15,7 +15,7 @@
 import torch.nn as nn
 from torch.nn import functional as F
 from typing import Dict, List, Optional
-from omegaconf import DictConfig
+from config.dataclass import Config
 
 class LayerNorm(nn.Module):
     """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
@@ -79,7 +79,7 @@ def forward(self, x: torch.Tensor):
 
 class MLP(nn.Module):
 
-    def __init__(self, config: DictConfig):
+    def __init__(self, config: Config):
         super().__init__()
         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
         self.gelu = nn.GELU()
diff --git a/sample.py b/sample.py
index 8879011..69197c7 100644
--- a/sample.py
+++ b/sample.py
@@ -9,10 +9,10 @@
 from model import GPTConfig, GPT
 from utils import setup_torch
 import hydra
-from omegaconf import DictConfig
+from config.dataclass import Config
 
 @hydra.main(config_path="config", config_name="sample")
-def main(cfg: DictConfig) -> None:
+def main(cfg: Config) -> None:
     # -----------------------------------------------------------------------------
     dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
diff --git a/train.py b/train.py
index ef4f1d7..03bef2a 100644
--- a/train.py
+++ b/train.py
@@ -27,7 +27,7 @@
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.distributed import init_process_group, destroy_process_group
 import hydra
-from omegaconf import DictConfig
+from config.dataclass import Config
 import omegaconf
 import deepspeed
 
@@ -68,7 +68,7 @@ def get_lr(self):
 
 
 @hydra.main(config_path="config", config_name="train", version_base=None)
-def main(cfg: DictConfig) -> None:
+def main(cfg: Config) -> None:
     setup_torch(cfg.seed)
     # various inits, derived attributes, I/O setup
     ddp = cfg.ddp
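
Note on the `cfg: Config` annotations introduced in these patches: Hydra still passes an OmegaConf DictConfig at runtime, so the dataclass acts only as a static type hint unless it is also registered as a structured-config schema. A minimal sketch of that registration follows; the schema name "base_config" and its use in the YAML defaults list are assumptions, not part of these patches.

# sketch only: e.g. near the top of train.py, before the @hydra.main entry point
from hydra.core.config_store import ConfigStore
from config.dataclass import Config

cs = ConfigStore.instance()
cs.store(name="base_config", node=Config)  # config/train.yaml could then list "base_config" in its defaults

With the schema registered, unknown keys and type mismatches in the YAML files fail when the config is composed, instead of surfacing later as attribute errors mid-run.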