178 changes: 83 additions & 95 deletions bench.py
@@ -7,111 +7,99 @@
 import time
 import torch
 from model import GPTConfig, GPT
+import hydra
+from config.dataclass import Config

-# -----------------------------------------------------------------------------
-batch_size = 12
-block_size = 1024
-bias = False
-real_data = True
-seed = 1337
-device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
-dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
-compile = True # use PyTorch 2.0 to compile the model to be faster
-profile = False # use pytorch profiler, or just simple benchmarking?
-exec(open('configurator.py').read()) # overrides from command line or config file
-# -----------------------------------------------------------------------------
-
-torch.manual_seed(seed)
-torch.cuda.manual_seed(seed)
-torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
-torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
-device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
-ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
-ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+@hydra.main(config_path="config", config_name="bench")
+def main(cfg: Config) -> None:
+    device_type = 'cuda' if 'cuda' in cfg.device else 'cpu' # for later use in torch.autocast
+    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+    ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

-# data loading init
-if real_data:
-    dataset = 'openwebtext'
-    data_dir = os.path.join('data', dataset)
-    train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
-    def get_batch(split):
-        data = train_data # note ignore split in benchmarking script
-        ix = torch.randint(len(data) - block_size, (batch_size,))
-        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
-        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
-        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
-        return x, y
-else:
-    # alternatively, if fixed data is desired to not care about data loading
-    x = torch.randint(50304, (batch_size, block_size), device=device)
-    y = torch.randint(50304, (batch_size, block_size), device=device)
-    get_batch = lambda split: (x, y)
+    # data loading init
+    if cfg.real_data:
+        dataset = 'openwebtext'
+        data_dir = os.path.join('data', dataset)
+        train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+        def get_batch(split):
+            data = train_data # note ignore split in benchmarking script
+            ix = torch.randint(len(data) - cfg.block_size, (cfg.batch_size,))
+            x = torch.stack([torch.from_numpy((data[i:i+cfg.block_size]).astype(np.int64)) for i in ix])
+            y = torch.stack([torch.from_numpy((data[i+1:i+1+cfg.block_size]).astype(np.int64)) for i in ix])
+            x, y = x.pin_memory().to(cfg.device, non_blocking=True), y.pin_memory().to(cfg.device, non_blocking=True)
+            return x, y
+    else:
+        # alternatively, if fixed data is desired to not care about data loading
+        x = torch.randint(50304, (cfg.batch_size, cfg.block_size), device=cfg.device)
+        y = torch.randint(50304, (cfg.batch_size, cfg.block_size), device=cfg.device)
+        get_batch = lambda split: (x, y)

-# model init
-gptconf = GPTConfig(
-    block_size = block_size, # how far back does the model look? i.e. context size
-    n_layer = 12, n_head = 12, n_embd = 768, # size of the model
-    dropout = 0, # for determinism
-    bias = bias,
-)
-model = GPT(gptconf)
-model.to(device)
+    # model init
+    gptconf = GPTConfig(
+        block_size = cfg.block_size, # how far back does the model look? i.e. context size
+        n_layer = 12, n_head = 12, n_embd = 768, # size of the model
+        dropout = 0, # for determinism
+        bias = cfg.bias,
+    )
+    model = GPT(gptconf)
+    model.to(cfg.device)

-optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
+    optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)

-if compile:
-    print("Compiling model...")
-    model = torch.compile(model) # pytorch 2.0
+    if compile:
+        print("Compiling model...")
+        model = torch.compile(model) # pytorch 2.0

-if profile:
-    # useful docs on pytorch profiler:
-    # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
-    # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile
-    wait, warmup, active = 5, 5, 5
-    num_steps = wait + warmup + active
-    with torch.profiler.profile(
-        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
-        schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
-        record_shapes=False,
-        profile_memory=False,
-        with_stack=False, # incurs an additional overhead, disable if not needed
-        with_flops=True,
-        with_modules=False, # only for torchscript models atm
-    ) as prof:
-
-        X, Y = get_batch('train')
-        for k in range(num_steps):
-            with ctx:
-                logits, loss = model(X, Y)
-            X, Y = get_batch('train')
-            optimizer.zero_grad(set_to_none=True)
-            loss.backward()
-            optimizer.step()
-            lossf = loss.item()
-            print(f"{k}/{num_steps} loss: {lossf:.4f}")
-
-            prof.step() # notify the profiler at end of each step
-
-else:
-
-    # simple benchmarking
-    torch.cuda.synchronize()
-    for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark
-        t0 = time.time()
-        X, Y = get_batch('train')
-        for k in range(num_steps):
-            with ctx:
-                logits, loss = model(X, Y)
-            X, Y = get_batch('train')
-            optimizer.zero_grad(set_to_none=True)
-            loss.backward()
-            optimizer.step()
-            lossf = loss.item()
-            print(f"{k}/{num_steps} loss: {lossf:.4f}")
-        torch.cuda.synchronize()
-        t1 = time.time()
-        dt = t1-t0
-        mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
-        if stage == 1:
-            print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
+    if cfg.profile:
+        # useful docs on pytorch profiler:
+        # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
+        # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile
+        wait, warmup, active = 5, 5, 5
+        num_steps = wait + warmup + active
+        with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
+            schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
+            on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
+            record_shapes=False,
+            profile_memory=False,
+            with_stack=False, # incurs an additional overhead, disable if not needed
+            with_flops=True,
+            with_modules=False, # only for torchscript models atm
+        ) as prof:
+
+            X, Y = get_batch('train')
+            for k in range(num_steps):
+                with ctx:
+                    logits, loss = model(X, Y)
+                X, Y = get_batch('train')
+                optimizer.zero_grad(set_to_none=True)
+                loss.backward()
+                optimizer.step()
+                lossf = loss.item()
+                print(f"{k}/{num_steps} loss: {lossf:.4f}")
+
+                prof.step() # notify the profiler at end of each step
+
+    else:
+
+        # simple benchmarking
+        torch.cuda.synchronize()
+        for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark
+            t0 = time.time()
+            X, Y = get_batch('train')
+            for k in range(num_steps):
+                with ctx:
+                    logits, loss = model(X, Y)
+                X, Y = get_batch('train')
+                optimizer.zero_grad(set_to_none=True)
+                loss.backward()
+                optimizer.step()
+                lossf = loss.item()
+                print(f"{k}/{num_steps} loss: {lossf:.4f}")
+            torch.cuda.synchronize()
+            t1 = time.time()
+            dt = t1-t0
+            mfu = model.estimate_mfu(cfg.batch_size * 1 * num_steps, dt)
+            if stage == 1:
+                print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
8 changes: 8 additions & 0 deletions config/bench.yaml
@@ -0,0 +1,8 @@
batch_size: 12
block_size: 1024
bias: false
real_data: true
seed: 1337
device: 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
compile: true # use PyTorch 2.0 to compile the model to be faster
profile: false # use pytorch profiler, or just simple bench
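
Since bench.py now starts through @hydra.main with this file as its primary config, note that, depending on the Hydra version and its job.chdir setting, the process may run inside a per-run output directory, in which case the relative data/openwebtext path used by get_batch no longer resolves. A sketch of one way to guard against that with Hydra's to_absolute_path helper (the helper is standard Hydra API; using it here is an assumption, not something this PR does):

from hydra.utils import to_absolute_path

# resolve the dataset relative to the original working directory, not Hydra's run directory
data_dir = to_absolute_path(os.path.join('data', 'openwebtext'))
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')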
60 changes: 60 additions & 0 deletions config/dataclass.py
@@ -0,0 +1,60 @@
#dataclass
from dataclasses import dataclass

@dataclass
class Config:
    # General settings
    out_dir: str = 'out'
    eval_interval: int = 2000
    log_interval: int = 1
    eval_iters: int = 200
    eval_only: bool = False # if True, script exits right after the first eval
    always_save_checkpoint: bool = True # if True, always save a checkpoint after each eval
    init_from: str = 'scratch' # 'scratch' or 'resume' or 'gpt2*'

    # wandb logging
    wandb_log: bool = False # disabled by default
    wandb_project: str = 'owt'
    wandb_run_name: str = 'gpt2' # 'run' + str(time.time())

    # Data settings
    dataset: str = 'openwebtext'
    gradient_accumulation_steps: int = 40 # used to simulate larger batch sizes (5 * 8)
    batch_size: int = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
    block_size: int = 1024

    # Model settings
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
    bias: bool = False # do we use bias inside LayerNorm and Linear layers?

    # AdamW optimizer settings
    learning_rate: float = 0.0006 # max learning rate (6e-4)
    max_iters: int = 600000 # total number of training iterations
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0 # clip gradients at this value, or disable if == 0.0

    # Learning rate decay settings
    decay_lr: bool = True # whether to decay the learning rate
    warmup_iters: int = 2000 # how many steps to warm up for
    lr_decay_iters: int = 600000 # should be ~= max_iters per Chinchilla
    min_lr: float = 0.00006 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

    # DDP settings
    ddp: str = 'torch'
    backend: str = 'nccl' # 'nccl', 'gloo', etc.

    # Sampling and benchmarking settings
    start: str = "\n" # or "" or etc. Can also specify a file, use as: "FILE:prompt.txt"
    num_samples: int = 10 # number of samples to draw
    max_new_tokens: int = 500 # number of tokens generated in each sample
    temperature: float = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k: int = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
    bias: bool = False
    real_data: bool = True
    seed: int = 1337
    compile: bool = True # use PyTorch 2.0 to compile the model to be faster
    profile: bool = False # use pytorch profiler, or just simple bench
    device: str = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
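
One caveat: the cfg: Config annotation on main() is only a type hint, so by default Hydra passes a DictConfig composed from config/bench.yaml, and the defaults and types declared here are not enforced. If validation against this dataclass is wanted, registering it with Hydra's ConfigStore is the usual route; a sketch under that assumption (the store name and the matching defaults entry are hypothetical, not part of this PR):

from hydra.core.config_store import ConfigStore
from config.dataclass import Config

# register the dataclass as a structured-config schema; a YAML such as config/bench.yaml
# could then opt in via a defaults entry (e.g. `- base_config`) to get type checking
cs = ConfigStore.instance()
cs.store(name="base_config", node=Config)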
8 changes: 0 additions & 8 deletions config/eval_gpt2.py

This file was deleted.

8 changes: 0 additions & 8 deletions config/eval_gpt2_large.py

This file was deleted.

8 changes: 0 additions & 8 deletions config/eval_gpt2_medium.py

This file was deleted.

8 changes: 0 additions & 8 deletions config/eval_gpt2_xl.py

This file was deleted.

25 changes: 0 additions & 25 deletions config/finetune_shakespeare.py

This file was deleted.

9 changes: 9 additions & 0 deletions config/sample.yaml
@@ -0,0 +1,9 @@
init_from: 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
out_dir: 'out' # ignored if init_from is not 'resume'
start: "\n" # or "" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples: 10 # number of samples to draw
max_new_tokens: 500 # number of tokens generated in each sample
temperature: 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k: 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed: 1337
device: 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
39 changes: 39 additions & 0 deletions config/train.yaml
@@ -0,0 +1,39 @@
out_dir: 'out'
eval_interval: 2000
log_interval: 1
eval_iters: 200
eval_only: false # if True, script exits right after the first eval
always_save_checkpoint: true # if True, always save a checkpoint after each eval
init_from: 'scratch' # 'scratch' or 'resume' or 'gpt2*'
# wandb logging
wandb_log: false # disabled by default
wandb_project: 'owt'
wandb_run_name: 'gpt2' # 'run' + str(time.time())
# data
dataset: 'openwebtext'
gradient_accumulation_steps: 40 # used to simulate larger batch sizes (5 * 8)
batch_size: 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
block_size: 1024
# model
n_layer: 12
n_head: 12
n_embd: 768
dropout: 0.0 # for pretraining 0 is good, for finetuning try 0.1+
bias: false # do we use bias inside LayerNorm and Linear layers?
# adamw optimizer
learning_rate: 0.0006 # max learning rate (6e-4)
max_iters: 600000 # total number of training iterations
weight_decay: 0.1
beta1: 0.9
beta2: 0.95
grad_clip: 1.0 # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr: true # whether to decay the learning rate
warmup_iters: 2000 # how many steps to warm up for
lr_decay_iters: 600000 # should be ~= max_iters per Chinchilla
min_lr: 0.00006 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
# DDP settings
ddp: 'torch'
backend: 'nccl' # 'nccl', 'gloo', etc.
# system
device: 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
10 changes: 10 additions & 0 deletions config/train_config/eval_gpt2.yaml
@@ -0,0 +1,10 @@
# evaluate the base gpt2
# n_layer=12, n_head=12, n_embd=768
# 124M parameters
defaults:
- ../train
batch_size: 8
eval_iters: 500 # use more iterations to get good estimate
eval_only: True
wandb_log: False
init_from: 'gpt2'
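
The defaults entry `- ../train` appears intended to compose the base train.yaml from the parent config directory and then layer the overrides in this file on top. If the installed Hydra version accepts that relative reference, the composition can be checked without launching a run via the compose API; a sketch assuming it is executed from the repository root (the config_path and config_name spellings are assumptions based on the layout above):

from hydra import compose, initialize

with initialize(version_base=None, config_path="config"):
    cfg = compose(config_name="train_config/eval_gpt2")
    # expect the overrides from this file layered on top of train.yaml,
    # e.g. cfg.batch_size == 8, cfg.eval_only is True, cfg.init_from == 'gpt2'
    print(cfg.batch_size, cfg.eval_only, cfg.init_from)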