189 changes: 189 additions & 0 deletions config.py
@@ -0,0 +1,189 @@
from dataclasses import dataclass, field
from typing import Any, List, Optional


@dataclass
class ModelConfig:
block_size: Optional[int] = None
vocab_size: Optional[int] = None
n_layer: Optional[int] = None
n_head: Optional[int] = None
n_embd: Optional[int] = None
dropout: Optional[float] = None
bias: Optional[bool] = None


@dataclass
class OptimizerConfig:
name: Optional[str] = None
learning_rate: Optional[float] = None
weight_decay: Optional[float] = None
beta1: Optional[float] = None
beta2: Optional[float] = None


@dataclass
class TrainConfig:
batch_size: Optional[int] = None
local_batch_size: Optional[int] = None
block_size: Optional[int] = None
max_iters: Optional[int] = None
optimizer: OptimizerConfig = field(default_factory=OptimizerConfig)
grad_clip: Optional[float] = None
decay_lr: Optional[bool] = None
warmup_iters: Optional[int] = None
lr_decay_iters: Optional[int] = None
min_lr: Optional[float] = None
eval_interval: Optional[int] = None
log_interval: Optional[int] = None
eval_iters: Optional[int] = None
eval_only: Optional[bool] = None
always_save_checkpoint: Optional[bool] = None
init_from: Optional[str] = None


@dataclass
class DatasetConfig:
name: Optional[str] = None
    # Per-dataset defaults, merged into Config in __post_init__; the base class
    # provides none.
    default_config: Optional["Config"] = None


@dataclass
class OpenWebTextConfig(DatasetConfig):
    name: str = "openwebtext"
    default_config: Optional["Config"] = field(
default_factory=lambda: Config(
model = ModelConfig(
block_size=1024,
vocab_size=50304,
n_layer=12,
n_head=12,
n_embd=768,
dropout=0.0,
bias=False
),
train = TrainConfig(
batch_size=480,
local_batch_size=12,
block_size=1024,
max_iters=600_000,
                optimizer=OptimizerConfig(
                    name="adam",
                    learning_rate=6e-4,
                    weight_decay=0.1,
                    beta1=0.9,
                    beta2=0.95
                ),
grad_clip=1.0,
decay_lr=True,
warmup_iters=2000,
lr_decay_iters=600_000,
min_lr=6e-5,
eval_interval=2000,
log_interval=1,
eval_iters=200,
eval_only=False,
always_save_checkpoint=True,
init_from="scratch",
),
out_dir="out",
wandb_log=False,
wandb_project="owt",
wandb_run_name="gpt2"
)
)


@dataclass
class ShakespeareConfig(DatasetConfig):
name: str = "shakespeare"
    default_config: Optional["Config"] = field(
default_factory=lambda: Config(
model = ModelConfig(
block_size=256,
vocab_size=65,
n_layer=6,
n_head=6,
n_embd=384,
                dropout=0.2,
                bias=False
),
train = TrainConfig(
batch_size=64,
local_batch_size=64,
block_size=256,
max_iters=5000,
                optimizer=OptimizerConfig(
                    name="adam",
                    learning_rate=1e-3,
                    weight_decay=0.1,
                    beta1=0.9,
                    beta2=0.99
                ),
grad_clip=1.0,
decay_lr=True,
warmup_iters=100,
lr_decay_iters=5000,
min_lr=1e-4,
eval_interval=250,
log_interval=10,
eval_iters=200,
eval_only=False,
always_save_checkpoint=False,
init_from="scratch",
),
out_dir="out-shakespeare-char",
wandb_log=False,
wandb_project="shakespeare-char",
wandb_run_name="mini-gpt"
)
)


@dataclass
class Config:
defaults: List[Any] = field(
default_factory=lambda: [{"dataset": "openwebtext"}, "_self_"]
)
model: ModelConfig = field(default_factory=ModelConfig)
train: TrainConfig = field(default_factory=TrainConfig)
dataset: DatasetConfig = field(default_factory=DatasetConfig)
out_dir: str = "out"
wandb_log: bool = False
wandb_project: str = "owt"
wandb_run_name: str = "gpt2"

def __post_init__(self):
        # Eval-only runs fall back to small batch/eval settings when unset.
        if self.train.eval_only:
if self.train.batch_size is None:
self.train.batch_size = 8
if self.train.local_batch_size is None:
self.train.local_batch_size = 8
if self.train.eval_iters is None:
self.train.eval_iters = 500

        # Fill any field still set to None from the selected dataset's defaults.
        if self.dataset.default_config is None:
            return
        for key, value in vars(self.dataset.default_config).items():
if key == "model":
for k_m, v_m in vars(self.dataset.default_config.model).items():
if getattr(self.model, k_m) is None:
setattr(self.model, k_m, v_m)

elif key == "train":
for k_t, v_t in vars(self.dataset.default_config.train).items():
if k_t == "optimizer":
for k_o, v_o in vars(self.dataset.default_config.train.optimizer).items():
if getattr(self.train.optimizer, k_o) is None:
setattr(self.train.optimizer, k_o, v_o)

if getattr(self.train, k_t) is None:
setattr(self.train, k_t, v_t)

else:
if getattr(self, key) is None:
setattr(self, key, value)
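
The defaults list on Config ([{"dataset": "openwebtext"}, "_self_"]) follows Hydra's structured-config convention, so the training entry point presumably registers these dataclasses with Hydra's ConfigStore; that wiring is not part of this diff. Independent of Hydra, the __post_init__ merge can be exercised directly. A minimal sketch, assuming the module is importable as config:

# Sketch only: demonstrates the None-filling merge in Config.__post_init__.
from config import Config, ShakespeareConfig, TrainConfig

cfg = Config(
    dataset=ShakespeareConfig(),
    train=TrainConfig(max_iters=2000),  # explicit value, everything else left as None
)
assert cfg.train.max_iters == 2000      # explicit values win
assert cfg.train.batch_size == 64       # filled from the Shakespeare defaults
assert cfg.model.n_layer == 6           # model fields are merged the same way
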
47 changes: 0 additions & 47 deletions configurator.py

This file was deleted.

47 changes: 20 additions & 27 deletions model.py
@@ -15,9 +15,9 @@
import torch.nn as nn
from torch.nn import functional as F


class LayerNorm(nn.Module):
""" LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

def __init__(self, ndim, bias):
super().__init__()
self.weight = nn.Parameter(torch.ones(ndim))
@@ -26,8 +26,8 @@ def __init__(self, ndim, bias):
def forward(self, input):
return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
assert config.n_embd % config.n_head == 0
@@ -75,8 +75,8 @@ def forward(self, x):
y = self.resid_dropout(self.c_proj(y))
return y

class MLP(nn.Module):
def __init__(self, config):
super().__init__()
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
@@ -92,7 +92,6 @@ def forward(self, x):
return x

class Block(nn.Module):

def __init__(self, config):
super().__init__()
self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
@@ -105,18 +104,8 @@ def forward(self, x):
x = x + self.mlp(self.ln_2(x))
return x

@dataclass
class GPTConfig:
block_size: int = 1024
vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: int = 12
n_head: int = 12
n_embd: int = 768
dropout: float = 0.0
bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster

class GPT(nn.Module):

def __init__(self, config):
super().__init__()
assert config.vocab_size is not None
@@ -213,22 +202,21 @@ def from_pretrained(cls, model_type, override_args=None):
print("loading weights from pretrained gpt: %s" % model_type)

# n_layer, n_head and n_embd are determined from model_type
config_args = {
'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
config = {
'gpt2': ModelConfig(n_layer=12, n_head=12, n_embd=768), # 124M params
'gpt2-medium': ModelConfig(n_layer=24, n_head=16, n_embd=1024), # 350M params
'gpt2-large': ModelConfig(n_layer=36, n_head=20, n_embd=1280), # 774M params
'gpt2-xl': ModelConfig(n_layer=48, n_head=25, n_embd=1600), # 1558M params
}[model_type]
print("forcing vocab_size=50257, block_size=1024, bias=True")
config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
config_args['bias'] = True # always True for GPT model checkpoints
config.vocab_size = 50257 # always 50257 for GPT model checkpoints
config.block_size = 1024 # always 1024 for GPT model checkpoints
config.bias = True # always True for GPT model checkpoints
# we can override the dropout rate, if desired
if 'dropout' in override_args:
print(f"overriding dropout rate to {override_args['dropout']}")
config_args['dropout'] = override_args['dropout']
config.dropout = override_args['dropout']
# create a from-scratch initialized minGPT model
config = GPTConfig(**config_args)
model = GPT(config)
sd = model.state_dict()
sd_keys = sd.keys()
@@ -260,7 +248,7 @@ def from_pretrained(cls, model_type, override_args=None):

return model

def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
def configure_optimizers(self, optim_cfg, device_type):
# start with all of the candidate parameters
param_dict = {pn: p for pn, p in self.named_parameters()}
# filter out those that do not require grad
@@ -270,7 +258,7 @@ def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
optim_groups = [
{'params': decay_params, 'weight_decay': weight_decay},
{'params': decay_params, 'weight_decay': optim_cfg.weight_decay},
{'params': nodecay_params, 'weight_decay': 0.0}
]
num_decay_params = sum(p.numel() for p in decay_params)
@@ -281,7 +269,12 @@ def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
use_fused = fused_available and device_type == 'cuda'
extra_args = dict(fused=True) if use_fused else dict()
optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
optimizer = torch.optim.AdamW(
optim_groups,
lr=optim_cfg.learning_rate,
betas=(optim_cfg.beta1, optim_cfg.beta2),
**extra_args
)
print(f"using fused AdamW: {use_fused}")

return optimizer
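
For context on how the refactored model API would be driven, a minimal sketch; the call sites below are assumptions (the training script is not shown in this diff), but the GPT, ModelConfig, configure_optimizers and from_pretrained signatures match the changes above:

# Sketch only: hypothetical caller of the refactored interfaces.
import torch

from config import Config, ShakespeareConfig
from model import GPT

cfg = Config(dataset=ShakespeareConfig())   # None fields filled in __post_init__
device_type = "cuda" if torch.cuda.is_available() else "cpu"

model = GPT(cfg.model)                      # GPT consumes a ModelConfig directly
optimizer = model.configure_optimizers(cfg.train.optimizer, device_type)

# from_pretrained now builds a ModelConfig internally; dropout stays overridable.
gpt2 = GPT.from_pretrained("gpt2", override_args={"dropout": 0.0})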