# scale_gpt.py: config for scaling GPT following Kaplan et al. (arXiv:2001.08361); forked from karpathy/nanoGPT
wandb_log = True
wandb_project = 'owt-scaling'
wandb_run_id = "" # set only when resuming an existing W&B run
always_save_checkpoint = False
# scale_N and scale_D default to False; set them from the command line when scaling.
scaling = "Kaplan"
scale_N = False
scale_D = False
# override n_layer, n_embd and fraction_of_data from the command line. Default values:
n_layer = 4
n_embd = 128
fraction_of_data = 1.0
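# Example override from the command line via nanoGPT's configurator.py
# (the config path below is assumed; adjust it to where this file lives in your repo):
#   python train.py scale_gpt.py --scale_N=True --n_layer=8 --n_embd=512 --fraction_of_data=0.5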
# n_head can also be set from the command line; its default follows the rule of thumb
# n_head = n_embd / 64 given in Appendix F of 2010.14701.
# This is consistent with nanoGPT, where n_embd = 768 and n_head = 768 // 64 = 12.
n_head = max(2, n_embd // 64)
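# With the default n_embd = 128 this gives n_head = 2 (head dim 64);
# the max(2, ...) floor only changes the result for n_embd < 128.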
#### TRAINING CONFIGURATIONS FROM KAPLAN ET AL
# total batch size = 512 sequences, so local batch_size = 16 with gradient_accumulation_steps = 32
batch_size = 16
block_size = 1024
gradient_accumulation_steps = 4 * 8
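# Tokens per optimizer step = 16 * 32 * 1024 = 524,288 (~0.5M tokens),
# i.e. 512 sequences of 1,024 tokens each.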
# total number of training iterations = 2.5e5
# learning rate warms up for 3000 iterations and decays to 0 at the end of training.
# dropout = 0.1 (see Section 4.2); the minimum learning rate is 0
# maximum learning rate is given by equation D.1 of the paper. It depends on N, so we set it in configurator.py
max_iters = int(2.5e5)
warmup_iters = 3000
lr_decay_iters = int(2.5e5)
dropout = 0.1
min_lr = 0
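# At 2.5e5 iterations this comes to roughly 2.5e5 * 524,288 ≈ 1.3e11 (~131B) training tokens.
# For reference, a rough sketch of the eq. D.1 fit for the maximum learning rate.
# The constants are recalled from the paper and should be treated as approximate;
# this helper is illustrative only and is not consumed by train.py -- the actual
# learning_rate is set in configurator.py as noted above.
import math

def _kaplan_max_lr(num_params):
    # eq. D.1 (approximate): LR(N) ≈ 0.003239 - 1.395e-4 * ln(N), N = non-embedding parameter count
    return 0.003239 - 1.395e-4 * math.log(num_params)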
# eval stuff same as nanoGPT
eval_interval = 1000
eval_iters = 200
log_interval = 10
# weight decay same as nanoGPT
weight_decay = 1e-1