Skip to content

Commit

Permalink
added CPT code
Browse files Browse the repository at this point in the history
  • Loading branch information
bentherien committed Apr 14, 2024
1 parent 5cdff76 commit 6ff3ae6
Show file tree
Hide file tree
Showing 102 changed files with 2,244 additions and 12 deletions.
25 changes: 25 additions & 0 deletions configs/datasets/train/pile+slim_pajama_300B_each.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
# Samples Pile and SlimPajama with equal likelihood: Pile has weight 50.0 and the seven SlimPajama subsets sum to ~50.
"train-data-paths": [
"data/pile/train/pile_train",
'data/slim_pajama/train_300B/ArXiv/ArXiv',
'data/slim_pajama/train_300B/Book/Book',
'data/slim_pajama/train_300B/C4/C4',
'data/slim_pajama/train_300B/Wikipedia/Wikipedia',
'data/slim_pajama/train_300B/Github/Github',
'data/slim_pajama/train_300B/StackExchange/StackExchange',
'data/slim_pajama/train_300B/CommonCrawl/CommonCrawl',],
"train-data-weights": [
50.0,
2.2140923205,
2.101565663,
13.344249736,
1.9986465625,
2.612070528,
1.6855393625,
26.0438358255
],
"train-dataset-name": 'pile+slim_pajama_300B_each',
"train-iters": 264732,
"lr-decay-iters": 264732,
}
33 changes: 33 additions & 0 deletions configs/datasets/train/pile_shard0.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"train-data-paths": [
"data/pile/shard_0/shard_0_text_document",
],
"train-data-weights": [
1.,
],
"train-dataset-name": 'pile_shard0',
"train-iters": 1000,
"lr-decay-iters": 1000,
"is_replay_enabled": true,
"replay_config": {
"enabled": true,
# The idx filenames from the original pretraining on the tasks must be specified, since they contain the number of
# iterations and the seen indices (assuming the same non-replay seed is used as during pretraining).
"replay_idx_paths_prefixes": [
"data/pile/shard_0/shard_0_text_document_train_0_indexmap_32160ns_2048sl_1234s",
],
"replay_data_weights":[
1.00,
],
"replay_idx_offsets": [
1,
],
# Fraction of samples coming from the replay buffer, between 0 and 1.
"replay_fraction": 0.5,
# "replay_seed" and "replay_reshuffle_idx" go hand in hand. "replay_reshuffle_idx" controls whether the replay data is
# seen in the same order as it was originally seen (set it to false) or reshuffled; if you decide to reshuffle,
# "replay_seed" is the seed used to reshuffle the seen data.
"replay_seed": 1234,
"replay_reshuffle_idx": false,
},
}
11 changes: 11 additions & 0 deletions configs/datasets/train/pile_train.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"train-data-paths": [
"data/pile/train/pile_train",
],
"train-data-weights": [
1.,
],
"train-dataset-name": 'pile_train',
"train-iters": 132366,
"lr-decay-iters": 132366,
}
32 changes: 32 additions & 0 deletions configs/datasets/train/rp.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
# Weighted datasets: "train-data-weights" lists one weight per path, in the same order.
"train-data-paths": [
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/arxiv/folder_train/tokenized_text_document",
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/book/folder_train/tokenized_text_document",
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/c4/folder_train/tokenized_text_document",
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/wikipedia/folder_train/tokenized_text_document",
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/github/folder_train/tokenized_text_document",
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/stackexchange/folder_train/tokenized_text_document",
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2019-30/folder_train/tokenized_text_document",
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2020-05/folder_train/tokenized_text_document",
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2021-04/folder_train/tokenized_text_document",
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2022-05/folder_train/tokenized_text_document",
"/gpfs/alpine/csc499/proj-shared/incite_datasets/SlimPajama/tokenized300B/train_splits/common_crawl/2023-06/folder_train/tokenized_text_document",
],
"train-data-weights": [
2.5,
4.5,
15.0,
4.5,
4.5,
2.0,
13.4,
13.4,
13.4,
13.4,
13.4
],
"train-dataset-name": 'rp',


}
24 changes: 24 additions & 0 deletions configs/datasets/train/slim_pajama_100B_1.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
# Weighted datasets: "train-data-weights" lists one weight per path, in the same order.
"train-data-paths": [
'data/slim_pajama/tokenized_train_0-100B/ArXiv/ArXiv',
'data/slim_pajama/tokenized_train_0-100B/Book/Book',
'data/slim_pajama/tokenized_train_0-100B/C4/C4',
'data/slim_pajama/tokenized_train_0-100B/Wikipedia/Wikipedia',
'data/slim_pajama/tokenized_train_0-100B/Github/Github',
'data/slim_pajama/tokenized_train_0-100B/StackExchange/StackExchange',
'data/slim_pajama/tokenized_train_0-100B/CommonCrawl/CommonCrawl',
],
"train-data-weights": [
3.4703977435152775,
3.904381603212791,
25.641950653802013,
3.804228253591696,
4.9994643949282045,
3.1815838172641993,
49.99799353368582,
],
"train-iters": 44229,
"lr-decay-iters": 44229,
"train-dataset-name": 'slim_pajama_100B_1',
}
28 changes: 28 additions & 0 deletions configs/datasets/train/slim_pajama_100B_1_replay5.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
# Weighted datasets: "train-data-weights" lists one weight per path, in the same order.
"train-data-paths": [
'data/slim_pajama/tokenized_train_0-100B/ArXiv/ArXiv',
'data/slim_pajama/tokenized_train_0-100B/Book/Book',
'data/slim_pajama/tokenized_train_0-100B/C4/C4',
'data/slim_pajama/tokenized_train_0-100B/Wikipedia/Wikipedia',
'data/slim_pajama/tokenized_train_0-100B/Github/Github',
'data/slim_pajama/tokenized_train_0-100B/StackExchange/StackExchange',
'data/slim_pajama/tokenized_train_0-100B/CommonCrawl/CommonCrawl',

'data/pile_replay_shards/replay_10B_1/splits',
],
"train-data-weights": [
3.4703977435152775,
3.904381603212791,
25.641950653802013,
3.804228253591696,
4.9994643949282045,
3.1815838172641993,
49.99799353368582,

5.0
],
"train-iters": 44229,
"lr-decay-iters": 44229,
"train-dataset-name": 'slim_pajama_100B_1_replay5',
}
24 changes: 24 additions & 0 deletions configs/datasets/train/slim_pajama_100B_2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
# Weighted datasets: "train-data-weights" lists one weight per path, in the same order.
"train-data-paths": [
'data/slim_pajama/tokenized_train_100B-200B/ArXiv/ArXiv',
'data/slim_pajama/tokenized_train_100B-200B/Book/Book',
'data/slim_pajama/tokenized_train_100B-200B/C4/C4',
'data/slim_pajama/tokenized_train_100B-200B/Wikipedia/Wikipedia',
'data/slim_pajama/tokenized_train_100B-200B/Github/Github',
'data/slim_pajama/tokenized_train_100B-200B/StackExchange/StackExchange',
'data/slim_pajama/tokenized_train_100B-200B/CommonCrawl/CommonCrawl',
],
"train-data-weights": [
4.03666599074094,
3.927523855378127,
25.467175464208918,
3.7984379710376293,
4.990226864678155,
3.1957646326079723,
49.58420522134826,
],
"train-iters": 44229,
"lr-decay-iters": 44229,
"train-dataset-name": 'slim_pajama_100B_2',
}
45 changes: 45 additions & 0 deletions configs/datasets/train/slim_pajama_100B_2_replay5.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
# Weighted datasets: "train-data-weights" lists one weight per path, in the same order.
"train-data-paths": [
'data/slim_pajama/tokenized_train_100B-200B/ArXiv/ArXiv',
'data/slim_pajama/tokenized_train_100B-200B/Book/Book',
'data/slim_pajama/tokenized_train_100B-200B/C4/C4',
'data/slim_pajama/tokenized_train_100B-200B/Wikipedia/Wikipedia',
'data/slim_pajama/tokenized_train_100B-200B/Github/Github',
'data/slim_pajama/tokenized_train_100B-200B/StackExchange/StackExchange',
'data/slim_pajama/tokenized_train_100B-200B/CommonCrawl/CommonCrawl',

'data/pile_replay_shards/replay_10B_2/splits',

'data/sp_replay_shards/100B_1_shard1/ArXiv/ArXiv',
'data/sp_replay_shards/100B_1_shard1/Book/Book',
'data/sp_replay_shards/100B_1_shard1/C4/C4',
'data/sp_replay_shards/100B_1_shard1/Wikipedia/Wikipedia',
'data/sp_replay_shards/100B_1_shard1/Github/Github',
'data/sp_replay_shards/100B_1_shard1/StackExchange/StackExchange',
'data/sp_replay_shards/100B_1_shard1/CommonCrawl/CommonCrawl',
],
"train-data-weights": [
4.03666599074094,
3.927523855378127,
25.467175464208918,
3.7984379710376293,
4.990226864678155,
3.1957646326079723,
49.58420522134826,

3.8125,

# SlimPajama 100B_1_shard1 replay weights (total: 1.1875):
0.04337997179394097,
0.04880477004015989,
0.3205243831725252,
0.0475528531698962,
0.06249330493660256,
0.03976979771580249,
0.6249749191710727,
],
"train-iters": 44229,
"lr-decay-iters": 44229,
"train-dataset-name": 'slim_pajama_100B_2_replay5',
}
24 changes: 24 additions & 0 deletions configs/datasets/train/slim_pajama_100B_3.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
# Weighted datasets: "train-data-weights" lists one weight per path, in the same order.
"train-data-paths": [
'data/slim_pajama/tokenized_train_200B-300B/ArXiv/ArXiv',
'data/slim_pajama/tokenized_train_200B-300B/Book/Book',
'data/slim_pajama/tokenized_train_200B-300B/C4/C4',
'data/slim_pajama/tokenized_train_200B-300B/Wikipedia/Wikipedia',
'data/slim_pajama/tokenized_train_200B-300B/Github/Github',
'data/slim_pajama/tokenized_train_200B-300B/StackExchange/StackExchange',
'data/slim_pajama/tokenized_train_200B-300B/CommonCrawl/CommonCrawl',
],
"train-data-weights": [
3.491756366873565,
4.084283062119696,
25.524317038754475,
3.8109321899190314,
4.89534056131328,
3.254459546224121,
49.93891123479581,
],
"train-iters": 44229,
"lr-decay-iters": 44229,
"train-dataset-name": 'slim_pajama_100B_3',
}
61 changes: 61 additions & 0 deletions configs/datasets/train/slim_pajama_100B_3_replay5.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
# Weighted datasets: "train-data-weights" lists one weight per path, in the same order.
"train-data-paths": [
'data/slim_pajama/tokenized_train_200B-300B/ArXiv/ArXiv',
'data/slim_pajama/tokenized_train_200B-300B/Book/Book',
'data/slim_pajama/tokenized_train_200B-300B/C4/C4',
'data/slim_pajama/tokenized_train_200B-300B/Wikipedia/Wikipedia',
'data/slim_pajama/tokenized_train_200B-300B/Github/Github',
'data/slim_pajama/tokenized_train_200B-300B/StackExchange/StackExchange',
'data/slim_pajama/tokenized_train_200B-300B/CommonCrawl/CommonCrawl',

'data/pile_replay_shards/replay_10B_3/splits',

'data/sp_replay_shards/100B_1_shard2/ArXiv/ArXiv',
'data/sp_replay_shards/100B_1_shard2/Book/Book',
'data/sp_replay_shards/100B_1_shard2/C4/C4',
'data/sp_replay_shards/100B_1_shard2/Wikipedia/Wikipedia',
'data/sp_replay_shards/100B_1_shard2/Github/Github',
'data/sp_replay_shards/100B_1_shard2/StackExchange/StackExchange',
'data/sp_replay_shards/100B_1_shard2/CommonCrawl/CommonCrawl',

'data/sp_replay_shards/100B_2_shard1/ArXiv/ArXiv',
'data/sp_replay_shards/100B_2_shard1/Book/Book',
'data/sp_replay_shards/100B_2_shard1/C4/C4',
'data/sp_replay_shards/100B_2_shard1/Wikipedia/Wikipedia',
'data/sp_replay_shards/100B_2_shard1/Github/Github',
'data/sp_replay_shards/100B_2_shard1/StackExchange/StackExchange',
'data/sp_replay_shards/100B_2_shard1/CommonCrawl/CommonCrawl',
],
"train-data-weights": [3.491756366873565,
4.084283062119696,
25.524317038754475,
3.8109321899190314,
4.89534056131328,
3.254459546224121,
49.93891123479581,

3.088125,

# SlimPajama 100B_1_shard2 replay weights (total: 0.961875):
0.03513777715309219,
0.03953186373252951,
0.2596247503697454,
0.03851781106761592,
0.05061957699864807,
0.03221353614980002,
0.506229684528569,

# SlimPajama 100B_2_shard1 replay weights (total: 0.95):
0.0403666599074094,
0.03927523855378127,
0.25467175464208913,
0.03798437971037629,
0.049902268646781545,
0.03195764632607972,
0.4958420522134826,
],
"train-iters": 44229,
"lr-decay-iters": 44229,
"train-dataset-name": 'slim_pajama_100B_3_replay5',
}
23 changes: 23 additions & 0 deletions configs/datasets/train/slim_pajama_150B.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
# Weighted datasets: "train-data-weights" lists one weight per path, in the same order.
"train-data-paths": [
'data/slim_pajama/train_150B/ArXiv/ArXiv',
'data/slim_pajama/train_150B/Book/Book',
'data/slim_pajama/train_150B/C4/C4',
'data/slim_pajama/train_150B/Wikipedia/Wikipedia',
'data/slim_pajama/train_150B/Github/Github',
'data/slim_pajama/train_150B/StackExchange/StackExchange',
'data/slim_pajama/train_150B/CommonCrawl/CommonCrawl',],
"train-data-weights": [
4.576447650075095,
4.198505982426652,
26.62982374026485,
3.9945183507095225,
5.218824282422116,
3.372167199706489,
52.00971279439528
],
"train-dataset-name": 'slim_pajama_150B',
"train-iters": 66342,
"lr-decay-iters": 66342,
}
24 changes: 24 additions & 0 deletions configs/datasets/train/slim_pajama_200B_1.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
# Weighted datasets: "train-data-weights" lists one weight per path, in the same order.
"train-data-paths": [
'data/slim_pajama/tokenized_train_0-200B/ArXiv/ArXiv',
'data/slim_pajama/tokenized_train_0-200B/Book/Book',
'data/slim_pajama/tokenized_train_0-200B/C4/C4',
'data/slim_pajama/tokenized_train_0-200B/Wikipedia/Wikipedia',
'data/slim_pajama/tokenized_train_0-200B/Github/Github',
'data/slim_pajama/tokenized_train_0-200B/StackExchange/StackExchange',
'data/slim_pajama/tokenized_train_0-200B/CommonCrawl/CommonCrawl',
],
"train-data-weights": [
3.4703977435152775,
3.904381603212791,
25.641950653802013,
3.804228253591696,
4.9994643949282045,
3.1815838172641993,
49.99799353368582,
],
"train-iters": 88457,
"lr-decay-iters": 88457,
"train-dataset-name": 'slim_pajama_200B_1',
}
Loading

0 comments on commit 6ff3ae6

Please sign in to comment.