
Commit 0140917

Merge pull request #71 from imcaspar/dev
v1.0 release
2 parents 107a37c + fc8d2b1 commit 0140917

13 files changed: +8080 −75 lines

.gitignore (+7 −1)

@@ -129,4 +129,10 @@ dmypy.json
 .pyre/

 # vscode
-.vscode/
+.vscode/
+
+# dataset
+dataset/raw/
+
+# models
+models/

README.md (+4 −4)

@@ -14,17 +14,17 @@
 - [x] Ported bert tokenizer, multilingual corpus compatible
 - [x] 1.5B GPT2 pretrained Chinese model ( ~15G corpus, 10w steps )
 - [x] Batteries-included Colab demo [#](https://github.com/imcaspar/gpt2-ml#google-colab)
-- [ ] 1.5B GPT2 pretrained Chinese model ( ~50G corpus, 100w steps )
+- [x] 1.5B GPT2 pretrained Chinese model ( ~30G corpus, 22w steps )


 ## Pretrained Model
-1.5B GPT2 pretrained Chinese model [**[Google Drive]**](https://drive.google.com/file/d/1IzWpQ6I2IgfV7CldZvFJnZ9byNDZdO4n)
+1.5B GPT2 pretrained Chinese model ( ~30G corpus, 22w steps ) [**[Google Drive]**](https://drive.google.com/file/d/1bhZhjNSgV1CGI-PR2NDzNtda5FAwN29B) SHA256: e698cc97a7f5f706f84f58bb469d614e51d3c0ce5f9ab9bf77e01e3fcb41d482

-SHA256: 4a6e5124df8db7ac2bdd902e6191b807a6983a7f5d09fb10ce011f9a073b183e
+1.5B GPT2 pretrained Chinese model ( ~15G corpus, 10w steps ) [**[Google Drive]**](https://drive.google.com/file/d/1IzWpQ6I2IgfV7CldZvFJnZ9byNDZdO4n) SHA256: 4a6e5124df8db7ac2bdd902e6191b807a6983a7f5d09fb10ce011f9a073b183e

 Corpus from [THUCNews](http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews) and [nlp_chinese_corpus](https://github.com/brightmart/nlp_chinese_corpus)

-Using [Cloud TPU Pod v3-256](https://cloud.google.com/tpu/docs/types-zones#types) to train 10w steps
+Using [Cloud TPU Pod v3-256](https://cloud.google.com/tpu/docs/types-zones#types) to train 22w steps

 ![loss](./.github/loss.png)

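Both checkpoints in the README now ship with a published SHA256. Below is a minimal verification sketch, not part of the commit: hashlib is from the Python standard library, and the local filename is an assumption to adjust to wherever the Google Drive download was saved.

```python
import hashlib

# Hypothetical local path of the downloaded model file (not a name from the repo).
path = "gpt2_ml_model_220000"
expected = "e698cc97a7f5f706f84f58bb469d614e51d3c0ce5f9ab9bf77e01e3fcb41d482"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

print("OK" if sha.hexdigest() == expected else "Checksum mismatch!")
```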
README_CN.md (+4 −4)

@@ -14,17 +14,17 @@
 - [x] Ported bert tokenizer, multilingual support added
 - [x] 1.5B-parameter GPT2 pretrained Chinese model ( 15G corpus, 10w steps )
 - [x] Batteries-included generation demo [#](https://github.com/imcaspar/gpt2-ml#google-colab)
-- [ ] 1.5B-parameter GPT2 pretrained Chinese model ( 50G corpus, 100w steps, **expected in early December** )
+- [x] 1.5B-parameter GPT2 pretrained Chinese model ( 30G corpus, 22w steps )


 ## Pretrained Model
-1.5B-parameter pretrained Chinese model [**[Google Drive download]**](https://drive.google.com/file/d/1IzWpQ6I2IgfV7CldZvFJnZ9byNDZdO4n)
+1.5B-parameter pretrained Chinese model ( 30G corpus, 22w steps ) [**[Google Drive download]**](https://drive.google.com/file/d/1bhZhjNSgV1CGI-PR2NDzNtda5FAwN29B) SHA256: e698cc97a7f5f706f84f58bb469d614e51d3c0ce5f9ab9bf77e01e3fcb41d482

-SHA256: 4a6e5124df8db7ac2bdd902e6191b807a6983a7f5d09fb10ce011f9a073b183e
+1.5B-parameter pretrained Chinese model ( 15G corpus, 10w steps ) [**[Google Drive download]**](https://drive.google.com/file/d/1IzWpQ6I2IgfV7CldZvFJnZ9byNDZdO4n) SHA256: 4a6e5124df8db7ac2bdd902e6191b807a6983a7f5d09fb10ce011f9a073b183e

 Training corpus from [THUCNews](http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews) and [nlp_chinese_corpus](https://github.com/brightmart/nlp_chinese_corpus), roughly 15G of text after cleaning

-Trained 10w steps on a [Cloud TPU Pod v3-256](https://cloud.google.com/tpu/docs/types-zones#types)
+Trained 22w steps on a [Cloud TPU Pod v3-256](https://cloud.google.com/tpu/docs/types-zones#types)

 ![loss](./.github/loss.png)

configs/large.json (+2 −3)

@@ -1,5 +1,5 @@
 {
-  "vocab_size": 50270,
+  "vocab_size": 8021,
   "hidden_size": 1024,
   "attention_probs_dropout_prob": 0.1,
   "hidden_dropout_prob": 0.1,
@@ -8,6 +8,5 @@
   "intermediate_size": 4096,
   "max_position_embeddings": 1024,
   "num_attention_heads": 16,
-  "num_hidden_layers": 24,
-  "max_batch_size_per_core": 3
+  "num_hidden_layers": 24
 }

configs/mega.json (+2 −2)

@@ -1,5 +1,5 @@
 {
-  "vocab_size": 21130,
+  "vocab_size": 8021,
   "hidden_size": 1536,
   "attention_probs_dropout_prob": 0.1,
   "hidden_dropout_prob": 0.1,
@@ -9,4 +9,4 @@
   "max_position_embeddings": 1024,
   "num_attention_heads": 24,
   "num_hidden_layers": 48
-}
+}

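Both configs now use the smaller 8021-entry vocabulary, and large.json no longer carries max_batch_size_per_core. A small inspection sketch, assuming it is run from the repository root with the plain json module (not the repo's own config loader):

```python
import json

# Print the key hyperparameters of the two updated configs side by side.
for name in ("configs/large.json", "configs/mega.json"):
    with open(name) as f:
        cfg = json.load(f)
    print(name, cfg["vocab_size"], cfg["hidden_size"],
          cfg["num_hidden_layers"], cfg["num_attention_heads"])
    assert cfg["vocab_size"] == 8021  # shared vocabulary size after this commit
```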
dataset/prepare_data.py (+3 −25)

@@ -69,21 +69,10 @@
     vocab_file="bert-base-chinese-vocab.txt", do_lower_case=True)


-class S3TFRecordWriter(object):
+class TFRecordWriter(object):
     def __init__(self, fn):
         self.fn = fn
-        if fn.startswith('s3://'):
-            from boto3.s3.transfer import TransferConfig
-            import boto3
-            self.gclient = None
-            self.s3client = boto3.client('s3',
-                                         )
-            self.storage_dir = TemporaryDirectory()
-            self.writer = tf.python_io.TFRecordWriter(
-                os.path.join(self.storage_dir.name, 'temp.tfrecord'))
-            self.bucket_name, self.file_name = self.fn.split(
-                's3://', 1)[1].split('/', 1)
-        elif fn.startswith('gs://'):
+        if fn.startswith('gs://'):
             from google.cloud import storage
             self.s3client = None
             self.gclient = storage.Client()
@@ -107,17 +96,6 @@ def write(self, x):
     def close(self):
         self.writer.close()

-        if self.s3client is not None:
-            from boto3.s3.transfer import TransferConfig
-            config = TransferConfig(multipart_threshold=1024 * 25, max_concurrency=10,
-                                    multipart_chunksize=1024 * 25, use_threads=True)
-            self.s3client.upload_file(
-                os.path.join(self.storage_dir.name, 'temp.tfrecord'),
-                self.bucket_name,
-                self.file_name,
-                ExtraArgs={'ACL': 'public-read'}, Config=config,
-            )
-            self.storage_dir.cleanup()
         if self.gclient is not None:
             bucket = self.gclient.get_bucket(self.bucket_name)
             blob = bucket.blob(self.file_name)
@@ -183,7 +161,7 @@ def buffered_and_sliding_window_article_iterator(tokenizer, final_desired_size=1
     # OK now write the tfrecord file
     total_written = 0
     train_file = args.base_fn + 'train_wiki19_{:04d}.tfrecord'.format(args.fold)
-    with S3TFRecordWriter(train_file) as train_writer:
+    with TFRecordWriter(train_file) as train_writer:
         for article in buffered_and_sliding_window_article_iterator(tokenizer,
                                                                     final_desired_size=args.max_seq_length + 1):
             writer2use = train_writer

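The commit drops the S3 upload path; the retained gs:// branch writes the TFRecord to a temporary local file and uploads it with the google-cloud-storage client on close(). A standalone sketch of that same pattern, with a placeholder bucket name and record contents rather than values from the repo, using the TensorFlow 1.15 API pinned in requirements:

```python
import os
from tempfile import TemporaryDirectory

import tensorflow as tf  # 1.15.x, as pinned in requirements-*.txt
from google.cloud import storage

bucket_name, file_name = "my-bucket", "data/train_0000.tfrecord"  # placeholders

with TemporaryDirectory() as tmp:
    # Write a TFRecord locally first...
    local_path = os.path.join(tmp, "temp.tfrecord")
    with tf.python_io.TFRecordWriter(local_path) as writer:
        example = tf.train.Example(features=tf.train.Features(feature={
            "input_ids": tf.train.Feature(
                int64_list=tf.train.Int64List(value=[101, 102])),
        }))
        writer.write(example.SerializeToString())

    # ...then upload it to the GCS bucket (uses application default credentials).
    client = storage.Client()
    blob = client.get_bucket(bucket_name).blob(file_name)
    blob.upload_from_filename(local_path)
```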
pretrained_model_demo.ipynb (+21 −23)

@@ -1,24 +1,8 @@
 {
-    "nbformat": 4,
-    "nbformat_minor": 0,
-    "metadata": {
-        "colab": {
-            "name": "15 亿参数 GPT2 中文预训练模型 | 1.5B GPT2 Pretrained Chinese Model",
-            "provenance": [],
-            "collapsed_sections": []
-        },
-        "kernelspec": {
-            "name": "python3",
-            "display_name": "Python 3"
-        },
-        "accelerator": "GPU"
-    },
     "cells": [
         {
             "cell_type": "markdown",
-            "execution_count": null,
             "metadata": {},
-            "outputs": [],
             "source": [
                 "[![GitHub stars](https://img.shields.io/github/stars/imcaspar/gpt2-ml?style=social)](https://github.com/imcaspar/gpt2-ml)\n",
                 "[![GitHub](https://img.shields.io/github/license/imcaspar/gpt2-ml)](https://github.com/imcaspar/gpt2-ml)\n",
@@ -38,17 +22,17 @@
             "source": [
                 "#@title #Prerequisites\n",
                 "#%tensorflow_version 1.x\n",
-                "!pip install -I tensorflow==1.15.0 &> tmp.log\n",
+                "!pip install -I tensorflow==1.15.2 &> tmp.log\n",
                 "!git clone -q https://github.com/imcaspar/gpt2-ml\n",
                 "%cd /content/gpt2-ml\n",
                 "!mkdir -p /content/gpt2-ml/models/mega\n",
                 "\n",
-                "!perl 3rd/gdown.pl/gdown.pl https://drive.google.com/open?id=1IzWpQ6I2IgfV7CldZvFJnZ9byNDZdO4n models/mega/model.ckpt-100000.data-00000-of-00001\n",
-                "!wget -q --show-progress https://github.com/imcaspar/gpt2-ml/releases/download/v0.5/model.ckpt-100000.index -P models/mega\n",
-                "!wget -q --show-progress https://github.com/imcaspar/gpt2-ml/releases/download/v0.5/model.ckpt-100000.meta -P models/mega\n",
+                "!perl 3rd/gdown.pl/gdown.pl https://drive.google.com/open?id=1bhZhjNSgV1CGI-PR2NDzNtda5FAwN29B models/mega/model.ckpt-220000.data-00000-of-00001\n",
+                "!wget -q --show-progress https://github.com/imcaspar/gpt2-ml/releases/download/v1.0/model.ckpt-220000.index -P models/mega\n",
+                "!wget -q --show-progress https://github.com/imcaspar/gpt2-ml/releases/download/v1.0/model.ckpt-220000.meta -P models/mega\n",
                 "!echo 'Download finished.'\n",
                 "# If gdown.pl failed, please uncomment following code & exec\n",
-                "# !python3 scripts/down_gdrive_file.py -file_id='1n_5-tgPpQ1gqbyLPbP1PwiFi2eo7SWw_' -file_path='models/mega/model.ckpt-100000.data-00000-of-00001'"
+                "# !python3 scripts/down_gdrive_file.py -file_id='1bhZhjNSgV1CGI-PR2NDzNtda5FAwN29B' -file_path='models/mega/model.ckpt-220000.data-00000-of-00001'"
             ]
         },
         {
@@ -60,8 +44,22 @@
                 "#@title #Inference\n",
                 "min_length = 150#@param {type:\"number\", min:5, max:1024, step:1}\n",
                 "sample_num = 5#@param {type:\"number\", min:1, max:50, step:1}\n",
-                "!PYTHONPATH=$(pwd) python3 scripts/interactive_conditional_samples.py -model_config_fn configs/mega.json -model_ckpt models/mega/model.ckpt-100000 -eos_token 511 -min_len $min_length -samples $sample_num"
+                "!PYTHONPATH=$(pwd) python3 scripts/interactive_conditional_samples.py -model_config_fn configs/mega.json -model_ckpt models/mega/model.ckpt-220000 -eos_token 102 -min_len $min_length -samples $sample_num"
             ]
         }
-    ]
+    ],
+    "metadata": {
+        "colab": {
+            "name": "15 亿参数 GPT2 中文预训练模型 | 1.5B GPT2 Pretrained Chinese Model",
+            "provenance": [],
+            "collapsed_sections": []
+        },
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "accelerator": "GPU"
+    },
+    "nbformat": 4,
+    "nbformat_minor": 0
 }

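The demo now pulls the three pieces of the 22w-step checkpoint into models/mega before inference. A small sketch, assuming the notebook's working directory and the filenames shown above, to confirm all three files landed before launching the inference cell:

```python
import os

# The three files that make up the v1.0 checkpoint, as downloaded by the demo cell.
ckpt_prefix = "models/mega/model.ckpt-220000"
expected = [ckpt_prefix + suffix
            for suffix in (".data-00000-of-00001", ".index", ".meta")]

missing = [p for p in expected if not os.path.exists(p)]
print("All checkpoint files present." if not missing else "Missing: %s" % missing)
```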
requirements-gpu.txt (+7 −5)

@@ -1,8 +1,10 @@
 pandas==0.24.2
 regex==2019.4.14
-h5py==2.9.0
-numpy==1.16.2
-tensorboard==1.13.1
-tensorflow-gpu==1.13.1
+h5py==2.10.0
+numpy==1.18.4
+tensorboard==1.15.0
+tensorflow-gpu==1.15.2
+tensorflow-estimator==1.15.1
 tqdm==4.31.1
-requests==2.22.0
+requests==2.22.0
+ujson==2.0.3

requirements-tpu.txt (+7 −6)

@@ -1,9 +1,10 @@
 pandas==0.24.2
 regex==2019.4.14
-h5py==2.9.0
-numpy==1.16.2
-tensorboard==1.13.1
-tensorflow==1.13.1
-tensorflow-estimator==1.13.0
+h5py==2.10.0
+numpy==1.18.4
+tensorboard==1.15.0
+tensorflow==1.15.2
+tensorflow-estimator==1.15.1
 tqdm==4.31.1
-requests==2.22.0
+requests==2.22.0
+ujson==2.0.3

scripts/interactive_conditional_samples.py (+1 −1)

@@ -141,7 +141,7 @@ def extract_generated_target(output_tokens, tokenizer):

 args = parser.parse_args()
 proj_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-vocab_file_path = os.path.join(proj_root_path, "tokenization/bert-base-chinese-vocab.txt")
+vocab_file_path = os.path.join(proj_root_path, "tokenization/clue-vocab.txt")
 tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path , do_lower_case=True)
 news_config = GroverConfig.from_json_file(args.model_config_fn)

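The switch to tokenization/clue-vocab.txt is what the smaller vocab_size in the configs corresponds to. A hedged consistency sketch, assuming the repo root is on PYTHONPATH (as in the Colab demo) and that the ported FullTokenizer keeps BERT's .vocab attribute:

```python
import json

import tokenization  # repo's ported bert tokenizer

tokenizer = tokenization.FullTokenizer(
    vocab_file="tokenization/clue-vocab.txt", do_lower_case=True)

with open("configs/mega.json") as f:
    cfg = json.load(f)

# The model's embedding table (vocab_size) should cover every id the tokenizer
# can emit from the CLUE vocabulary.
print(len(tokenizer.vocab), cfg["vocab_size"])
assert len(tokenizer.vocab) <= cfg["vocab_size"]
```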