
Commit 0140917

Merge pull request #71 from imcaspar/dev
v1.0 release
2 parents 107a37c + fc8d2b1 commit 0140917

13 files changed: +8080 −75 lines

.gitignore (+7 −1)

@@ -129,4 +129,10 @@ dmypy.json
 .pyre/

 # vscode
-.vscode/
+.vscode/
+
+# dataset
+dataset/raw/
+
+# models
+models/

README.md (+4 −4)

@@ -14,17 +14,17 @@
 - [x] Ported bert tokenizer, multilingual corpus compatible
 - [x] 1.5B GPT2 pretrained Chinese model ( ~15G corpus, 10w steps )
 - [x] Batteries-included Colab demo [#](https://github.com/imcaspar/gpt2-ml#google-colab)
-- [ ] 1.5B GPT2 pretrained Chinese model ( ~50G corpus, 100w steps )
+- [x] 1.5B GPT2 pretrained Chinese model ( ~30G corpus, 22w steps )


 ## Pretrained Model
-1.5B GPT2 pretrained Chinese model [**[Google Drive]**](https://drive.google.com/file/d/1IzWpQ6I2IgfV7CldZvFJnZ9byNDZdO4n)
+1.5B GPT2 pretrained Chinese model ( ~30G corpus, 22w steps ) [**[Google Drive]**](https://drive.google.com/file/d/1bhZhjNSgV1CGI-PR2NDzNtda5FAwN29B) SHA256: e698cc97a7f5f706f84f58bb469d614e51d3c0ce5f9ab9bf77e01e3fcb41d482

-SHA256: 4a6e5124df8db7ac2bdd902e6191b807a6983a7f5d09fb10ce011f9a073b183e
+1.5B GPT2 pretrained Chinese model ( ~15G corpus, 10w steps ) [**[Google Drive]**](https://drive.google.com/file/d/1IzWpQ6I2IgfV7CldZvFJnZ9byNDZdO4n) SHA256: 4a6e5124df8db7ac2bdd902e6191b807a6983a7f5d09fb10ce011f9a073b183e

 Corpus from [THUCNews](http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews) and [nlp_chinese_corpus](https://github.com/brightmart/nlp_chinese_corpus)

-Using [Cloud TPU Pod v3-256](https://cloud.google.com/tpu/docs/types-zones#types) to train 10w steps
+Using [Cloud TPU Pod v3-256](https://cloud.google.com/tpu/docs/types-zones#types) to train 22w steps

 ![loss](./.github/loss.png)

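Both checkpoints in the README now ship with a published SHA256. Below is a minimal verification sketch, not part of the commit: hashlib is from the Python standard library, and the local filename is an assumption to adjust to wherever the Google Drive download was saved.

```python
import hashlib

# Hypothetical local path of the downloaded model file (not a name from the repo).
path = "gpt2_ml_model_220000"
expected = "e698cc97a7f5f706f84f58bb469d614e51d3c0ce5f9ab9bf77e01e3fcb41d482"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

print("OK" if sha.hexdigest() == expected else "Checksum mismatch!")
```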
README_CN.md (+4 −4)

@@ -14,17 +14,17 @@
 - [x] Ported bert tokenizer, multilingual support added
 - [x] 1.5B-parameter GPT2 pretrained Chinese model ( 15G corpus, 10w steps )
 - [x] Batteries-included generation demo [#](https://github.com/imcaspar/gpt2-ml#google-colab)
-- [ ] 1.5B-parameter GPT2 pretrained Chinese model ( 50G corpus, 100w steps, **expected in early December** )
+- [x] 1.5B-parameter GPT2 pretrained Chinese model ( 30G corpus, 22w steps )


 ## Pretrained Model
-1.5B-parameter pretrained Chinese model [**[Google Drive download]**](https://drive.google.com/file/d/1IzWpQ6I2IgfV7CldZvFJnZ9byNDZdO4n)
+1.5B-parameter pretrained Chinese model ( 30G corpus, 22w steps ) [**[Google Drive download]**](https://drive.google.com/file/d/1bhZhjNSgV1CGI-PR2NDzNtda5FAwN29B) SHA256: e698cc97a7f5f706f84f58bb469d614e51d3c0ce5f9ab9bf77e01e3fcb41d482

-SHA256: 4a6e5124df8db7ac2bdd902e6191b807a6983a7f5d09fb10ce011f9a073b183e
+1.5B-parameter pretrained Chinese model ( 15G corpus, 10w steps ) [**[Google Drive download]**](https://drive.google.com/file/d/1IzWpQ6I2IgfV7CldZvFJnZ9byNDZdO4n) SHA256: 4a6e5124df8db7ac2bdd902e6191b807a6983a7f5d09fb10ce011f9a073b183e

 Training corpus from [THUCNews](http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews) and [nlp_chinese_corpus](https://github.com/brightmart/nlp_chinese_corpus), roughly 15G of text after cleaning

-Trained 10w steps on a [Cloud TPU Pod v3-256](https://cloud.google.com/tpu/docs/types-zones#types)
+Trained 22w steps on a [Cloud TPU Pod v3-256](https://cloud.google.com/tpu/docs/types-zones#types)

 ![loss](./.github/loss.png)

configs/large.json (+2 −3)

@@ -1,5 +1,5 @@
 {
-  "vocab_size": 50270,
+  "vocab_size": 8021,
   "hidden_size": 1024,
   "attention_probs_dropout_prob": 0.1,
   "hidden_dropout_prob": 0.1,
@@ -8,6 +8,5 @@
   "intermediate_size": 4096,
   "max_position_embeddings": 1024,
   "num_attention_heads": 16,
-  "num_hidden_layers": 24,
-  "max_batch_size_per_core": 3
+  "num_hidden_layers": 24
 }

configs/mega.json (+2 −2)

@@ -1,5 +1,5 @@
 {
-  "vocab_size": 21130,
+  "vocab_size": 8021,
   "hidden_size": 1536,
   "attention_probs_dropout_prob": 0.1,
   "hidden_dropout_prob": 0.1,
@@ -9,4 +9,4 @@
   "max_position_embeddings": 1024,
   "num_attention_heads": 24,
   "num_hidden_layers": 48
-}
+}

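Both configs now use the smaller 8021-entry vocabulary, and large.json no longer carries max_batch_size_per_core. A small inspection sketch, assuming it is run from the repository root with the plain json module (not the repo's own config loader):

```python
import json

# Print the key hyperparameters of the two updated configs side by side.
for name in ("configs/large.json", "configs/mega.json"):
    with open(name) as f:
        cfg = json.load(f)
    print(name, cfg["vocab_size"], cfg["hidden_size"],
          cfg["num_hidden_layers"], cfg["num_attention_heads"])
    assert cfg["vocab_size"] == 8021  # shared vocabulary size after this commit
```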
dataset/prepare_data.py (+3 −25)

@@ -69,21 +69,10 @@
     vocab_file="bert-base-chinese-vocab.txt", do_lower_case=True)


-class S3TFRecordWriter(object):
+class TFRecordWriter(object):
     def __init__(self, fn):
         self.fn = fn
-        if fn.startswith('s3://'):
-            from boto3.s3.transfer import TransferConfig
-            import boto3
-            self.gclient = None
-            self.s3client = boto3.client('s3',
-                                         )
-            self.storage_dir = TemporaryDirectory()
-            self.writer = tf.python_io.TFRecordWriter(
-                os.path.join(self.storage_dir.name, 'temp.tfrecord'))
-            self.bucket_name, self.file_name = self.fn.split(
-                's3://', 1)[1].split('/', 1)
-        elif fn.startswith('gs://'):
+        if fn.startswith('gs://'):
             from google.cloud import storage
             self.s3client = None
             self.gclient = storage.Client()
@@ -107,17 +96,6 @@ def write(self, x):
     def close(self):
         self.writer.close()

-        if self.s3client is not None:
-            from boto3.s3.transfer import TransferConfig
-            config = TransferConfig(multipart_threshold=1024 * 25, max_concurrency=10,
-                                    multipart_chunksize=1024 * 25, use_threads=True)
-            self.s3client.upload_file(
-                os.path.join(self.storage_dir.name, 'temp.tfrecord'),
-                self.bucket_name,
-                self.file_name,
-                ExtraArgs={'ACL': 'public-read'}, Config=config,
-            )
-            self.storage_dir.cleanup()
         if self.gclient is not None:
             bucket = self.gclient.get_bucket(self.bucket_name)
             blob = bucket.blob(self.file_name)
@@ -183,7 +161,7 @@ def buffered_and_sliding_window_article_iterator(tokenizer, final_desired_size=1
     # OK now write the tfrecord file
     total_written = 0
     train_file = args.base_fn + 'train_wiki19_{:04d}.tfrecord'.format(args.fold)
-    with S3TFRecordWriter(train_file) as train_writer:
+    with TFRecordWriter(train_file) as train_writer:
         for article in buffered_and_sliding_window_article_iterator(tokenizer,
                                                                     final_desired_size=args.max_seq_length + 1):
             writer2use = train_writer

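The commit drops the S3 upload path; the retained gs:// branch writes the TFRecord to a temporary local file and uploads it with the google-cloud-storage client on close(). A standalone sketch of that same pattern, with a placeholder bucket name and record contents rather than values from the repo, using the TensorFlow 1.15 API pinned in requirements:

```python
import os
from tempfile import TemporaryDirectory

import tensorflow as tf  # 1.15.x, as pinned in requirements-*.txt
from google.cloud import storage

bucket_name, file_name = "my-bucket", "data/train_0000.tfrecord"  # placeholders

with TemporaryDirectory() as tmp:
    # Write a TFRecord locally first...
    local_path = os.path.join(tmp, "temp.tfrecord")
    with tf.python_io.TFRecordWriter(local_path) as writer:
        example = tf.train.Example(features=tf.train.Features(feature={
            "input_ids": tf.train.Feature(
                int64_list=tf.train.Int64List(value=[101, 102])),
        }))
        writer.write(example.SerializeToString())

    # ...then upload it to the GCS bucket (uses application default credentials).
    client = storage.Client()
    blob = client.get_bucket(bucket_name).blob(file_name)
    blob.upload_from_filename(local_path)
```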
pretrained_model_demo.ipynb (+21 −23)

@@ -1,24 +1,8 @@
 {
-    "nbformat": 4,
-    "nbformat_minor": 0,
-    "metadata": {
-        "colab": {
-            "name": "15 亿参数 GPT2 中文预训练模型 | 1.5B GPT2 Pretrained Chinese Model",
-            "provenance": [],
-            "collapsed_sections": []
-        },
-        "kernelspec": {
-            "name": "python3",
-            "display_name": "Python 3"
-        },
-        "accelerator": "GPU"
-    },
     "cells": [
         {
             "cell_type": "markdown",
-            "execution_count": null,
             "metadata": {},
-            "outputs": [],
             "source": [
                 "[![GitHub stars](https://img.shields.io/github/stars/imcaspar/gpt2-ml?style=social)](https://github.com/imcaspar/gpt2-ml)\n",
                 "[![GitHub](https://img.shields.io/github/license/imcaspar/gpt2-ml)](https://github.com/imcaspar/gpt2-ml)\n",
@@ -38,17 +22,17 @@
             "source": [
                 "#@title #Prerequisites\n",
                 "#%tensorflow_version 1.x\n",
-                "!pip install -I tensorflow==1.15.0 &> tmp.log\n",
+                "!pip install -I tensorflow==1.15.2 &> tmp.log\n",
                 "!git clone -q https://github.com/imcaspar/gpt2-ml\n",
                 "%cd /content/gpt2-ml\n",
                 "!mkdir -p /content/gpt2-ml/models/mega\n",
                 "\n",
-                "!perl 3rd/gdown.pl/gdown.pl https://drive.google.com/open?id=1IzWpQ6I2IgfV7CldZvFJnZ9byNDZdO4n models/mega/model.ckpt-100000.data-00000-of-00001\n",
-                "!wget -q --show-progress https://github.com/imcaspar/gpt2-ml/releases/download/v0.5/model.ckpt-100000.index -P models/mega\n",
-                "!wget -q --show-progress https://github.com/imcaspar/gpt2-ml/releases/download/v0.5/model.ckpt-100000.meta -P models/mega\n",
+                "!perl 3rd/gdown.pl/gdown.pl https://drive.google.com/open?id=1bhZhjNSgV1CGI-PR2NDzNtda5FAwN29B models/mega/model.ckpt-220000.data-00000-of-00001\n",
+                "!wget -q --show-progress https://github.com/imcaspar/gpt2-ml/releases/download/v1.0/model.ckpt-220000.index -P models/mega\n",
+                "!wget -q --show-progress https://github.com/imcaspar/gpt2-ml/releases/download/v1.0/model.ckpt-220000.meta -P models/mega\n",
                 "!echo 'Download finished.'\n",
                 "# If gdown.pl failed, please uncomment following code & exec\n",
-                "# !python3 scripts/down_gdrive_file.py -file_id='1n_5-tgPpQ1gqbyLPbP1PwiFi2eo7SWw_' -file_path='models/mega/model.ckpt-100000.data-00000-of-00001'"
+                "# !python3 scripts/down_gdrive_file.py -file_id='1bhZhjNSgV1CGI-PR2NDzNtda5FAwN29B' -file_path='models/mega/model.ckpt-220000.data-00000-of-00001'"
             ]
         },
         {
@@ -60,8 +44,22 @@
                 "#@title #Inference\n",
                 "min_length = 150#@param {type:\"number\", min:5, max:1024, step:1}\n",
                 "sample_num = 5#@param {type:\"number\", min:1, max:50, step:1}\n",
-                "!PYTHONPATH=$(pwd) python3 scripts/interactive_conditional_samples.py -model_config_fn configs/mega.json -model_ckpt models/mega/model.ckpt-100000 -eos_token 511 -min_len $min_length -samples $sample_num"
+                "!PYTHONPATH=$(pwd) python3 scripts/interactive_conditional_samples.py -model_config_fn configs/mega.json -model_ckpt models/mega/model.ckpt-220000 -eos_token 102 -min_len $min_length -samples $sample_num"
             ]
         }
-    ]
+    ],
+    "metadata": {
+        "colab": {
+            "name": "15 亿参数 GPT2 中文预训练模型 | 1.5B GPT2 Pretrained Chinese Model",
+            "provenance": [],
+            "collapsed_sections": []
+        },
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "accelerator": "GPU"
+    },
+    "nbformat": 4,
+    "nbformat_minor": 0
 }

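The demo now pulls the three pieces of the 22w-step checkpoint into models/mega before inference. A small sketch, assuming the notebook's working directory and the filenames shown above, to confirm all three files landed before launching the inference cell:

```python
import os

# The three files that make up the v1.0 checkpoint, as downloaded by the demo cell.
ckpt_prefix = "models/mega/model.ckpt-220000"
expected = [ckpt_prefix + suffix
            for suffix in (".data-00000-of-00001", ".index", ".meta")]

missing = [p for p in expected if not os.path.exists(p)]
print("All checkpoint files present." if not missing else "Missing: %s" % missing)
```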
requirements-gpu.txt (+7 −5)

@@ -1,8 +1,10 @@
 pandas==0.24.2
 regex==2019.4.14
-h5py==2.9.0
-numpy==1.16.2
-tensorboard==1.13.1
-tensorflow-gpu==1.13.1
+h5py==2.10.0
+numpy==1.18.4
+tensorboard==1.15.0
+tensorflow-gpu==1.15.2
+tensorflow-estimator==1.15.1
 tqdm==4.31.1
-requests==2.22.0
+requests==2.22.0
+ujson==2.0.3

requirements-tpu.txt (+7 −6)

@@ -1,9 +1,10 @@
 pandas==0.24.2
 regex==2019.4.14
-h5py==2.9.0
-numpy==1.16.2
-tensorboard==1.13.1
-tensorflow==1.13.1
-tensorflow-estimator==1.13.0
+h5py==2.10.0
+numpy==1.18.4
+tensorboard==1.15.0
+tensorflow==1.15.2
+tensorflow-estimator==1.15.1
 tqdm==4.31.1
-requests==2.22.0
+requests==2.22.0
+ujson==2.0.3

scripts/interactive_conditional_samples.py (+1 −1)

@@ -141,7 +141,7 @@ def extract_generated_target(output_tokens, tokenizer):

 args = parser.parse_args()
 proj_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-vocab_file_path = os.path.join(proj_root_path, "tokenization/bert-base-chinese-vocab.txt")
+vocab_file_path = os.path.join(proj_root_path, "tokenization/clue-vocab.txt")
 tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path , do_lower_case=True)
 news_config = GroverConfig.from_json_file(args.model_config_fn)

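The switch to tokenization/clue-vocab.txt is what the smaller vocab_size in the configs corresponds to. A hedged consistency sketch, assuming the repo root is on PYTHONPATH (as in the Colab demo) and that the ported FullTokenizer keeps BERT's .vocab attribute:

```python
import json

import tokenization  # repo's ported bert tokenizer

tokenizer = tokenization.FullTokenizer(
    vocab_file="tokenization/clue-vocab.txt", do_lower_case=True)

with open("configs/mega.json") as f:
    cfg = json.load(f)

# The model's embedding table (vocab_size) should cover every id the tokenizer
# can emit from the CLUE vocabulary.
print(len(tokenizer.vocab), cfg["vocab_size"])
assert len(tokenizer.vocab) <= cfg["vocab_size"]
```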