diff --git a/scripts/vocab_extension/train_merge_tokenizer.sh b/scripts/vocab_extension/train_merge_tokenizer.sh
index 43398613a..2e63e84b6 100644
--- a/scripts/vocab_extension/train_merge_tokenizer.sh
+++ b/scripts/vocab_extension/train_merge_tokenizer.sh
@@ -14,7 +14,8 @@ python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_dat
     --model_type bpe \
     --output_dir ./output_models/new_tokenizer \
     --user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \
-    --vocab_size 20000
+    --vocab_size 20000 \
+    --max_sentencepiece_length 4
 
 # merge the new tokenizer with the old one
 mkdir -p ./output_models/merged_tokenizer
diff --git a/scripts/vocab_extension/train_tokenizer.sh b/scripts/vocab_extension/train_tokenizer.sh
index f58347b6b..d61275499 100644
--- a/scripts/vocab_extension/train_tokenizer.sh
+++ b/scripts/vocab_extension/train_tokenizer.sh
@@ -4,4 +4,5 @@ python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_dat
     --model_type bpe \
     --output_dir ./output_models/new_tokenizer \
     --user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \
-    --vocab_size 20000
\ No newline at end of file
+    --vocab_size 20000 \
+    --max_sentencepiece_length 4
\ No newline at end of file
diff --git a/utils/merge_tokenizer.py b/utils/merge_tokenizer.py
index 81cd0109e..e0fdd87e7 100644
--- a/utils/merge_tokenizer.py
+++ b/utils/merge_tokenizer.py
@@ -81,5 +81,5 @@
 text='''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
 The primary use of LLaMA is research on large language models, including'''
 logging.info(f"Test text:\n %s",text)
-logging.info(f"Tokenized by LLaMA tokenizer:%s",old_tokenizer.tokenize(text))
-logging.info(f"Tokenized by Chinese-LLaMA tokenizer:%s",new_tokenizer.tokenize(text))
\ No newline at end of file
+logging.info(f"Tokenized by original tokenizer:%s",old_tokenizer.tokenize(text))
+logging.info(f"Tokenized by merged tokenizer:%s",new_tokenizer.tokenize(text))
\ No newline at end of file
diff --git a/utils/train_tokenizer.py b/utils/train_tokenizer.py
index 31b1f79b5..48ce3c2b2 100644
--- a/utils/train_tokenizer.py
+++ b/utils/train_tokenizer.py
@@ -13,6 +13,7 @@
 parser.add_argument('--vocab_size', default=20000, type=int, required=False)
 parser.add_argument('--model_type', default='bpe', type=str, required=False)
 parser.add_argument('--user_defined_symbols', default='0,1,2,3,4,5,6,7,8,9,%', type=str, required=False)
+parser.add_argument('--max_sentencepiece_length', default=4, type=int, required=False)
 args = parser.parse_args()
 
 dataset_path = args.dataset_path
@@ -20,10 +21,11 @@
 vocab_size = args.vocab_size
 model_type = args.model_type
 user_defined_symbols = args.user_defined_symbols
-
+max_sentencepiece_length=args.max_sentencepiece_length
+
 def mkdir(path):
     if not os.path.exists(path):
         os.makedirs(path)
 
 mkdir(output_dir)
-spm.SentencePieceTrainer.train('--input={} --model_prefix={} --model_type={} --vocab_size={} --user_defined_symbols={} --minloglevel=1'.format(dataset_path,output_dir+'/example',model_type,vocab_size,user_defined_symbols))
\ No newline at end of file
+spm.SentencePieceTrainer.train('--input={} --model_prefix={} --model_type={} --vocab_size={} --user_defined_symbols={} --max_sentencepiece_length={} --minloglevel=1'.format(dataset_path,output_dir+'/example',model_type,vocab_size,user_defined_symbols,max_sentencepiece_length))
\ No newline at end of file
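
Note: the new --max_sentencepiece_length argument is forwarded to SentencePiece's max_sentencepiece_length training option, which caps how many characters a learned piece may contain (SentencePiece's own default is 16; these scripts lower it to 4, presumably to bias the new vocabulary toward short pieces before merging). Below is a minimal sketch of the equivalent call using SentencePiece's keyword-argument interface rather than the flag string built in utils/train_tokenizer.py; the corpus path is a placeholder, not a path from this patch.

import sentencepiece as spm

# Sketch only: mirrors the options assembled in utils/train_tokenizer.py,
# passed as keyword arguments instead of a single flag string.
spm.SentencePieceTrainer.train(
    input="path/to/corpus.txt",                        # placeholder corpus file
    model_prefix="./output_models/new_tokenizer/example",
    model_type="bpe",
    vocab_size=20000,
    user_defined_symbols="0,1,2,3,4,5,6,7,8,9,%",      # digits and % kept as single tokens
    max_sentencepiece_length=4,                        # cap learned pieces at 4 characters
    minloglevel=1,                                     # reduce training log verbosity
)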