
Commit

improve performance of training tokenizer
yaoguany committed Aug 7, 2023
1 parent d5dc682 commit 137c65c
Showing 4 changed files with 10 additions and 6 deletions.
3 changes: 2 additions & 1 deletion scripts/vocab_extension/train_merge_tokenizer.sh
@@ -14,7 +14,8 @@ python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_dat
--model_type bpe \
--output_dir ./output_models/new_tokenizer \
--user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \
---vocab_size 20000
+--vocab_size 20000 \
+--max_sentencepiece_length 4

# merge the new tokenizer with the old one
mkdir -p ./output_models/merged_tokenizer
3 changes: 2 additions & 1 deletion scripts/vocab_extension/train_tokenizer.sh
@@ -4,4 +4,5 @@ python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_dat
--model_type bpe \
--output_dir ./output_models/new_tokenizer \
--user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \
---vocab_size 20000
+--vocab_size 20000 \
+--max_sentencepiece_length 4
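
Both shell scripts now cap the learned SentencePiece piece length at 4 characters. As a rough sanity check (not part of this commit; it assumes the output directory used above and the 'example' model prefix set in utils/train_tokenizer.py), the longest pieces of the freshly trained tokenizer can be inspected like this:

import sentencepiece as spm

# Sanity-check sketch (assumption, not from this commit): list the longest
# learned pieces. With --max_sentencepiece_length 4, ordinary learned pieces
# stay short; control tokens such as <unk> are not subject to the cap.
sp = spm.SentencePieceProcessor(model_file="./output_models/new_tokenizer/example.model")
pieces = [sp.id_to_piece(i) for i in range(sp.get_piece_size())]
print(sorted(pieces, key=len, reverse=True)[:10])
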
4 changes: 2 additions & 2 deletions utils/merge_tokenizer.py
@@ -81,5 +81,5 @@
text='''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
The primary use of LLaMA is research on large language models, including'''
logging.info(f"Test text:\n %s",text)
logging.info(f"Tokenized by LLaMA tokenizer:%s",old_tokenizer.tokenize(text))
logging.info(f"Tokenized by Chinese-LLaMA tokenizer:%s",new_tokenizer.tokenize(text))
logging.info(f"Tokenized by original tokenizer:%s",old_tokenizer.tokenize(text))
logging.info(f"Tokenized by merged tokenizer:%s",new_tokenizer.tokenize(text))
6 changes: 4 additions & 2 deletions utils/train_tokenizer.py
@@ -13,17 +13,19 @@
parser.add_argument('--vocab_size', default=20000, type=int, required=False)
parser.add_argument('--model_type', default='bpe', type=str, required=False)
parser.add_argument('--user_defined_symbols', default='0,1,2,3,4,5,6,7,8,9,%', type=str, required=False)
+parser.add_argument('--max_sentencepiece_length', default=4, type=int, required=False)
args = parser.parse_args()

dataset_path = args.dataset_path
output_dir = args.output_dir
vocab_size = args.vocab_size
model_type = args.model_type
user_defined_symbols = args.user_defined_symbols

+max_sentencepiece_length=args.max_sentencepiece_length

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)
mkdir(output_dir)

-spm.SentencePieceTrainer.train('--input={} --model_prefix={} --model_type={} --vocab_size={} --user_defined_symbols={} --minloglevel=1'.format(dataset_path,output_dir+'/example',model_type,vocab_size,user_defined_symbols))
+spm.SentencePieceTrainer.train('--input={} --model_prefix={} --model_type={} --vocab_size={} --user_defined_symbols={} --max_sentencepiece_length={} --minloglevel=1'.format(dataset_path,output_dir+'/example',model_type,vocab_size,user_defined_symbols,max_sentencepiece_length))
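
The only functional change is threading the new --max_sentencepiece_length option through to SentencePiece; it caps the length of candidate pieces considered during training, which, per the commit title, is what speeds up tokenizer training. Purely for readability, the same call in SentencePiece's keyword-argument form, sketched as an equivalent of the single-string invocation above:

import sentencepiece as spm

# Keyword-argument sketch of the call above (behavioural equivalent,
# not part of this commit).
spm.SentencePieceTrainer.train(
    input=dataset_path,                                 # one sentence per line
    model_prefix=output_dir + '/example',
    model_type=model_type,                              # 'bpe' by default
    vocab_size=vocab_size,
    user_defined_symbols=user_defined_symbols,          # '0,1,2,3,4,5,6,7,8,9,%' kept as atomic tokens
    max_sentencepiece_length=max_sentencepiece_length,  # cap candidate piece length (4 here)
    minloglevel=1,                                      # suppress info-level training logs
)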
