
Commit

optimize the way of appending. (#126)
* optimize the way of appending.

* optimize the way of appending.

---------

Co-authored-by: wheatxzhang <[email protected]>
Winter523 and wheatxzhang authored Apr 25, 2024
1 parent c13cf17 commit dc155e4
Showing 12 changed files with 70 additions and 86 deletions.
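
The pattern is the same across all touched files: a per-element while/append padding loop is replaced by a single list concatenation, and the PAD id lookup moves inside the length check so it only runs when padding is actually needed. A minimal sketch of the before/after shape (illustrative only; seq_length and pad_id stand in for args.seq_length and the tokenizer's PAD token id):

    def pad_old(src, seg, seq_length, pad_id):
        # Old pattern: grow the lists one element at a time.
        while len(src) < seq_length:
            src.append(pad_id)
            seg.append(0)
        return src, seg

    def pad_new(src, seg, seq_length, pad_id):
        # New pattern: build the padding once and extend in a single step.
        if len(src) < seq_length:
            src += [pad_id] * (seq_length - len(src))
            seg += [0] * (seq_length - len(seg))
        return src, seg

Since += on a list extends it in place, callers that already hold a reference to src or seg still see the padded result, so the new form behaves the same as the old append loop.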
8 changes: 4 additions & 4 deletions finetune/run_c3.py
@@ -102,10 +102,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
 
             dataset[-1][0].append(src)
             dataset[-1][2].append(seg)
6 changes: 3 additions & 3 deletions finetune/run_chid.py
@@ -109,9 +109,9 @@ def read_dataset(args, data_path, answer_path):
             src = args.tokenizer.convert_tokens_to_ids(tokens)[: args.seq_length]
             seg = [0] * len(src)
 
-            while len(src) < args.seq_length:
-                src.append(0)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                src += [0] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
 
             dataset[-1][0].append(src)
             dataset[-1][2].append(seg)
8 changes: 4 additions & 4 deletions finetune/run_classifier.py
@@ -160,10 +160,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             if args.soft_targets and "logits" in columns.keys():
                 dataset.append((src, tgt, seg, soft_tgt))
             else:
9 changes: 5 additions & 4 deletions finetune/run_classifier_deepspeed.py
@@ -51,10 +51,11 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
+
             if args.soft_targets and "logits" in columns.keys():
                 dataset[index].append((src, tgt, seg, 0, soft_tgt))
             else:
8 changes: 4 additions & 4 deletions finetune/run_classifier_multi_label.py
@@ -105,10 +105,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
 
             dataset.append((src, tgt, seg))
 
8 changes: 4 additions & 4 deletions finetune/run_classifier_prompt.py
@@ -104,10 +104,10 @@ def read_dataset(args, path):
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
 
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             tgt = [0] * len(src)
             # Ignore the sentence which the answer is not in a sequence
             if mask_position >= args.seq_length:
8 changes: 4 additions & 4 deletions finetune/run_cmrc.py
@@ -116,10 +116,10 @@ def convert_examples_to_dataset(args, examples):
             src_b = args.tokenizer.convert_tokens_to_ids(args.tokenizer.tokenize(span_context) + [SEP_TOKEN])
             src = src_a + src_b
             seg = [1] * len(src_a) + [2] * len(src_b)
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
 
             dataset.append((src, seg, start_position, end_position, answers, question_id, len(question), doc_span_index, start_offset))
     return dataset
8 changes: 4 additions & 4 deletions finetune/run_dbqa.py
@@ -41,10 +41,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append((src, tgt, seg, qid))
 
     return dataset
10 changes: 5 additions & 5 deletions finetune/run_ner.py
@@ -110,11 +110,11 @@ def read_dataset(args, path):
                 src = src[: args.seq_length]
                 tgt = tgt[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                tgt.append(args.labels_num - 1)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                tgt += [args.labels_num - 1] * (args.seq_length - len(tgt))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append([src, tgt, seg])
 
     return dataset
8 changes: 4 additions & 4 deletions finetune/run_regression.py
@@ -73,10 +73,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append((src, tgt, seg))
 
     return dataset
14 changes: 7 additions & 7 deletions finetune/run_text2text.py
@@ -95,13 +95,13 @@ def read_dataset(args, path):
                 tgt_seg = tgt_seg[: args.tgt_seq_length]
             tgt_out = tgt_in[1:] + [PAD_ID]
 
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
-            while len(tgt_in) < args.tgt_seq_length:
-                tgt_in.append(PAD_ID)
-                tgt_out.append(PAD_ID)
-                tgt_seg.append(0)
+            if len(src) < args.seq_length:
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
+            if len(tgt_in) < args.tgt_seq_length:
+                tgt_in += [PAD_ID] * (args.tgt_seq_length - len(tgt_in))
+                tgt_out += [PAD_ID] * (args.tgt_seq_length - len(tgt_out))
+                tgt_seg += [0] * (args.tgt_seq_length - len(tgt_seg))
 
             dataset.append((src, tgt_in, tgt_out, seg, tgt_seg))
 
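
For a rough sense of the win, a small timing sketch comparing the two padding styles (illustrative only; absolute numbers depend on the machine, the sequence length, and how much padding each example needs):

    import timeit

    SEQ_LENGTH = 512
    PAD_ID = 0

    def pad_append(n_filled):
        # Old style: one append per padded position.
        src = list(range(n_filled))
        while len(src) < SEQ_LENGTH:
            src.append(PAD_ID)
        return src

    def pad_batched(n_filled):
        # New style: one concatenation for all padded positions.
        src = list(range(n_filled))
        if len(src) < SEQ_LENGTH:
            src += [PAD_ID] * (SEQ_LENGTH - len(src))
        return src

    for name, fn in (("append loop", pad_append), ("batched +=", pad_batched)):
        t = timeit.timeit(lambda: fn(64), number=100000)
        print(f"{name:12s}: {t:.3f}s for 100k calls")

The batched form avoids a Python-level loop and a bound-method call per padded token, so the gap grows with the amount of padding.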
61 changes: 22 additions & 39 deletions tencentpretrain/utils/dataloader.py
@@ -75,8 +75,7 @@ def __iter__(self):
 
             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
                 if len(ins) == 4:
                     src.append(src_single)
@@ -125,8 +124,7 @@ def __iter__(self):
 
             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
                 if len(ins) == 3:
                     src.append(src_single)
@@ -177,8 +175,7 @@ def __iter__(self):
 
             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single[:-1])
                 tgt.append(src_single[1:])
                 seg.append([1] * ins[1][0] + [0] * (len(src_single) - 1 - ins[1][0]))
@@ -212,10 +209,9 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
                 tgt_forward_single, tgt_backward_single = ins[1], ins[2]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    tgt_forward_single.append(self.vocab.get(PAD_TOKEN))
-                    tgt_backward_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                tgt_forward_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                tgt_backward_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single)
                 tgt_forward.append(tgt_forward_single)
                 tgt_backward.append(tgt_backward_single)
@@ -247,11 +243,9 @@ def __iter__(self):
 
             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 tgt_single, pad_num = ins[1]
-                for _ in range(pad_num):
-                    tgt_single.append(self.vocab.get(PAD_TOKEN))
+                tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
                 src.append(src_single)
                 tgt_in.append(tgt_single[:-1])
@@ -289,8 +283,7 @@ def __iter__(self):
 
             for _, ins in enumerate(instances):
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
                 if len(ins) == 3:
                     tgt_single = ins[1]
@@ -376,11 +369,9 @@ def __iter__(self):
 
             for _, ins in enumerate(instances):
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 tgt_single, pad_num = ins[1]
-                for _ in range(pad_num):
-                    tgt_single.append(self.vocab.get(PAD_TOKEN))
+                tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
                 src_single, _ = mask_seq(src_single, self.tokenizer, self.whole_word_masking, self.span_masking,
                                          self.span_geo_prob, self.span_max_length)
@@ -442,9 +433,8 @@ def __iter__(self):
                 elif len(seg_pos_single) == 2:
                     seg_single = [1] * seg_pos_single[0] + [2] * seg_pos_single[1]
 
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    seg_single.append(0)
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                seg_single += [0] * pad_num
 
                 src.append(src_single)
                 tgt.append(ins[1])
@@ -474,9 +464,8 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
                 tgt_single = ins[1]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    tgt_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single)
                 tgt.append(tgt_single)
                 seg.append([1] * ins[2][0] + [2] * (ins[2][1] - ins[2][0]) + [0] * (len(src_single) - ins[2][1]))
@@ -515,9 +504,8 @@ def __iter__(self):
                 elif len(seg_pos_single) == 2:
                     seg_single = [1] * seg_pos_single[0] + [2] * seg_pos_single[1]
 
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    seg_single.append(0)
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                seg_single += [0] * pad_num
                 seg.append(seg_single)
 
                 if len(ins) == 4 :
@@ -643,8 +631,7 @@ def __iter__(self):
 
             for ins in instances:
                 src_text_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_text_single.append(self.vocab.get(PAD_TOKEN))
+                src_text_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src_text_single, tgt_mlm_single = mask_seq(src_text_single, self.tokenizer, self.whole_word_masking, self.span_masking, self.span_geo_prob, self.span_max_length)
                 src_text.append(src_text_single)
                 masked_words_num += len(tgt_mlm_single)
@@ -709,8 +696,7 @@ def __iter__(self):
             seg_image = []
             for ins in instances:
                 src_text_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_text_single.append(self.vocab.get(PAD_TOKEN))
+                src_text_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
                 src_text.append(src_text_single)
                 seg_text.append([1] * ins[1][0] + [0] * pad_num)
@@ -788,8 +774,7 @@ def __iter__(self):
 
             for ins in instances:
                 text_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    text_single.append(self.vocab.get(PAD_TOKEN))
+                text_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
                 waveform, _ = torchaudio.load(ins[2]) # waveform, sample_rate
                 waveform = waveform * (2 ** 15) # Kaldi compliance: 16-bit signed integers
@@ -924,8 +909,7 @@ def __iter__(self):
                 image = self.transform(image)
                 image_tokens = [i + self.vocab_bias for i in image_tokenize(self.vqgan, image)]
                 src_single.extend(image_tokens)
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 seg_single = [1] * ins[1][0] + [2] * len(image_tokens) + [0] * pad_num
                 src.append(src_single)
                 tgt.append(src_single[1:] + [self.vocab.get(SEP_TOKEN)])
@@ -954,8 +938,7 @@ def __iter__(self):
 
             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single[:-1])
                 tgt.append(src_single[1:])
                 if ins[1][0] > 0:
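
In dataloader.py the instances already carry the pad count, so each field is padded with one concatenation instead of a counted append loop. A minimal sketch of that variant (pad_token_id stands in for self.vocab.get(PAD_TOKEN); the function name is illustrative, not part of the repository):

    def pad_instance(src_single, seg_single, pad_num, pad_token_id):
        # Build each padding run once and extend the existing lists;
        # += mutates the lists in place, so other references to them
        # see the padded sequences, as with the old append loop.
        src_single += [pad_token_id] * pad_num
        seg_single += [0] * pad_num
        return src_single, seg_single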
