diff --git a/finetune/run_c3.py b/finetune/run_c3.py
index dd52d04..4e97382 100644
--- a/finetune/run_c3.py
+++ b/finetune/run_c3.py
@@ -102,10 +102,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset[-1][0].append(src)
             dataset[-1][2].append(seg)
diff --git a/finetune/run_chid.py b/finetune/run_chid.py
index 2776ada..37e73b3 100644
--- a/finetune/run_chid.py
+++ b/finetune/run_chid.py
@@ -109,9 +109,9 @@ def read_dataset(args, data_path, answer_path):
             src = args.tokenizer.convert_tokens_to_ids(tokens)[: args.seq_length]
             seg = [0] * len(src)
-            while len(src) < args.seq_length:
-                src.append(0)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                src += [0] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset[-1][0].append(src)
             dataset[-1][2].append(seg)
diff --git a/finetune/run_classifier.py b/finetune/run_classifier.py
index 79d096f..991c2d3 100644
--- a/finetune/run_classifier.py
+++ b/finetune/run_classifier.py
@@ -160,10 +160,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             if args.soft_targets and "logits" in columns.keys():
                 dataset.append((src, tgt, seg, soft_tgt))
             else:
                 dataset.append((src, tgt, seg))
diff --git a/finetune/run_classifier_deepspeed.py b/finetune/run_classifier_deepspeed.py
index 6030de6..43ab9b8 100644
--- a/finetune/run_classifier_deepspeed.py
+++ b/finetune/run_classifier_deepspeed.py
@@ -51,10 +51,11 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
+
             if args.soft_targets and "logits" in columns.keys():
                 dataset[index].append((src, tgt, seg, 0, soft_tgt))
             else:
diff --git a/finetune/run_classifier_multi_label.py b/finetune/run_classifier_multi_label.py
index 01eb67c..1bf9e1b 100644
--- a/finetune/run_classifier_multi_label.py
+++ b/finetune/run_classifier_multi_label.py
@@ -105,10 +105,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append((src, tgt, seg))
diff --git a/finetune/run_classifier_prompt.py b/finetune/run_classifier_prompt.py
index b4f7051..8dc5336 100644
--- a/finetune/run_classifier_prompt.py
+++ b/finetune/run_classifier_prompt.py
@@ -104,10 +104,10 @@ def read_dataset(args, path):
             src = src[: args.seq_length]
             seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             tgt = [0] * len(src)
             # Ignore the sentence which the answer is not in a sequence
             if mask_position >= args.seq_length:
diff --git a/finetune/run_cmrc.py b/finetune/run_cmrc.py
index abdca41..b4cca8b 100644
--- a/finetune/run_cmrc.py
+++ b/finetune/run_cmrc.py
@@ -116,10 +116,10 @@ def convert_examples_to_dataset(args, examples):
             src_b = args.tokenizer.convert_tokens_to_ids(args.tokenizer.tokenize(span_context) + [SEP_TOKEN])
             src = src_a + src_b
             seg = [1] * len(src_a) + [2] * len(src_b)
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append((src, seg, start_position, end_position, answers, question_id, len(question), doc_span_index, start_offset))
     return dataset
diff --git a/finetune/run_dbqa.py b/finetune/run_dbqa.py
index 4b258b2..611f14e 100644
--- a/finetune/run_dbqa.py
+++ b/finetune/run_dbqa.py
@@ -41,10 +41,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append((src, tgt, seg, qid))
     return dataset
diff --git a/finetune/run_ner.py b/finetune/run_ner.py
index 935fceb..7042be2 100644
--- a/finetune/run_ner.py
+++ b/finetune/run_ner.py
@@ -110,11 +110,11 @@ def read_dataset(args, path):
                 src = src[: args.seq_length]
                 tgt = tgt[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                tgt.append(args.labels_num - 1)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                tgt += [args.labels_num - 1] * (args.seq_length - len(tgt))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append([src, tgt, seg])
     return dataset
diff --git a/finetune/run_regression.py b/finetune/run_regression.py
index 264c783..e4ac283 100644
--- a/finetune/run_regression.py
+++ b/finetune/run_regression.py
@@ -73,10 +73,10 @@ def read_dataset(args, path):
             if len(src) > args.seq_length:
                 src = src[: args.seq_length]
                 seg = seg[: args.seq_length]
-            PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
+            if len(src) < args.seq_length:
+                PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
             dataset.append((src, tgt, seg))
     return dataset
diff --git a/finetune/run_text2text.py b/finetune/run_text2text.py
index 8712d6e..a7553ec 100755
--- a/finetune/run_text2text.py
+++ b/finetune/run_text2text.py
@@ -95,13 +95,13 @@ def read_dataset(args, path):
                 tgt_seg = tgt_seg[: args.tgt_seq_length]
             tgt_out = tgt_in[1:] + [PAD_ID]
-            while len(src) < args.seq_length:
-                src.append(PAD_ID)
-                seg.append(0)
-            while len(tgt_in) < args.tgt_seq_length:
-                tgt_in.append(PAD_ID)
-                tgt_out.append(PAD_ID)
-                tgt_seg.append(0)
+            if len(src) < args.seq_length:
+                src += [PAD_ID] * (args.seq_length - len(src))
+                seg += [0] * (args.seq_length - len(seg))
+            if len(tgt_in) < args.tgt_seq_length:
+                tgt_in += [PAD_ID] * (args.tgt_seq_length - len(tgt_in))
+                tgt_out += [PAD_ID] * (args.tgt_seq_length - len(tgt_out))
+                tgt_seg += [0] * (args.tgt_seq_length - len(tgt_seg))
             dataset.append((src, tgt_in, tgt_out, seg, tgt_seg))
diff --git a/tencentpretrain/utils/dataloader.py b/tencentpretrain/utils/dataloader.py
index bcac7ae..7ffc45b 100755
--- a/tencentpretrain/utils/dataloader.py
+++ b/tencentpretrain/utils/dataloader.py
@@ -75,8 +75,7 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 if len(ins) == 4:
                     src.append(src_single)
@@ -125,8 +124,7 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 if len(ins) == 3:
                     src.append(src_single)
@@ -177,8 +175,7 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single[:-1])
                 tgt.append(src_single[1:])
                 seg.append([1] * ins[1][0] + [0] * (len(src_single) - 1 - ins[1][0]))
@@ -212,10 +209,9 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
                 tgt_forward_single, tgt_backward_single = ins[1], ins[2]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    tgt_forward_single.append(self.vocab.get(PAD_TOKEN))
-                    tgt_backward_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                tgt_forward_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                tgt_backward_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single)
                 tgt_forward.append(tgt_forward_single)
                 tgt_backward.append(tgt_backward_single)
@@ -247,11 +243,9 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 tgt_single, pad_num = ins[1]
-                for _ in range(pad_num):
-                    tgt_single.append(self.vocab.get(PAD_TOKEN))
+                tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single)
                 tgt_in.append(tgt_single[:-1])
@@ -289,8 +283,7 @@ def __iter__(self):
             for _, ins in enumerate(instances):
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 if len(ins) == 3:
                     tgt_single = ins[1]
@@ -376,11 +369,9 @@ def __iter__(self):
             for _, ins in enumerate(instances):
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 tgt_single, pad_num = ins[1]
-                for _ in range(pad_num):
-                    tgt_single.append(self.vocab.get(PAD_TOKEN))
+                tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src_single, _ = mask_seq(src_single, self.tokenizer, self.whole_word_masking, self.span_masking, self.span_geo_prob, self.span_max_length)
@@ -442,9 +433,8 @@ def __iter__(self):
                 elif len(seg_pos_single) == 2:
                     seg_single = [1] * seg_pos_single[0] + [2] * seg_pos_single[1]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    seg_single.append(0)
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                seg_single += [0] * pad_num
                 src.append(src_single)
                 tgt.append(ins[1])
@@ -474,9 +464,8 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
                 tgt_single = ins[1]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    tgt_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single)
                 tgt.append(tgt_single)
                 seg.append([1] * ins[2][0] + [2] * (ins[2][1] - ins[2][0]) + [0] * (len(src_single) - ins[2][1]))
@@ -515,9 +504,8 @@ def __iter__(self):
                 elif len(seg_pos_single) == 2:
                     seg_single = [1] * seg_pos_single[0] + [2] * seg_pos_single[1]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
-                    seg_single.append(0)
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+                seg_single += [0] * pad_num
                 seg.append(seg_single)
                 if len(ins) == 4 :
@@ -643,8 +631,7 @@ def __iter__(self):
            for ins in instances:
                 src_text_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_text_single.append(self.vocab.get(PAD_TOKEN))
+                src_text_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src_text_single, tgt_mlm_single = mask_seq(src_text_single, self.tokenizer, self.whole_word_masking, self.span_masking, self.span_geo_prob, self.span_max_length)
                 src_text.append(src_text_single)
                 masked_words_num += len(tgt_mlm_single)
@@ -709,8 +696,7 @@ def __iter__(self):
             seg_image = []
             for ins in instances:
                 src_text_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_text_single.append(self.vocab.get(PAD_TOKEN))
+                src_text_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src_text.append(src_text_single)
                 seg_text.append([1] * ins[1][0] + [0] * pad_num)
@@ -788,8 +774,7 @@ def __iter__(self):
             for ins in instances:
                 text_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    text_single.append(self.vocab.get(PAD_TOKEN))
+                text_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 waveform, _ = torchaudio.load(ins[2])  # waveform, sample_rate
                 waveform = waveform * (2 ** 15)  # Kaldi compliance: 16-bit signed integers
@@ -924,8 +909,7 @@ def __iter__(self):
                 image = self.transform(image)
                 image_tokens = [i + self.vocab_bias for i in image_tokenize(self.vqgan, image)]
                 src_single.extend(image_tokens)
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 seg_single = [1] * ins[1][0] + [2] * len(image_tokens) + [0] * pad_num
                 src.append(src_single)
                 tgt.append(src_single[1:] + [self.vocab.get(SEP_TOKEN)])
@@ -954,8 +938,7 @@ def __iter__(self):
             for ins in instances:
                 src_single, pad_num = ins[0]
-                for _ in range(pad_num):
-                    src_single.append(self.vocab.get(PAD_TOKEN))
+                src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
                 src.append(src_single[:-1])
                 tgt.append(src_single[1:])
                 if ins[1][0] > 0:
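
Every hunk in this patch applies the same refactor: padding that was previously done one element at a time in a while/for loop is replaced by a single list-multiplication extension, guarded by a length check. The sketch below is not part of the patch; PAD_ID, SEQ_LENGTH, and the helper names are illustrative stand-ins (the real scripts look the pad id up via the tokenizer and take the length from args). It shows that both forms produce identical output, and that "list += ..." mutates the list in place like extend, so any code holding a reference to the padded list sees the same result as with the old append loop.

# Illustrative sketch of the padding refactor applied throughout this patch.
# PAD_ID, SEQ_LENGTH, and the helper names are assumptions for the demo only.

PAD_ID = 0
SEQ_LENGTH = 8


def pad_with_loop(src, seg):
    # Old form: append one pad element per missing position.
    while len(src) < SEQ_LENGTH:
        src.append(PAD_ID)
        seg.append(0)
    return src, seg


def pad_with_list_multiplication(src, seg):
    # New form: extend each list once with a pre-built block of pad values.
    # "+=" on a list mutates it in place (like extend), matching append's behaviour.
    if len(src) < SEQ_LENGTH:
        src += [PAD_ID] * (SEQ_LENGTH - len(src))
        seg += [0] * (SEQ_LENGTH - len(seg))
    return src, seg


if __name__ == "__main__":
    assert pad_with_loop([101, 2023, 102], [1, 1, 1]) == \
        pad_with_list_multiplication([101, 2023, 102], [1, 1, 1])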