# @Time    : 2023/3/25 18:36
# @Author  : tk
import copy
from enum import Enum

import numpy as np
from transformers import PreTrainedTokenizer


class DataStrategy(Enum):
    tunction = 1
    slidding = 2
def build_template_chatyuan(query, answer=None, prefix=None, history=None):
    prompt = prefix or ''
    if history is not None:
        for q, a in history:
            prompt += "用户:{}小元:{}".format(q, a)
    prompt += "用户:{}小元:".format(query)
    if answer is not None:
        prompt += answer
    return prompt
def build_template_default(query, answer=None, prefix=None, history=None):
    prompt = prefix or ''
    if history is not None:
        for q, a in history:
            prompt += "User: {}\nAssistant:{}".format(q, a)
    prompt += "User: {}\nAssistant:".format(query)
    if answer is not None:
        prompt += answer
    return prompt
def build_template_tiger(query, answer=None, prefix=None, history=None):
    prompt = prefix or ''
    tok_ins = "\n\n### Instruction:\n"
    tok_res = "\n\n### Response:\n"
    if history is not None:
        for q, a in history:
            prompt += "{}{}{}{}".format(tok_ins, q, tok_res, a)
    prompt += "{}{}{}".format(tok_ins, query, tok_res)
    if answer is not None:
        prompt += answer
    return prompt
# Select the prompt template
build_template = build_template_chatyuan
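
# Illustrative example (not part of the original file): with
# history = [("你好", "你好,我是小元")] and query = "今天天气怎么样",
# build_template_chatyuan returns
#   "用户:你好小元:你好,我是小元用户:今天天气怎么样小元:"
# i.e. past turns are concatenated before the current query, and the trailing
# "小元:" cues the model to generate the next answer.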
class TokenTunction:
    @classmethod
    def final(cls, a_ids, b_ids, max_seq_length):
        seqlen = len(a_ids)
        decoder_seqlen = len(b_ids)
        attention_mask = [1] * seqlen
        decoder_attention_mask = [1] * decoder_seqlen
        # pad encoder inputs to max_seq_length
        pad_len = max_seq_length - seqlen
        if pad_len > 0:
            a_ids += [0] * pad_len
            attention_mask += [0] * pad_len
        # pad decoder inputs to max_seq_length
        pad_len = max_seq_length - decoder_seqlen
        if pad_len > 0:
            b_ids += [0] * pad_len
            decoder_attention_mask += [0] * pad_len
        # labels are the decoder ids shifted left by one; padded positions are masked with -100
        labels = np.asarray(copy.deepcopy(b_ids[1:]) + [-100], dtype=np.int64)
        labels[decoder_seqlen - 1:] = -100
        d = {
            'input_ids': np.asarray(a_ids, dtype=np.int32),
            'attention_mask': np.asarray(attention_mask, dtype=np.int32),
            'seqlen': np.asarray(seqlen, dtype=np.int32),
            'decoder_input_ids': np.asarray(b_ids, dtype=np.int32),
            'decoder_attention_mask': np.asarray(decoder_attention_mask, dtype=np.int32),
            'decoder_seqlen': np.asarray(decoder_seqlen, dtype=np.int32),
            'labels': np.asarray(labels, dtype=np.int64)
        }
        return d
    @classmethod
    def process(cls, tokenizer: PreTrainedTokenizer, config, sup, max_seq_length, examples):
        ds = []
        prefix, examples = examples
        for sid, (q, a) in enumerate(examples):
            # encode the prompt (with full conversation history) and the answer separately
            a_ids = tokenizer.encode(text=build_template(q, history=examples[:sid]), add_special_tokens=False)
            b_ids = tokenizer.encode(text=a, add_special_tokens=False)
            # truncate the prompt from the left and the answer from the right
            while len(a_ids) > max_seq_length:
                a_ids.pop(0)
            while len(b_ids) > max_seq_length - 2:
                b_ids.pop(-1)
            b_ids = [config.decoder_start_token_id] + b_ids + [config.eos_token_id]
            ds.append(cls.final(a_ids, b_ids, max_seq_length))
        return ds
class TokenSlidding:
    @classmethod
    def process(cls, tokenizer: PreTrainedTokenizer, config, stride, sup, max_seq_length, examples):
        ds = []
        prefix, examples = examples
        for sid, (q, a) in enumerate(examples):
            a_ids = tokenizer.encode(text=build_template(q, history=examples[:sid]), add_special_tokens=False)
            b_ids = tokenizer.encode(text=a) + [config.eos_token_id]
            input_ids_all = a_ids + b_ids
            # slide a window of max_seq_length - 1 tokens over the whole sequence, advancing by `stride`
            pos = 0
            while pos < len(input_ids_all):
                input_ids = [config.bos_token_id] + input_ids_all[pos: pos + max_seq_length - 1]
                pos += stride
                ds.append({
                    'input_ids': np.asarray(input_ids, dtype=np.int32),
                    'seqlen': np.asarray(len(input_ids), dtype=np.int32)
                })
        return ds
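

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original file. It assumes an
    # encoder-decoder (T5-style) tokenizer/config whose config defines
    # decoder_start_token_id and eos_token_id; the checkpoint name below is
    # only a placeholder.
    from transformers import AutoConfig, AutoTokenizer

    model_name = "ClueAI/ChatYuan-large-v2"  # assumed checkpoint, adjust as needed
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name)

    # `examples` is a (prefix, [(query, answer), ...]) tuple, as expected by process()
    sample = ("", [("你好", "你好,我是小元。"),
                   ("今天天气怎么样", "抱歉,我无法获取实时天气。")])
    features = TokenTunction.process(tokenizer, config, sup=True,
                                     max_seq_length=512, examples=sample)
    for f in features:
        print(f['seqlen'], f['decoder_seqlen'], f['input_ids'].shape)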