tokenizer.py
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
import pandas as pd
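# `tokenizer` and `model` are referenced below but never defined in this file;
# they are presumably created elsewhere in the project. A minimal sketch of
# what they might look like, assuming a T5-style seq2seq checkpoint
# ("t5-small" is a placeholder, not necessarily the checkpoint actually used):
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")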
# Load the training CSV and drop the stray index column written by pandas
new_df = pd.read_csv('train.csv').drop('Unnamed: 0', axis=1)
# Shuffle each split: rows 0-24999 for training, 25000-29999 for evaluation
train_df = new_df[0:25000].sample(frac=1)
eval_df = new_df[25000:30000].sample(frac=1)
del new_df  # free the full frame once the splits are taken
dataset_train = Dataset.from_pandas(train_df)
dataset_eval = Dataset.from_pandas(eval_df)
data_dict_dataset = DatasetDict({'train': dataset_train, 'eval': dataset_eval})
# Maximum token length for the tokenized targets
max_target_length = 3

def preprocess_function(examples):
    # Tokenize the source texts, truncating to the model's 512-token limit
    inputs = list(examples["source"])
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True)
    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target"], max_length=max_target_length, truncation=True, padding=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# Tokenize both splits in batches and expose only the columns the model needs
tokenized_datasets = data_dict_dataset.map(preprocess_function, batched=True)
tokenized_datasets.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
# The collator dynamically pads each batch to its longest sequence
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
train_dataloader = DataLoader(
tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
tokenized_datasets["eval"], batch_size=8, collate_fn=data_collator
)
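# Seq2SeqTrainingArguments and Seq2SeqTrainer are imported above but not used
# in this file. A minimal sketch of how they could be wired up with the
# collator and tokenized splits, as an alternative to the manual DataLoaders;
# every hyperparameter below is an illustrative assumption:
training_args = Seq2SeqTrainingArguments(
    output_dir="out",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
# trainer.train()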