# -*- coding: utf-8 -*-
"""amharicgpt.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1gt6Fvf-W2-WA0o4_9fX6VxFMiVFCGpKl
"""
from datasets import load_dataset

# Load the raw Amharic corpus (one text sample per line) and hold out 20% for evaluation.
dataset_dict = load_dataset('text', data_files='/kaggle/input/amharic/dataset.txt')
dataset = dataset_dict['train'].train_test_split(test_size=0.2)
print(dataset)
from transformers import AutoTokenizer

context_length = 1024
tokenizer = AutoTokenizer.from_pretrained("dagim/amharic_tokenizer")


def tokenize(element):
    # Chunk each text into pieces of at most `context_length` tokens;
    # `return_overflowing_tokens=True` turns the overflow into extra samples.
    return tokenizer(
        element["text"],
        truncation=True,
        padding=True,
        max_length=context_length,
        return_overflowing_tokens=True,
    )


tokenized_datasets = dataset.map(
    tokenize, batched=True, remove_columns=dataset['train'].column_names
)
print(tokenized_datasets)
print(f"Tokenizer vocabulary size: {len(tokenizer)}")
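# Optional sanity check (not in the original notebook): round-trip a short
# Amharic sentence through the tokenizer to confirm it encodes and decodes
# cleanly before training. The sample sentence is only an illustration.
sample = "ሰላም ለዓለም"
sample_ids = tokenizer(sample)["input_ids"]
print(sample_ids)
print(tokenizer.decode(sample_ids))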
from transformers import GPT2LMHeadModel, AutoConfig

# Build a GPT-2 Medium configuration from scratch, resized to the Amharic
# tokenizer's vocabulary and to the chosen context length.
config = AutoConfig.from_pretrained(
    # Change the model size here (e.g. "gpt2", "gpt2-medium", "gpt2-large").
    "gpt2-medium",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size / 1000**2:.1f}M parameters")
from transformers import DataCollatorForLanguageModeling

# Causal language modelling (mlm=False). The collator pads batches, so the
# tokenizer needs an explicit pad token.
tokenizer.pad_token = '[PAD]'
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Sanity-check the collator on a few training samples.
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")
# Log in to the Hugging Face Hub so the Trainer can push checkpoints.
from huggingface_hub import notebook_login

notebook_login()
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="AmharicGPT-Medium",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,  # effective train batch size: 4 * 8 = 32 per device
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()
trainer.push_to_hub()
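# Optional follow-up (not in the original notebook): a minimal generation
# sketch using the in-memory model and tokenizer from the run above. To load
# the pushed checkpoint later instead, pass its Hub repo id (which depends on
# your account name) to the pipeline.
from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("ሰላም", max_new_tokens=50, do_sample=True, top_p=0.95)[0]["generated_text"])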