-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdev.yaml
50 lines (45 loc) · 1.05 KB
/
dev.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Training corpora. Each list entry is one mapping with `args`, `kwargs` and
# `weight`; the nested keys must be indented under the `-` item, otherwise
# they parse as unrelated top-level keys.
# NOTE(review): `args`/`kwargs` are presumably forwarded to the dataset
# loader and `weight` is presumably the relative mixing weight — confirm
# against the consuming code.
# Trailing comments record the corpus size and the weight-to-size ratio:
# weight = size in GB x ratio (e.g. 0.5 GB x 6 = 3, 6.3 GB x 6 = 37.8).
dataset:
  - args: [c4, en]  # 305 GB
    kwargs: {split: train}
    weight: 305  # 1:1
  # Disabled (commented-out) corpora:
  # - args: [wikitext, wikitext-103-raw-v1]  # 0.5 GB
  #   kwargs: {split: train}
  #   weight: 3  # 6:1
  # - args: [bookcorpusopen]  # 6.3 GB
  #   kwargs: {split: train}
  #   weight: 37.8  # 6:1
  # - args: [multi_news]  # 0.7 GB
  #   kwargs: {split: train, key: document}
  #   weight: 4.2  # 6:1
# Tokenizer specification. `args` and `kwargs` are nested under `tokenizer`
# so they do not collide with same-named keys elsewhere in the file.
# NOTE(review): presumably the positional / keyword arguments handed to the
# tokenizer loader — confirm against the consuming code.
tokenizer:
  args: [bert-base-uncased]
  kwargs: {}  # explicitly empty mapping
# Input pipeline settings.
data:
  length: 128  # NOTE(review): presumably sequence length in tokens — confirm
  batch_size: 32
  # Written as plain digits (previously 25_000 and 1_000): digit-group
  # underscores are integers only under YAML 1.1; YAML 1.2 core-schema
  # parsers load them as strings.
  shuffle_buffer_size: 25000
  per_dataset_shuffle_buffer_size: 1000
# Model architecture hyperparameters.
model:
  num_layers: 4
  # 2^15. Plain digits (previously 32_768): underscore-grouped integers are
  # YAML 1.1 only and load as strings under the YAML 1.2 core schema.
  # NOTE(review): presumably must be >= the tokenizer's vocabulary — confirm.
  vocabulary_size: 32768
  embedding_dim: 128
  model_dim: 256
  num_heads: 4
  pos_emb_portion: 0.5
  hidden_dim: 768  # 3 x model_dim
  dropout: 0.0
# Optimizer and learning-rate schedule settings.
optimizer:
  # NOTE(review): looks like a list of [step, accumulation] pairs — confirm.
  gradient_accumulation_steps: [[0, 1]]
  weight_decay: 0.01
  lr_min: 0.00006  # 6e-5, i.e. lr_max / 10
  lr_max: 0.0006  # 6e-4
  # Plain digits (previously 10_000): underscore-grouped integers are
  # YAML 1.1 only and load as strings under the YAML 1.2 core schema.
  lr_decay_steps: 10000
  lr_warmup_steps: 100
  gradient_clip_norm: 1.0
  adam_b1: 0.9
  adam_b2: 0.995
  # 1e-8. Keep the decimal form: a bare `1e-8` (no dot) is read as a string
  # by YAML 1.1 parsers.
  adam_eps: 0.00000001
# Mixed-precision training settings.
# NOTE(review): written as a top-level section; confirm it is not meant to
# be nested under another section.
mixed_precision:
  enable: true
  # NOTE(review): presumably the initial loss scale is 2^8 = 256 — confirm.
  initial_scale_log2: 8
  scale_period: 1000