# rnn.yml
seed: 1
cuda: 0 # use_gpu
env:
  env_type: meta
  env_name: PointRobotSparse-v0
  max_rollouts_per_task: 2 # k. total steps: 120
  num_tasks: 100
  num_train_tasks: 80
  num_eval_tasks: 20
train:
  # sample complexity: BAMDP horizon * (num_init_rollouts_pool * num_train_tasks
  #   + num_iters * num_tasks_sample * num_rollouts_per_iter)
  # original 4k iters -> 12M steps
  # 1.5k iters -> 4.55M steps (17 gpu hrs, 31 cpu hrs)
  # original rl training steps: num_iters * updates_per_iter = 1.5M
  # now made the same as the number of env steps
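  # illustrative estimate (assuming num_tasks_sample = 1, which is not set in
  # this file, and a BAMDP horizon of 120, i.e. the total steps noted above):
  #   env steps ~= 120 * (400 * 80 + 500 * 1 * 25) = 120 * 44500 = 5.34M
  #   rl gradient updates = num_iters * num_updates_per_iter = 500 * 1000 = 0.5M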
  num_iters: 500 # 38000 # number of meta-training iterations
  num_init_rollouts_pool: 400 # 80 # rollouts collected before training
  num_rollouts_per_iter: 25 # 1
  buffer_size: 4e6 # or 1e6? old belief
  num_updates_per_iter: 1000
  batch_size: 32 # to tune based on sampled_seq_len
  sampled_seq_len: -1 # -1 means use full sequences; otherwise a positive integer
  sample_weight_baseline: 0.0
eval:
  eval_stochastic: false # also evaluate the stochastic policy
  log_interval: 5 # 10 # number of iters
  save_interval: 100 # -1
  log_tensorboard: true
policy:
  separate: True
  seq_model: gru # [lstm, gru]
  algo_name: sac # [td3, sac]

  action_embedding_size: 0 # no action input
  observ_embedding_size: 32
  reward_embedding_size: 8
  rnn_hidden_size: 128

  dqn_layers: [128, 128]
  policy_layers: [128, 128]
  lr: 0.0003
  gamma: 0.9
  tau: 0.005

  sac:
    entropy_alpha: 0.2
    automatic_entropy_tuning: true
    alpha_lr: 0.0003
  td3:
    ## since we normalize the action space to [-1, 1],
    ## the noise std is an absolute value
    exploration_noise: 0.1
    target_noise: 0.2
    target_noise_clip: 0.5
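Below is a minimal sketch of how a config like this could be loaded and a few of its fields read, assuming PyYAML's safe_load; the filename, variable names, and printout are illustrative, not part of the repository. Note that PyYAML's YAML 1.1 resolver parses the bare literal 4e6 as a string, so buffer_size needs an explicit cast.

import yaml

# Minimal sketch (assumed usage, not from the repository): load rnn.yml and
# access a few of the fields defined above.
with open("rnn.yml") as f:
    cfg = yaml.safe_load(f)

env_name = cfg["env"]["env_name"]                    # "PointRobotSparse-v0"
seq_model = cfg["policy"]["seq_model"]               # "gru"
rnn_hidden_size = cfg["policy"]["rnn_hidden_size"]   # 128

# PyYAML's YAML 1.1 loader reads the bare "4e6" (no decimal point) as a string,
# so cast it before using it as a replay-buffer capacity.
buffer_size = int(float(cfg["train"]["buffer_size"]))  # 4000000

print(env_name, seq_model, rnn_hidden_size, buffer_size)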