default_config_acrobot_syn_env.yaml
env_name: !!str Acrobot-v1
device: !!str cpu # torch device (cuda:0 or cpu)
render_env: !!bool False # render environment
agents:
  gtn:
    mode: !!str 'multi' # 'single': run on a single PC / 'multi': run on multiple PCs
    max_iterations: !!int 200 # maximum number of GTN iterations
    num_threads_per_worker: !!int 1 # how many pytorch/OMP threads per worker
    num_workers: !!int 16 # size of the population
    noise_std: !!float 0.0114 # standard deviation of the noise vector
    step_size: !!float 0.727 # reptile step size
    nes_step_size: !!bool False # when set to true, divide the step size by the number of workers
    mirrored_sampling: !!bool True # use normal or mirrored sampling
    num_grad_evals: !!int 1 # how often to evaluate each gradient
    grad_eval_type: !!str mean # mean or minmax
    weight_decay: !!float 0.0 # weight decay
    time_mult: !!float 3 # maximum allocated time per iteration as a multiple of the average iteration time
    time_max: !!float 600 # maximum allocated time for the first iteration
    time_sleep_master: !!float 0.2 # cycle sleeping time when waiting for data
    time_sleep_worker: !!float 2 # cycle sleeping time when waiting for data
    score_transform_type: !!int 3 # which score transformation to use (0-7)
    quit_when_solved: !!bool False # stop training once the environment has been solved?
    synthetic_env_type: !!int 0 # 0: virtual env / 1: reward env
    unsolved_weight: !!float 10000 # penalty weight for all solvable environments if they haven't been solved yet
    agent_name: !!str DDQN # which RL agent to use for meta-training
  ddqn_vary:
    vary_hp: !!bool True # vary hyperparameters of the underlying DDQN algorithm?
  ddqn:
    train_episodes: !!int 1000 # maximum number of episodes to train
    test_episodes: !!int 10 # maximum number of episodes to test
    init_episodes: !!int 20 # number of episodes to fill the replay buffer
    batch_size: !!int 149 # batch size when running a policy update step
    gamma: !!float 0.991 # discount factor
    lr: !!float 0.00222 # learning rate
    tau: !!float 0.0209 # target network update rate
    eps_init: !!float 0.904 # initial random-action probability
    eps_min: !!float 0.0471 # final random-action probability
    eps_decay: !!float 0.899 # random-action decay factor
    rb_size: !!int 100000 # size of the replay buffer
    same_action_num: !!int 1 # how often to repeat the same action consecutively
    activation_fn: !!str leakyrelu # activation function for actor/critic ('tanh', 'relu', 'leakyrelu' or 'prelu')
    hidden_size: !!int 112 # size of the actor/critic hidden layer
    hidden_layer: !!int 1 # number of hidden layers
    print_rate: !!int 1 # update rate of avg meters
    early_out_num: !!int 10 # number of recent training episodes over which the early-out criterion is evaluated
    early_out_virtual_diff: !!float 1e-2 # performance difference for an early out for virtual envs
  icm:
    lr: !!float 1e-4 # learning rate
    beta: !!float 0.2 # weight used to trade off action vs state prediction in the ICM loss function
    eta: !!float 0.5 # weight used to compute intrinsic reward from loss value
    feature_dim: !!int 32 # feature dimension of features model
    hidden_size: !!int 128 # size of all ICM sub-network layers
  td3_discrete_vary:
    train_episodes: !!int 1000 # maximum number of episodes to optimize
    test_episodes: !!int 10 # maximum number of episodes to evaluate
    init_episodes: !!int 1 # number of episodes to fill the replay buffer
    batch_size: !!int 122 # batch size when running a policy update step
    gamma: !!float 0.9989 # discount factor
    lr: !!float 0.0017496 # learning rate
    tau: !!float 0.0724303 # target network update rate
    policy_delay: !!int 1 # frequency of delayed policy updates
    rb_size: !!int 1000000 # size of the replay buffer
    same_action_num: !!int 1 # how often to repeat the same action consecutively
    activation_fn: !!str tanh # activation function for actor/critic ('tanh', 'relu', 'leakyrelu' or 'prelu')
    hidden_size: !!int 510 # size of the actor/critic hidden layer
    hidden_layer: !!int 2 # number of hidden layers
    action_std: !!float 0.037275 # action noise standard deviation
    policy_std: !!float 0.2225286 # policy noise standard deviation
    policy_std_clip: !!float 0.5 # clipping value for the policy noise
    print_rate: !!int 1 # update rate of avg meters
    early_out_num: !!int 1 # number of recent training episodes over which the early-out criterion is evaluated
    early_out_virtual_diff: !!float 1e-2 # performance difference for an early out for virtual envs
    gumbel_softmax_temp: !!float 2.3076235 # controls the level of soft-discretization of the continuous distribution
    gumbel_softmax_hard: !!bool True # whether the returned samples should be discretized as one-hot vectors
    vary_hp: !!bool False # vary hyperparameters of the underlying TD3 algorithm?
envs:
  Acrobot-v1:
    solved_reward: !!float -100 # used for early out in RL agent training
    max_steps: !!int 500 # maximum number of steps per episode
    activation_fn: !!str prelu # activation function of the virtual environment
    hidden_size: !!int 167 # size of the hidden layer of the virtual environment
    hidden_layer: !!int 1 # number of hidden layers of the virtual environment
    info_dim: !!int 0 # additional information dimension from the step function
    reward_env_type: !!int 0 # type of reward shaping function
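
How this file is consumed is not shown here. As a minimal sketch, assuming the config is read with PyYAML, the explicit `!!str`/`!!int`/`!!float`/`!!bool` tags resolve to ordinary Python types and the nested sections become plain dictionaries. The file path and variable names below are illustrative, not the repository's actual loader.

```python
# Minimal sketch (not the repository's loader): read the config with PyYAML
# and look up a few of the hyperparameters defined above.
import yaml

with open("default_config_acrobot_syn_env.yaml") as f:  # path is an assumption
    config = yaml.safe_load(f)  # !!str/!!int/!!float/!!bool resolve to str/int/float/bool

gtn_cfg = config["agents"]["gtn"]
ddqn_cfg = config["agents"]["ddqn"]
env_cfg = config["envs"][config["env_name"]]

print(gtn_cfg["num_workers"])    # 16 workers in the GTN population
print(ddqn_cfg["lr"])            # 0.00222
print(env_cfg["solved_reward"])  # -100.0
```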
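The `eps_init`, `eps_min` and `eps_decay` values in the `ddqn` block describe an epsilon-greedy exploration schedule. The sketch below assumes a multiplicative per-episode decay floored at `eps_min`; that is a common convention, not something the config itself confirms.

```python
# Hedged sketch of an epsilon-greedy schedule built from the ddqn values above.
# Assumption: epsilon decays multiplicatively once per episode and is clipped
# at eps_min; the repository may decay per step or use a different rule.
def epsilon_schedule(episode, eps_init=0.904, eps_min=0.0471, eps_decay=0.899):
    return max(eps_min, eps_init * eps_decay ** episode)

for ep in (0, 5, 10, 25):
    print(ep, round(epsilon_schedule(ep), 4))
```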