# default_config_cartpole_syn_env.yaml
env_name: !!str CartPole-v0
device: !!str cpu # torch device (cuda:0 or cpu)
render_env: !!bool False # render environment
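
A minimal sketch of how these top-level settings might be read, assuming PyYAML and PyTorch are installed; the file path and variable names are illustrative and not taken from this repository's code:

import yaml
import torch

with open("default_config_cartpole_syn_env.yaml") as f:   # path is an assumption
    config = yaml.safe_load(f)                             # resolves the !!str/!!int/!!float/!!bool tags

device = torch.device(config["device"])                    # "cpu" here, "cuda:0" on a GPU machine
print(config["env_name"], device, config["render_env"])
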
agents:
  gtn:
    mode: !!str 'multi'  # 'single': run on a single PC / 'multi': run on multiple PCs
    max_iterations: !!int 200  # maximum number of GTN iterations
    num_threads_per_worker: !!int 1  # number of pytorch/OMP threads per worker
    num_workers: !!int 16  # size of the population
    noise_std: !!float 0.0124  # standard deviation of the noise vector
    step_size: !!float 0.148  # reptile step size
    nes_step_size: !!bool False  # if True, divide the step size by the number of workers
    mirrored_sampling: !!bool True  # use mirrored instead of plain sampling
    num_grad_evals: !!int 1  # how often to evaluate each gradient
    grad_eval_type: !!str mean  # mean or minmax
    weight_decay: !!float 0.0  # weight decay
    time_mult: !!float 3  # maximum allocated time as a multiple of the average iteration time
    time_max: !!float 600  # maximum allocated time for the first iteration
    time_sleep_master: !!float 0.2  # sleep time per polling cycle while the master waits for data
    time_sleep_worker: !!float 2  # sleep time per polling cycle while a worker waits for data
    score_transform_type: !!int 3  # score transform type (0-7)
    quit_when_solved: !!bool False  # stop training once the environment has been solved?
    synthetic_env_type: !!int 0  # 0: virtual env / 1: reward env
    unsolved_weight: !!float 10000  # penalty weight for all solvable environments that have not been solved yet
    agent_name: !!str DDQN  # which RL agent to use for meta-training
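
The gtn block configures the population-based outer loop. The following is a rough sketch of how mirrored sampling and a step-size update over a noisy population could fit together under these hyperparameters; the score function, the parameter vector, and the exact update rule are placeholders rather than the repository's implementation (in particular, the eight score_transform_type options are not reproduced here).

import numpy as np

rng = np.random.default_rng(0)

def score(params):
    # Placeholder for a worker's evaluation result (higher is better).
    return -np.sum(params ** 2)

theta = rng.normal(size=10)            # synthetic-environment parameters (illustrative)
num_workers, noise_std, step_size = 16, 0.0124, 0.148
mirrored_sampling, nes_step_size = True, False

for iteration in range(200):           # max_iterations
    n = num_workers // 2 if mirrored_sampling else num_workers
    eps = rng.normal(size=(n, theta.size)) * noise_std
    if mirrored_sampling:
        eps = np.concatenate([eps, -eps])   # pair every perturbation with its negation
    scores = np.array([score(theta + e) for e in eps])
    # Stand-in for the score_transform_type options: normalize scores to weights.
    weights = (scores - scores.mean()) / (scores.std() + 1e-8)
    step = step_size / (num_workers if nes_step_size else 1)
    theta = theta + step * (weights @ eps) / num_workers
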
  ddqn:
    train_episodes: !!int 1000  # maximum number of episodes to train
    test_episodes: !!int 10  # maximum number of episodes to test
    init_episodes: !!int 1  # number of episodes to fill the replay buffer
    batch_size: !!int 199  # batch size when running a policy update step
    gamma: !!float 0.988  # discount factor
    lr: !!float 0.000304  # learning rate
    tau: !!float 0.00848  # target network update rate
    eps_init: !!float 0.809  # initial random action percentage
    eps_min: !!float 0.0371  # final random action percentage
    eps_decay: !!float 0.961  # random action decay factor
    rb_size: !!int 100000  # size of the replay buffer
    same_action_num: !!int 1  # how often to repeat the same action consecutively
    activation_fn: !!str tanh  # activation function of the Q-network ('tanh', 'relu', 'leakyrelu' or 'prelu')
    hidden_size: !!int 57  # size of the Q-network hidden layer
    hidden_layer: !!int 1  # number of hidden layers
    print_rate: !!int 10  # update rate of the average meters
    early_out_num: !!int 10  # number of recent training episodes considered for an early out
    early_out_virtual_diff: !!float 1e-2  # performance difference triggering an early out on virtual envs
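
The eps_* and tau entries above drive the exploration schedule and the target-network update. A small sketch of the usual epsilon-greedy decay and Polyak averaging for a CartPole-sized Q-network follows; the network layout and the exact point at which the decay and the soft update are applied are assumptions, not the repository's code.

import copy
import torch
import torch.nn as nn

q_net = nn.Sequential(nn.Linear(4, 57), nn.Tanh(), nn.Linear(57, 2))   # CartPole: 4 obs dims, 2 actions
target_net = copy.deepcopy(q_net)

eps, eps_min, eps_decay, tau = 0.809, 0.0371, 0.961, 0.00848

def select_action(state):
    # Epsilon-greedy action selection.
    if torch.rand(1).item() < eps:
        return torch.randint(2, (1,)).item()
    with torch.no_grad():
        return q_net(state).argmax().item()

def end_of_episode_updates():
    global eps
    eps = max(eps_min, eps * eps_decay)                      # decay the exploration rate
    for p, tp in zip(q_net.parameters(), target_net.parameters()):
        tp.data.mul_(1 - tau).add_(tau * p.data)             # soft (Polyak) target update
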
  icm:
    lr: !!float 1e-4  # learning rate
    beta: !!float 0.2  # weight trading off action vs. state prediction in the ICM loss
    eta: !!float 0.5  # weight used to compute the intrinsic reward from the loss value
    feature_dim: !!int 32  # feature dimension of the ICM feature encoder
    hidden_size: !!int 128  # size of all ICM sub-network layers
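
The icm block matches the standard Intrinsic Curiosity Module formulation (an inverse and a forward model on top of a feature encoder). A sketch of that formulation with the dimensions above; how this repository wires the module into its agents is not shown, and the network shapes are assumptions.

import torch
import torch.nn as nn
import torch.nn.functional as F

obs_dim, n_actions, feature_dim, hidden_size = 4, 2, 32, 128
beta, eta = 0.2, 0.5

encoder = nn.Sequential(nn.Linear(obs_dim, hidden_size), nn.ReLU(), nn.Linear(hidden_size, feature_dim))
inverse_model = nn.Sequential(nn.Linear(2 * feature_dim, hidden_size), nn.ReLU(), nn.Linear(hidden_size, n_actions))
forward_model = nn.Sequential(nn.Linear(feature_dim + n_actions, hidden_size), nn.ReLU(), nn.Linear(hidden_size, feature_dim))

def icm_loss_and_reward(state, next_state, action):
    phi, phi_next = encoder(state), encoder(next_state)
    action_onehot = F.one_hot(action, n_actions).float()
    pred_action = inverse_model(torch.cat([phi, phi_next], dim=-1))       # inverse model: predict the action
    pred_phi_next = forward_model(torch.cat([phi, action_onehot], dim=-1))  # forward model: predict next features
    inverse_loss = F.cross_entropy(pred_action, action)
    forward_loss = 0.5 * (pred_phi_next - phi_next.detach()).pow(2).sum(dim=-1).mean()
    loss = (1 - beta) * inverse_loss + beta * forward_loss                # beta trades off the two terms
    intrinsic_reward = eta * 0.5 * (pred_phi_next - phi_next).pow(2).sum(dim=-1).detach()
    return loss, intrinsic_reward
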
  duelingddqn_vary:
    vary_hp: !!bool True  # vary hyperparameters of the underlying DDQN algorithm?
  duelingddqn:
    train_episodes: !!int 1000  # maximum number of episodes to train
    test_episodes: !!int 10  # maximum number of episodes to test
    init_episodes: !!int 1  # number of episodes to fill the replay buffer
    batch_size: !!int 193  # batch size when running a policy update step
    gamma: !!float 0.961  # discount factor
    lr: !!float 0.0091437  # learning rate
    tau: !!float 0.073480  # target network update rate
    eps_init: !!float 0.906  # initial random action percentage
    eps_min: !!float 0.00645  # final random action percentage
    eps_decay: !!float 0.8267  # random action decay factor
    rb_size: !!int 100000  # size of the replay buffer
    same_action_num: !!int 1  # how often to repeat the same action consecutively
    activation_fn: !!str tanh  # activation function of the Q-network ('tanh', 'relu', 'leakyrelu' or 'prelu')
    hidden_size: !!int 61  # size of the Q-network hidden layer
    hidden_layer: !!int 1  # number of hidden layers
    feature_dim: !!int 60  # feature dimension of the shared layer feeding the value/advantage streams
    print_rate: !!int 1  # update rate of the average meters
    early_out_num: !!int 1  # number of recent training episodes considered for an early out
    early_out_virtual_diff: !!float 1e-2  # performance difference triggering an early out on virtual envs
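
A sketch of a dueling Q-network consistent with hidden_size, feature_dim, and hidden_layer above: a shared body produces a feature vector that is split into value and advantage streams. The concrete architecture used by the repository is an assumption.

import torch
import torch.nn as nn

class DuelingQNet(nn.Module):
    # Illustrative dueling architecture: shared body, separate value and advantage heads,
    # combined with the mean-subtracted aggregation.
    def __init__(self, obs_dim=4, n_actions=2, hidden_size=61, feature_dim=60):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(obs_dim, hidden_size), nn.Tanh(),
                                  nn.Linear(hidden_size, feature_dim), nn.Tanh())
        self.value = nn.Linear(feature_dim, 1)
        self.advantage = nn.Linear(feature_dim, n_actions)

    def forward(self, x):
        h = self.body(x)
        v, a = self.value(h), self.advantage(h)
        return v + a - a.mean(dim=-1, keepdim=True)   # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)

q = DuelingQNet()
print(q(torch.zeros(1, 4)).shape)                     # torch.Size([1, 2])
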
  td3_discrete_vary:
    train_episodes: !!int 1000  # maximum number of episodes to optimize
    test_episodes: !!int 10  # maximum number of episodes to evaluate
    init_episodes: !!int 1  # number of episodes to fill the replay buffer
    batch_size: !!int 122  # batch size when running a policy update step
    gamma: !!float 0.9989  # discount factor
    lr: !!float 0.0017496  # learning rate
    tau: !!float 0.0724303  # target network update rate
    policy_delay: !!int 1  # frequency of delayed policy updates
    rb_size: !!int 1000000  # size of the replay buffer
    same_action_num: !!int 1  # how often to repeat the same action consecutively
    activation_fn: !!str tanh  # activation function for actor/critic ('tanh', 'relu', 'leakyrelu' or 'prelu')
    hidden_size: !!int 510  # size of the actor/critic hidden layers
    hidden_layer: !!int 2  # number of hidden layers
    action_std: !!float 0.037275  # action noise standard deviation
    policy_std: !!float 0.2225286  # policy noise standard deviation
    policy_std_clip: !!float 0.5  # clipping range of the policy noise
    print_rate: !!int 1  # update rate of the average meters
    early_out_num: !!int 1  # number of recent training episodes considered for an early out
    early_out_virtual_diff: !!float 1e-2  # performance difference triggering an early out on virtual envs
    gumbel_softmax_temp: !!float 2.3076235  # temperature controlling the soft discretization of the continuous distribution
    gumbel_softmax_hard: !!bool True  # whether the returned samples are discretized as one-hot vectors
    vary_hp: !!bool False  # vary hyperparameters of the underlying TD3 algorithm?
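
td3_discrete_vary points to a TD3 variant for discrete action spaces in which the continuous actor output is discretized with a Gumbel-softmax. A sketch of that discretization using torch.nn.functional.gumbel_softmax with the temperature and hard settings above; how the repository feeds the result into the critic is not shown.

import torch
import torch.nn.functional as F

temp, hard = 2.3076235, True

logits = torch.randn(1, 2, requires_grad=True)       # actor output interpreted as action logits
sample = F.gumbel_softmax(logits, tau=temp, hard=hard)
# With hard=True the sample is one-hot in the forward pass, while the backward pass uses
# the soft (temperature-controlled) relaxation, so the actor remains differentiable.
action = sample.argmax(dim=-1)
print(sample, action)
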
  td3:
    train_episodes: !!int 500  # maximum number of episodes to optimize
    test_episodes: !!int 10  # maximum number of episodes to evaluate
    init_episodes: !!int 1  # number of episodes to fill the replay buffer
    batch_size: !!int 94  # batch size when running a policy update step
    gamma: !!float 0.02693  # discount factor
    lr: !!float 0.04407  # learning rate
    tau: !!float 0.01972  # target network update rate
    policy_delay: !!int 2  # frequency of delayed policy updates
    rb_size: !!int 1000000  # size of the replay buffer
    same_action_num: !!int 1  # how often to repeat the same action consecutively
    activation_fn: !!str relu  # activation function for actor/critic ('tanh', 'relu', 'leakyrelu' or 'prelu')
    hidden_size: !!int 440  # size of the actor/critic hidden layers
    hidden_layer: !!int 2  # number of hidden layers
    action_std: !!float 2.715  # action noise standard deviation
    policy_std: !!float 0.1163  # policy noise standard deviation
    policy_std_clip: !!float 0.5  # clipping range of the policy noise
    print_rate: !!int 1  # update rate of the average meters
    early_out_num: !!int 3  # number of recent training episodes considered for an early out
    early_out_virtual_diff: !!float 1e-1  # performance difference triggering an early out on virtual envs
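
Two TD3-specific entries above, policy_std/policy_std_clip and policy_delay, correspond to target policy smoothing and delayed policy updates. A sketch of both mechanisms with this config's values; the actor and its target here are stand-in modules, not the repository's classes, and the actor gradient step itself is omitted.

import torch
import torch.nn as nn

policy_std, policy_std_clip, policy_delay, tau = 0.1163, 0.5, 2, 0.01972

actor = nn.Linear(3, 1)                  # stand-ins for the actor and its target network
actor_target = nn.Linear(3, 1)
actor_target.load_state_dict(actor.state_dict())

def smoothed_target_action(next_state):
    # Target policy smoothing: add clipped Gaussian noise to the target action.
    a = actor_target(next_state)
    noise = (torch.randn_like(a) * policy_std).clamp(-policy_std_clip, policy_std_clip)
    return (a + noise).clamp(-1.0, 1.0)

def maybe_update_actor(step):
    # Delayed policy updates: the actor and its target move only every `policy_delay`
    # critic updates; in between, only the critics are trained.
    if step % policy_delay != 0:
        return
    for p, tp in zip(actor.parameters(), actor_target.parameters()):
        tp.data.mul_(1 - tau).add_(tau * p.data)             # Polyak averaging
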
envs:
  CartPole-v0:
    solved_reward: !!float 195  # used for early out in RL agent training
    max_steps: !!int 200  # maximum number of steps per episode
    activation_fn: !!str leakyrelu  # activation function of the virtual environment
    hidden_size: !!int 83  # size of the hidden layer of the virtual environment
    hidden_layer: !!int 1  # number of hidden layers of the virtual environment
    info_dim: !!int 0  # additional information dimension from step function
    reward_env_type: !!int 0  # type of reward shaping function
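
solved_reward together with the agents' early_out_num suggests an early-stopping check on recent episode returns. A sketch of one plausible form of that check; the repository's actual criterion (and its interaction with early_out_virtual_diff) may differ.

from collections import deque

solved_reward, early_out_num = 195.0, 10
recent_returns = deque(maxlen=early_out_num)

def record_episode(episode_return):
    # Returns True once the average over the last `early_out_num` episodes reaches the
    # solved threshold, signalling that agent training can stop early.
    recent_returns.append(episode_return)
    return (len(recent_returns) == early_out_num
            and sum(recent_returns) / early_out_num >= solved_reward)
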