default_config_pendulum_ppo_opt.yaml

env_name: !!str Pendulum-v0
device: !!str cuda                           # torch device (e.g. 'cuda', 'cuda:0' or 'cpu')
render_env: !!bool False # render environment
agents:
  ppo:
    train_episodes: !!int 10000              # maximum number of episodes to optimize
    test_episodes: !!int 10                  # maximum number of episodes to evaluate
    init_episodes: !!int 0                   # number of episodes to fill the replay buffer
    update_episodes: !!float 13              # update policy every x episodes (can be float)
    ppo_epochs: !!int 170                    # update policy for x epochs
    gamma: !!float 0.952                     # discount factor
    lr: !!float 0.000919                     # learning rate
    vf_coef: !!float 0.204                   # value function coefficient (see PPO paper)
    ent_coef: !!float 0.00434                # entropy coefficient (see PPO paper)
    eps_clip: !!float 0.539                  # trust region size (see PPO paper)
    rb_size: !!int 1000000                   # size of the replay buffer
    same_action_num: !!int 1                 # number of consecutive steps the same action is repeated
    activation_fn: !!str tanh                # activation function for actor/critic ('tanh', 'relu', 'leakyrelu' or 'prelu')
    hidden_size: !!int 92                    # size of the actor/critic hidden layer
    hidden_layer: !!int 2                    # number of hidden layers
    action_std: !!float 1.57                 # action noise standard deviation
    print_rate: !!int 1                      # update rate of the average meters
    early_out_num: !!int 50                  # number of recent training episodes used to decide an early out
    early_out_virtual_diff: !!float 0.0291   # performance difference that triggers an early out for virtual envs
envs:
  Pendulum-v0:
    solved_reward: !!float -300              # used for early out in RL agent training
    max_steps: !!int 200                     # maximum number of steps per episode
    activation_fn: !!str leakyrelu           # activation function of the virtual environment
    hidden_size: !!int 32                    # size of the hidden layer of the virtual environment
    hidden_layer: !!int 2                    # number of hidden layers of the virtual environment
    info_dim: !!int 0                        # additional information dimension from step function
    reward_env_type: !!int 2                 # type of reward shaping function
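
For context, a minimal sketch of how a config in this format might be loaded and queried, assuming PyYAML is installed. The load_config helper and its CUDA fallback are illustrative assumptions, not part of this repository:

# Hypothetical loader sketch; yaml.safe_load resolves the explicit
# !!str/!!bool/!!int/!!float tags to the corresponding Python types.
import yaml
import torch

def load_config(path="default_config_pendulum_ppo_opt.yaml"):
    """Parse the YAML config and resolve the requested torch device."""
    with open(path) as f:
        config = yaml.safe_load(f)
    # Assumed fallback: use the CPU when CUDA is requested but unavailable.
    if config["device"].startswith("cuda") and not torch.cuda.is_available():
        config["device"] = "cpu"
    return config

config = load_config()
ppo_cfg = config["agents"]["ppo"]              # PPO hyperparameters
env_cfg = config["envs"][config["env_name"]]   # per-environment settings
print(ppo_cfg["lr"], ppo_cfg["gamma"], env_cfg["max_steps"])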