default_config_gridworld_reward_env.yaml
env_name: !!str Cliff  # environment to use (must match a key under envs)
device: !!str cpu  # torch device (e.g. cuda:0 or cpu)
render_env: !!bool False  # render the environment
agents:
  gtn:
    mode: !!str 'multi'  # 'single': run on a single PC / 'multi': run on multiple PCs
    max_iterations: !!int 50  # maximum number of GTN iterations
    num_threads_per_worker: !!int 1  # number of PyTorch/OMP threads per worker
    num_workers: !!int 16  # size of the population
    noise_std: !!float 1e-1  # standard deviation of the noise vector
    step_size: !!float 0.5  # Reptile step size
    nes_step_size: !!bool False  # if True, divide the step size by the number of workers
    mirrored_sampling: !!bool True  # if True, use mirrored sampling; otherwise plain Gaussian sampling
    num_grad_evals: !!int 1  # number of evaluations per gradient estimate
    grad_eval_type: !!str mean  # how to aggregate multiple gradient evaluations: mean or minmax
    weight_decay: !!float 0.0  # weight decay
    time_mult: !!float 3  # maximum allocated time per iteration, as a multiple of the average iteration time
    time_max: !!float 300  # maximum allocated time for the first iteration
    time_sleep_master: !!float 0.2  # sleep time per polling cycle while the master waits for data
    time_sleep_worker: !!float 2  # sleep time per polling cycle while a worker waits for data
    score_transform_type: !!int 3  # score transformation type (0-7)
    quit_when_solved: !!bool True  # if True, stop training once the environment has been solved
    synthetic_env_type: !!int 1  # 0: virtual env / 1: reward env
    unsolved_weight: !!float 100  # penalty weight applied to solvable environments that have not been solved yet
    agent_name: !!str QL  # which RL agent to use for meta-training
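  # The gtn block above parameterizes an evolution-strategies-style outer loop:
  # a population of workers evaluates Gaussian perturbations of the synthetic
  # env parameters, and the master aggregates the scores into an update. A
  # generic sketch of such an update (illustrative only, not this repo's code;
  # the actual score transform and Reptile-style step may differ):
  #
  #   import numpy as np
  #
  #   def nes_style_update(theta, evaluate, num_workers=16, noise_std=1e-1, step_size=0.5):
  #       eps = np.random.randn(num_workers // 2, theta.size) * noise_std
  #       eps = np.concatenate([eps, -eps])                # mirrored_sampling: True
  #       scores = np.array([evaluate(theta + e) for e in eps])
  #       scores = (scores - scores.mean()) / (scores.std() + 1e-8)  # one possible score transform
  #       return theta + step_size / (len(eps) * noise_std) * scores.dot(eps)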
  ql:
    train_episodes: !!int 100  # maximum number of training episodes
    test_episodes: !!int 1  # maximum number of evaluation episodes
    init_episodes: !!int 0  # number of initial episodes during which no early out can happen
    batch_size: !!int 1  # batch size for a policy update step -> keep at 1
    alpha: !!float 1  # Q-Learning update factor (learning rate)
    gamma: !!float 0.8  # discount factor
    eps_init: !!float 0.01  # initial random action probability
    eps_min: !!float 0.01  # final random action probability
    eps_decay: !!float 0.0  # random action decay factor
    rb_size: !!int 1  # size of the replay buffer
    same_action_num: !!int 1  # number of consecutive times each action is repeated
    beta: !!float 0.005  # beta parameter for count-based exploration; only used if count_based is True
    print_rate: 100  # update rate of the average meters
    early_out_num: 10  # number of recent training episodes used to decide an early out
    early_out_virtual_diff: !!float 2e-2  # performance difference that triggers an early out on virtual envs
  sarsa:
    train_episodes: !!int 100  # maximum number of training episodes
    test_episodes: !!int 1  # maximum number of evaluation episodes
    init_episodes: !!int 0  # number of initial episodes during which no early out can happen
    batch_size: !!int 1  # batch size for a policy update step -> keep at 1
    alpha: !!float 1  # SARSA update factor (learning rate)
    gamma: !!float 0.8  # discount factor
    eps_init: !!float 0.1  # initial random action probability
    eps_min: !!float 0.1  # final random action probability
    eps_decay: !!float 0.0  # random action decay factor
    rb_size: !!int 1  # size of the replay buffer
    same_action_num: !!int 1  # number of consecutive times each action is repeated
    beta: !!float 0.005  # beta parameter for count-based exploration; only used if count_based is True
    print_rate: 100  # update rate of the average meters
    early_out_num: 10  # number of recent training episodes used to decide an early out
    early_out_virtual_diff: !!float 2e-2  # performance difference that triggers an early out on virtual envs
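  # Both agent blocks above describe tabular agents with an epsilon-greedy policy
  # (random action with probability eps, greedy otherwise). For reference, the
  # standard textbook updates that alpha and gamma parameterize are:
  #   Q-Learning: Q[s, a] += alpha * (r + gamma * max(Q[s2]) - Q[s, a])
  #   SARSA:      Q[s, a] += alpha * (r + gamma * Q[s2, a2]  - Q[s, a])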
envs:
  EmptyRoom22:
    solved_reward: !!float 0.8  # reward threshold at which the environment counts as solved
    max_steps: !!int 10  # maximum number of steps per episode
    activation_fn: !!str prelu  # activation function of the virtual environment
    hidden_size: 32  # size of the hidden layer of the virtual environment
    hidden_layer: !!int 1  # number of hidden layers of the virtual environment
    info_dim: !!int 0  # additional information dimension from the step function
    reward_env_type: !!int 2  # type of reward shaping function
  EmptyRoom23:
    solved_reward: !!float 0.8  # reward threshold at which the environment counts as solved
    max_steps: !!int 15  # maximum number of steps per episode
    activation_fn: !!str prelu  # activation function of the virtual environment
    hidden_size: 32  # size of the hidden layer of the virtual environment
    hidden_layer: !!int 1  # number of hidden layers of the virtual environment
    info_dim: !!int 0  # additional information dimension from the step function
    reward_env_type: !!int 2  # type of reward shaping function
  EmptyRoom33:
    solved_reward: !!float 0.8  # reward threshold at which the environment counts as solved
    max_steps: !!int 20  # maximum number of steps per episode
    activation_fn: !!str prelu  # activation function of the virtual environment
    hidden_size: 32  # size of the hidden layer of the virtual environment
    hidden_layer: !!int 1  # number of hidden layers of the virtual environment
    info_dim: !!int 0  # additional information dimension from the step function
    reward_env_type: !!int 2  # type of reward shaping function
  WallRoom:
    solved_reward: !!float 0.8  # reward threshold at which the environment counts as solved
    max_steps: !!int 30  # maximum number of steps per episode
    activation_fn: !!str prelu  # activation function of the virtual environment
    hidden_size: 32  # size of the hidden layer of the virtual environment
    hidden_layer: !!int 1  # number of hidden layers of the virtual environment
    info_dim: !!int 0  # additional information dimension from the step function
    reward_env_type: !!int 2  # type of reward shaping function
  HoleRoom:
    solved_reward: !!float 0.8  # reward threshold at which the environment counts as solved
    max_steps: !!int 30  # maximum number of steps per episode
    activation_fn: !!str prelu  # activation function of the virtual environment
    hidden_size: 32  # size of the hidden layer of the virtual environment
    hidden_layer: !!int 1  # number of hidden layers of the virtual environment
    info_dim: !!int 0  # additional information dimension from the step function
    reward_env_type: !!int 2  # type of reward shaping function
  HoleRoomLarge:
    solved_reward: !!float 0.8  # reward threshold at which the environment counts as solved
    max_steps: !!int 30  # maximum number of steps per episode
    activation_fn: !!str prelu  # activation function of the virtual environment
    hidden_size: 32  # size of the hidden layer of the virtual environment
    hidden_layer: !!int 2  # number of hidden layers of the virtual environment
    info_dim: !!int 0  # additional information dimension from the step function
    reward_env_type: !!int 2  # type of reward shaping function
  HoleRoomLargeShifted:
    solved_reward: !!float 0.8  # reward threshold at which the environment counts as solved
    max_steps: !!int 30  # maximum number of steps per episode
    activation_fn: !!str prelu  # activation function of the virtual environment
    hidden_size: 32  # size of the hidden layer of the virtual environment
    hidden_layer: !!int 1  # number of hidden layers of the virtual environment
    info_dim: !!int 0  # additional information dimension from the step function
    reward_env_type: !!int 2  # type of reward shaping function
  Cliff:
    solved_reward: !!float -20  # reward threshold at which the environment counts as solved
    max_steps: !!int 50  # maximum number of steps per episode
    activation_fn: !!str prelu  # activation function of the virtual environment
    hidden_size: 32  # size of the hidden layer of the virtual environment
    hidden_layer: !!int 1  # number of hidden layers of the virtual environment
    info_dim: !!int 0  # additional information dimension from the step function
    reward_env_type: !!int 2  # type of reward shaping function
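# Usage sketch (not part of the original config): one way to load this file and
# pick out the settings for the active environment, assuming PyYAML and the key
# layout above:
#
#   import yaml
#
#   with open("default_config_gridworld_reward_env.yaml") as f:
#       config = yaml.safe_load(f)
#
#   env_name = config["env_name"]          # "Cliff"
#   env_config = config["envs"][env_name]  # solved_reward, max_steps, ...
#   ql_config = config["agents"]["ql"]     # Q-Learning hyperparameters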