default_config_acrobot_syn_env.yaml
env_name: !!str Acrobot-v1
device: !!str cpu # torch device (cuda:0 or cpu)
render_env: !!bool False # render environment
agents:
  gtn:
    mode: !!str 'multi' # 'single': run on a single PC / 'multi': run on multiple PCs
    max_iterations: !!int 200 # maximum number of GTN iterations
    num_threads_per_worker: !!int 1 # how many pytorch/OMP threads per worker
    num_workers: !!int 16 # size of the population
    noise_std: !!float 0.0114 # standard deviation of the noise vector
    step_size: !!float 0.727 # reptile step size
    nes_step_size: !!bool False # when set to true, divide the step size by the number of workers
    mirrored_sampling: !!bool True # use normal or mirrored sampling
    num_grad_evals: !!int 1 # how often to evaluate each gradient
    grad_eval_type: !!str mean # mean or minmax
    weight_decay: !!float 0.0 # weight decay
    time_mult: !!float 3 # maximum allocated time per iteration as a multiple of the average iteration time
    time_max: !!float 600 # maximum allocated time for the first iteration
    time_sleep_master: !!float 0.2 # cycle sleeping time when waiting for data
    time_sleep_worker: !!float 2 # cycle sleeping time when waiting for data
    score_transform_type: !!int 3 # which score transformation to use (0-7)
    quit_when_solved: !!bool False # stop training once the environment has been solved?
    synthetic_env_type: !!int 0 # 0: virtual env / 1: reward env
    unsolved_weight: !!float 10000 # penalty weight for all solvable environments if they haven't been solved yet
    agent_name: !!str DDQN # which RL agent to use for meta-training
  ddqn_vary:
    vary_hp: !!bool True # vary hyperparameters of the underlying DDQN algorithm?
  ddqn:
    train_episodes: !!int 1000 # maximum number of episodes to train
    test_episodes: !!int 10 # maximum number of episodes to test
    init_episodes: !!int 20 # number of episodes to fill the replay buffer
    batch_size: !!int 149 # batch size when running a policy update step
    gamma: !!float 0.991 # discount factor
    lr: !!float 0.00222 # learning rate
    tau: !!float 0.0209 # target network update rate
    eps_init: !!float 0.904 # initial random-action probability
    eps_min: !!float 0.0471 # final random-action probability
    eps_decay: !!float 0.899 # random-action decay factor
    rb_size: !!int 100000 # size of the replay buffer
    same_action_num: !!int 1 # how often to repeat the same action consecutively
    activation_fn: !!str leakyrelu # activation function for actor/critic ('tanh', 'relu', 'leakyrelu' or 'prelu')
    hidden_size: !!int 112 # size of the actor/critic hidden layer
    hidden_layer: !!int 1 # number of hidden layers
    print_rate: !!int 1 # update rate of avg meters
    early_out_num: !!int 10 # number of recent training episodes over which the early-out criterion is evaluated
    early_out_virtual_diff: !!float 1e-2 # performance difference for an early out for virtual envs
  icm:
    lr: !!float 1e-4 # learning rate
    beta: !!float 0.2 # weight used to trade off action vs state prediction in the ICM loss function
    eta: !!float 0.5 # weight used to compute intrinsic reward from loss value
    feature_dim: !!int 32 # feature dimension of features model
    hidden_size: !!int 128 # size of all ICM sub-network layers
  td3_discrete_vary:
    train_episodes: !!int 1000 # maximum number of episodes to optimize
    test_episodes: !!int 10 # maximum number of episodes to evaluate
    init_episodes: !!int 1 # number of episodes to fill the replay buffer
    batch_size: !!int 122 # batch size when running a policy update step
    gamma: !!float 0.9989 # discount factor
    lr: !!float 0.0017496 # learning rate
    tau: !!float 0.0724303 # target network update rate
    policy_delay: !!int 1 # frequency of delayed policy updates
    rb_size: !!int 1000000 # size of the replay buffer
    same_action_num: !!int 1 # how often to repeat the same action consecutively
    activation_fn: !!str tanh # activation function for actor/critic ('tanh', 'relu', 'leakyrelu' or 'prelu')
    hidden_size: !!int 510 # size of the actor/critic hidden layer
    hidden_layer: !!int 2 # number of hidden layers
    action_std: !!float 0.037275 # action noise standard deviation
    policy_std: !!float 0.2225286 # policy noise standard deviation
    policy_std_clip: !!float 0.5 # clipping value for the policy noise
    print_rate: !!int 1 # update rate of avg meters
    early_out_num: !!int 1 # number of recent training episodes over which the early-out criterion is evaluated
    early_out_virtual_diff: !!float 1e-2 # performance difference for an early out for virtual envs
    gumbel_softmax_temp: !!float 2.3076235 # controls the level of soft-discretization of the continuous distribution
    gumbel_softmax_hard: !!bool True # whether the returned samples should be discretized as one-hot vectors
    vary_hp: !!bool False # vary hyperparameters of the underlying TD3 algorithm?
envs:
  Acrobot-v1:
    solved_reward: !!float -100 # used for early out in RL agent training
    max_steps: !!int 500 # maximum number of steps per episode
    activation_fn: !!str prelu # activation function of the virtual environment
    hidden_size: !!int 167 # size of the hidden layer of the virtual environment
    hidden_layer: !!int 1 # number of hidden layers of the virtual environment
    info_dim: !!int 0 # additional information dimension from the step function
    reward_env_type: !!int 0 # type of reward shaping function
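
How this file is consumed is not shown here. As a minimal sketch, assuming the config is read with PyYAML, the explicit `!!str`/`!!int`/`!!float`/`!!bool` tags resolve to ordinary Python types and the nested sections become plain dictionaries. The file path and variable names below are illustrative, not the repository's actual loader.

```python
# Minimal sketch (not the repository's loader): read the config with PyYAML
# and look up a few of the hyperparameters defined above.
import yaml

with open("default_config_acrobot_syn_env.yaml") as f:  # path is an assumption
    config = yaml.safe_load(f)  # !!str/!!int/!!float/!!bool resolve to str/int/float/bool

gtn_cfg = config["agents"]["gtn"]
ddqn_cfg = config["agents"]["ddqn"]
env_cfg = config["envs"][config["env_name"]]

print(gtn_cfg["num_workers"])    # 16 workers in the GTN population
print(ddqn_cfg["lr"])            # 0.00222
print(env_cfg["solved_reward"])  # -100.0
```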
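The `eps_init`, `eps_min` and `eps_decay` values in the `ddqn` block describe an epsilon-greedy exploration schedule. The sketch below assumes a multiplicative per-episode decay floored at `eps_min`; that is a common convention, not something the config itself confirms.

```python
# Hedged sketch of an epsilon-greedy schedule built from the ddqn values above.
# Assumption: epsilon decays multiplicatively once per episode and is clipped
# at eps_min; the repository may decay per step or use a different rule.
def epsilon_schedule(episode, eps_init=0.904, eps_min=0.0471, eps_decay=0.899):
    return max(eps_min, eps_init * eps_decay ** episode)

for ep in (0, 5, 10, 25):
    print(ep, round(epsilon_schedule(ep), 4))
```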