Feature/mla 2205 separate schedule lr beta epsilon #5538

Merged
Changes from 8 commits
1 change: 1 addition & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -41,6 +41,7 @@ terminated teammates. (#5441)
episode. (#5375)

#### ml-agents / ml-agents-envs / gym-unity (Python)
- Added support for having `beta`, `epsilon`, and `learning rate` on separate schedules (affects only PPO and POCA). (#5538)
- The calculation of the target entropy of SAC with continuous actions was incorrect and has been fixed. (#5372)
- Fixed an issue where the histogram stats would not be reported correctly in TensorBoard. (#5410)
- Fixed error when importing models which use the ResNet encoder. (#5358)
2 changes: 2 additions & 0 deletions config/imitation/Crawler.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: true
hidden_units: 512
2 changes: 2 additions & 0 deletions config/imitation/Hallway.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 128
2 changes: 2 additions & 0 deletions config/imitation/PushBlock.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 256
2 changes: 2 additions & 0 deletions config/poca/DungeonEscape.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
beta_schedule: constant
epsilon_schedule: constant
network_settings:
normalize: false
hidden_units: 256
2 changes: 2 additions & 0 deletions config/poca/PushBlockCollab.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
beta_schedule: constant
epsilon_schedule: constant
network_settings:
normalize: false
hidden_units: 256
2 changes: 2 additions & 0 deletions config/poca/SoccerTwos.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
beta_schedule: constant
epsilon_schedule: constant
network_settings:
normalize: false
hidden_units: 512
4 changes: 4 additions & 0 deletions config/poca/StrikersVsGoalie.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
beta_schedule: constant
epsilon_schedule: constant
network_settings:
normalize: false
hidden_units: 512
@@ -41,6 +43,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
beta_schedule: constant
epsilon_schedule: constant
network_settings:
normalize: false
hidden_units: 512
2 changes: 2 additions & 0 deletions config/ppo/3DBall.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.99
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: true
hidden_units: 128
2 changes: 2 additions & 0 deletions config/ppo/3DBallHard.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: true
hidden_units: 128
2 changes: 2 additions & 0 deletions config/ppo/3DBall_randomize.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.99
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: true
hidden_units: 128
2 changes: 2 additions & 0 deletions config/ppo/Basic.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 20
2 changes: 2 additions & 0 deletions config/ppo/Crawler.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: true
hidden_units: 512
2 changes: 2 additions & 0 deletions config/ppo/FoodCollector.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 256
2 changes: 2 additions & 0 deletions config/ppo/GridWorld.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 128
2 changes: 2 additions & 0 deletions config/ppo/Hallway.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 128
2 changes: 2 additions & 0 deletions config/ppo/Match3.yaml
@@ -9,6 +9,8 @@ default_settings:
lambd: 0.99
num_epoch: 3
learning_rate_schedule: constant
beta_schedule: constant
epsilon_schedule: constant
network_settings:
normalize: true
hidden_units: 256
2 changes: 2 additions & 0 deletions config/ppo/PushBlock.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 256
2 changes: 2 additions & 0 deletions config/ppo/Pyramids.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 512
2 changes: 2 additions & 0 deletions config/ppo/PyramidsRND.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 512
2 changes: 2 additions & 0 deletions config/ppo/Sorter_curriculum.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
beta_schedule: constant
epsilon_schedule: constant
network_settings:
normalize: False
hidden_units: 128
2 changes: 2 additions & 0 deletions config/ppo/Visual3DBall.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 128
2 changes: 2 additions & 0 deletions config/ppo/VisualFoodCollector.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 128
2 changes: 2 additions & 0 deletions config/ppo/Walker.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: true
hidden_units: 512
4 changes: 4 additions & 0 deletions config/ppo/WallJump.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 256
@@ -34,6 +36,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 256
4 changes: 4 additions & 0 deletions config/ppo/WallJump_curriculum.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 256
@@ -34,6 +36,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 256
2 changes: 2 additions & 0 deletions config/ppo/Worm.yaml
@@ -10,6 +10,8 @@ behaviors:
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: linear
epsilon_schedule: linear
network_settings:
normalize: true
hidden_units: 512
2 changes: 2 additions & 0 deletions docs/Learning-Environment-Create-New.md
@@ -417,6 +417,8 @@ behaviors:
lambd: 0.99
num_epoch: 3
learning_rate_schedule: linear
beta_schedule: constant
epsilon_schedule: linear
network_settings:
normalize: false
hidden_units: 128
4 changes: 3 additions & 1 deletion docs/Training-Configuration-File.md
@@ -38,7 +38,7 @@ choice of the trainer (which we review on subsequent sections).
| `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase. <br><br>Typical range: `1e-5` - `1e-3` |
| `hyperparameters -> batch_size` | Number of experiences in each iteration of gradient descent. **This should always be multiple times smaller than `buffer_size`**. If you are using continuous actions, this value should be large (on the order of 1000s). If you are using only discrete actions, this value should be smaller (on the order of 10s). <br><br> Typical range: (Continuous - PPO): `512` - `5120`; (Continuous - SAC): `128` - `1024`; (Discrete, PPO & SAC): `32` - `512`. |
| `hyperparameters -> buffer_size` | (default = `10240` for PPO and `50000` for SAC)<br> **PPO:** Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. <br> **SAC:** The max size of the experience buffer - on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences. <br><br>Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` |
| `hyperparameters -> learning_rate_schedule` | (default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally. <br><br>`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. |
| `network_settings -> hidden_units` | (default = `128`) Number of units in the hidden layers of the neural network. Correspond to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger. <br><br> Typical range: `32` - `512` |
| `network_settings -> num_layers` | (default = `2`) The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems. <br><br> Typical range: `1` - `3` |
| `network_settings -> normalize` | (default = `false`) Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. |
@@ -59,6 +59,8 @@ the `trainer` setting above).
| :---------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `hyperparameters -> beta` | (default = `5.0e-3`) Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`. <br><br>Typical range: `1e-4` - `1e-2` |
| `hyperparameters -> epsilon` | (default = `0.2`) Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process. <br><br>Typical range: `0.1` - `0.3` |
| `hyperparameters -> beta_schedule` | (default = `linear`) Determines how beta changes over time. <br><br>`linear` decays beta linearly, reaching 0 at max_steps, while `constant` keeps beta constant for the entire training run. |
| `hyperparameters -> epsilon_schedule` | (default = `linear`) Determines how epsilon changes over time (PPO only). <br><br>`linear` decays epsilon linearly, reaching 0 at max_steps, while `constant` keeps epsilon constant for the entire training run. |
| `hyperparameters -> lambd` | (default = `0.95`) Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process. <br><br>Typical range: `0.9` - `0.95` |
| `hyperparameters -> num_epoch` | (default = `3`) Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |

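To make the schedule semantics in the table above concrete, here is a minimal sketch (not ML-Agents source; the function name and the exact interpolation are assumptions drawn from the table) of the value a `linear` or `constant` schedule yields at a given training step:

def scheduled_value(schedule: str, initial: float, step: int, max_steps: int) -> float:
    # `constant` keeps the initial value for the whole run.
    if schedule == "constant":
        return initial
    # `linear` decays the initial value toward 0, reaching it at max_steps.
    if schedule == "linear":
        return initial * max(0.0, 1.0 - step / max_steps)
    raise ValueError(f"Unknown schedule: {schedule}")

# Halfway through a 500k-step run, with the default epsilon and beta:
print(scheduled_value("linear", 0.2, 250_000, 500_000))       # epsilon decayed to ~0.1
print(scheduled_value("constant", 5.0e-3, 250_000, 500_000))  # beta held at 5e-3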
2 changes: 2 additions & 0 deletions docs/Training-ML-Agents.md
@@ -269,7 +269,9 @@ behaviors:
# PPO-specific hyperparameters
# Replaces the "PPO-specific hyperparameters" section above
beta: 5.0e-3
beta_schedule: constant
epsilon: 0.2
epsilon_schedule: linear
lambd: 0.95
num_epoch: 3

4 changes: 2 additions & 2 deletions ml-agents/mlagents/trainers/poca/optimizer_torch.py
@@ -172,13 +172,13 @@ def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
self.trainer_settings.max_steps,
)
self.decay_epsilon = ModelUtils.DecayedValue(
self.hyperparameters.learning_rate_schedule,
self.hyperparameters.epsilon_schedule,
self.hyperparameters.epsilon,
0.1,
self.trainer_settings.max_steps,
)
self.decay_beta = ModelUtils.DecayedValue(
self.hyperparameters.learning_rate_schedule,
self.hyperparameters.beta_schedule,
self.hyperparameters.beta,
1e-5,
self.trainer_settings.max_steps,
4 changes: 2 additions & 2 deletions ml-agents/mlagents/trainers/ppo/optimizer_torch.py
@@ -50,13 +50,13 @@ def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
self.trainer_settings.max_steps,
)
self.decay_epsilon = ModelUtils.DecayedValue(
self.hyperparameters.learning_rate_schedule,
self.hyperparameters.epsilon_schedule,
self.hyperparameters.epsilon,
0.1,
self.trainer_settings.max_steps,
)
self.decay_beta = ModelUtils.DecayedValue(
self.hyperparameters.learning_rate_schedule,
self.hyperparameters.beta_schedule,
self.hyperparameters.beta,
1e-5,
self.trainer_settings.max_steps,
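Taken together, the POCA and PPO optimizer changes above decouple the epsilon and beta decays from `learning_rate_schedule`: each `ModelUtils.DecayedValue` is now built from its own schedule field. Below is a rough, self-contained sketch of that behavior; the class and its `get_value` method are illustrative stand-ins, not the library implementation, and only the constructor arguments (schedule, initial value, minimum value, max_steps) are taken from the diff:

from enum import Enum

class Schedule(Enum):
    CONSTANT = "constant"
    LINEAR = "linear"

class DecayedValueSketch:
    # Illustrative stand-in for ModelUtils.DecayedValue(schedule, initial, min, max_steps).
    def __init__(self, schedule: Schedule, initial: float, min_value: float, max_steps: int):
        self.schedule = schedule
        self.initial = initial
        self.min_value = min_value
        self.max_steps = max_steps

    def get_value(self, global_step: int) -> float:
        if self.schedule == Schedule.CONSTANT:
            return self.initial
        # Linear interpolation from the initial value toward min_value at max_steps.
        frac = min(global_step / self.max_steps, 1.0)
        return self.initial + frac * (self.min_value - self.initial)

# Mirrors the shape of the constructor calls in the diff: epsilon and beta now
# follow their own schedule fields instead of learning_rate_schedule.
max_steps = 500_000
decay_epsilon = DecayedValueSketch(Schedule.LINEAR, 0.2, 0.1, max_steps)     # epsilon_schedule: linear
decay_beta = DecayedValueSketch(Schedule.CONSTANT, 5.0e-3, 1e-5, max_steps)  # beta_schedule: constant

print(decay_epsilon.get_value(250_000))  # ~0.15: epsilon decays toward its floor of 0.1
print(decay_beta.get_value(250_000))     # 0.005: beta stays fixed under a constant schedule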
4 changes: 4 additions & 0 deletions ml-agents/mlagents/trainers/settings.py
@@ -91,6 +91,8 @@ class EncoderType(Enum):
class ScheduleType(Enum):
CONSTANT = "constant"
LINEAR = "linear"
# TODO add support for lesson based scheduling
# LESSON = "lesson"


class ConditioningType(Enum):
@@ -151,6 +153,8 @@ class PPOSettings(HyperparamSettings):
lambd: float = 0.95
num_epoch: int = 3
learning_rate_schedule: ScheduleType = ScheduleType.LINEAR
beta_schedule: ScheduleType = ScheduleType.LINEAR
epsilon_schedule: ScheduleType = ScheduleType.LINEAR


@attr.s(auto_attribs=True)