Hyperparameter optimization #228

Merged · 22 commits · Aug 26, 2022

Changes from 1 commit
66 changes: 49 additions & 17 deletions cleanrl_utils/tuner.py
@@ -2,7 +2,7 @@
import runpy
import sys
import time
from typing import Callable
from typing import Callable, Dict, List, Optional

import numpy as np
import optuna
@@ -26,22 +26,42 @@ def __init__(
        self,
        script: str,
        metric: str,
        target_scores: dict[str, list[int]],
        params_fn: Callable[[optuna.Trial], dict],
        target_scores: Dict[str, Optional[List[float]]],
        params_fn: Callable[[optuna.Trial], Dict],
        direction: str = "maximize",
        aggregation_type: str = "average",
        metric_last_n_average_window: int = 50,
        pruner: optuna.pruners.BasePruner = None,
        sampler: Optional[optuna.samplers.BaseSampler] = None,
        pruner: Optional[optuna.pruners.BasePruner] = None,
        storage: str = "sqlite:///cleanrl_hpopt.db",
        study_name: str = "",
        wandb_kwargs: dict[str, any] = {},
        wandb_kwargs: Dict[str, any] = {},
    ) -> None:
        self.script = script
        self.metric = metric
        self.target_scores = target_scores
        if len(self.target_scores) > 1:
            if None in self.target_scores.values():
                raise ValueError(
                    "If there are multiple environments, the target scores must be specified for each environment."
                )

        self.params_fn = params_fn
        self.direction = direction
        self.aggregation_type = aggregation_type
        if self.aggregation_type == "average":
            self.aggregation_fn = np.average
        elif self.aggregation_type == "median":
            self.aggregation_fn = np.median
        elif self.aggregation_type == "max":
            self.aggregation_fn = np.max
        elif self.aggregation_type == "min":
            self.aggregation_fn = np.min
        else:
            raise ValueError(f"Unknown aggregation type {self.aggregation_type}")
        self.metric_last_n_average_window = metric_last_n_average_window
        self.pruner = pruner
        self.sampler = sampler
        self.storage = storage
        self.study_name = study_name
        if len(self.study_name) == 0:
@@ -63,8 +83,9 @@ def objective(trial: optuna.Trial):
                )

            algo_command = [f"--{key}={value}" for key, value in params.items()]
            normalized_scores = []
            normalized_scoress = []
            for seed in range(num_seeds):
                normalized_scores = []
                for env_id in self.target_scores.keys():
                    sys.argv = algo_command + [f"--env-id={env_id}", f"--seed={seed}", "--track=False"]
                    with HiddenPrints():
@@ -81,28 +102,39 @@
                    print(
                        f"The average episodic return on {env_id} is {np.average(metric_values)} averaged over the last {self.metric_last_n_average_window} episodes."
                    )
                    normalized_scores += [
                        (np.average(metric_values) - self.target_scores[env_id][0])
                        / (self.target_scores[env_id][1] - self.target_scores[env_id][0])
                    ]
                    if self.target_scores[env_id] is not None:
                        normalized_scores += [
                            (np.average(metric_values) - self.target_scores[env_id][0])
                            / (self.target_scores[env_id][1] - self.target_scores[env_id][0])
                        ]
                    else:
                        normalized_scores += [np.average(metric_values)]
                    if run:
                        run.log({f"{env_id}_return": np.average(metric_values)})
                print(f"The normalized score is {np.average(normalized_scores)} with num_seeds={seed}")
                trial.report(np.average(normalized_scores), step=seed)

                normalized_scoress += [normalized_scores]
                aggregated_normalized_score = self.aggregation_fn(normalized_scores)
                print(f"The {self.aggregation_type} normalized score is {aggregated_normalized_score} with num_seeds={seed}")
                trial.report(aggregated_normalized_score, step=seed)
                if run:
                    run.log({"aggregated_normalized_score": aggregated_normalized_score})
                if trial.should_prune():
                    run.finish(quiet=True)
                    if run:
                        run.finish(quiet=True)
                    raise optuna.TrialPruned()
            if run:
                run.log({"normalized_scores": np.average(normalized_scores)})

                run.finish(quiet=True)
            return np.average(normalized_scores)
            if run:
                run.finish(quiet=True)
            return np.average(
                self.aggregation_fn(normalized_scoress, axis=1)
            )  # we always return the average of the aggregated normalized scores

        study = optuna.create_study(
            study_name=self.study_name,
            direction=self.direction,
            storage=self.storage,
            pruner=self.pruner,
            sampler=self.sampler,
        )
        print("==========================================================================================")
        print("run another tuner with the following command:")
34 changes: 24 additions & 10 deletions docs/advanced/hyperparameter-tuning.md
@@ -13,8 +13,9 @@ tuner = Tuner(
    metric="charts/episodic_return",
    metric_last_n_average_window=50,
    direction="maximize",
    aggregation_type="average",
    target_scores={
        "CartPole-v1": [0, 500],
        "CartPole-v1": None,
    },
    params_fn=lambda trial: {
        "learning-rate": trial.suggest_loguniform("learning-rate", 0.0003, 0.003),
@@ -36,16 +37,16 @@ tuner.tune(
Then you can run the tuner with

```bash
poetry install optuna
poetry install -E optuna
python tuner_example.py
```

Here is what happened:

1. The `tuner_example.py` launches `num_trials=100` *trials* to find the best single set of hyperparameters for `CartPole-v1` in `script="cleanrl/ppo.py"`.
1. Each *trial* uses a set of hyperparameters sampled from the `params_fn` to run `num_seeds=3` *experiments* with different random seeds, mitigating the impact of randomness on the results.
* In each *experiment*, `tuner_example.py` averages the last `metric_last_n_average_window=50` reported `metric="charts/episodic_return"` to a number $x_i$ and calculate a normalized score $z_i = (x_i - 0) / (500 - 0)$ according to `target_scores`, where $0$ and $500$ are the minimum and maximum values of the `metric` for `CartPole-v1`.
1. Each *trial* then averages the normalized scores $z_i$ of the three *experiments* to a number $z$ and the tuner optimizes $z$ according `direction="maximize"`.
1. Each *trial* samples a set of hyperparameters from the `params_fn` to run `num_seeds=3` *experiments* with different random seeds, mitigating the impact of randomness on the results.
    * In each *experiment*, `tuner_example.py` averages the last `metric_last_n_average_window=50` reported `metric="charts/episodic_return"` values to a number $x_i$ and calculates a normalized score $z_i$ according to `target_scores`. We will cover `target_scores`, which is useful when working with multiple environments, in the next section; here we simply set it to `None` for `CartPole-v1`, which means the normalized score is just $z_i = x_i$.
1. Each *trial* then averages the normalized scores $z_i$ of the `num_seeds=3` *experiments* to a number $z$, and the tuner optimizes $z$ according to `direction="maximize"` (see the sketch below).
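
To make the bookkeeping above concrete, here is a minimal, self-contained sketch of the per-trial logic for the single-environment `CartPole-v1` setup with `target_scores` set to `None`. It is not the actual `Tuner` implementation: `run_experiment`, its toy scoring formula, the single `learning-rate` search space, and the pruner choice are all hypothetical stand-ins for launching `cleanrl/ppo.py` and averaging the last 50 episodic returns.

```python
# A hedged sketch of one trial: sample hyperparameters, run a few seeds,
# report the running average to Optuna, and let the pruner stop bad trials early.
import numpy as np
import optuna


def run_experiment(learning_rate: float, seed: int) -> float:
    """Hypothetical stand-in for running `cleanrl/ppo.py` and averaging the
    last 50 episodic returns; replace with a real training run."""
    rng = np.random.default_rng(seed)
    return 500 * min(learning_rate / 0.003, 1.0) + rng.normal(scale=10.0)


def objective(trial: optuna.Trial) -> float:
    params = {"learning-rate": trial.suggest_float("learning-rate", 0.0003, 0.003, log=True)}
    normalized_scores = []
    for seed in range(3):  # num_seeds=3
        x_i = run_experiment(params["learning-rate"], seed)
        z_i = x_i  # target_scores["CartPole-v1"] is None, so z_i = x_i
        normalized_scores.append(z_i)
        trial.report(float(np.average(normalized_scores)), step=seed)
        if trial.should_prune():
            raise optuna.TrialPruned()
    return float(np.average(normalized_scores))  # the z that the tuner maximizes


study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner(n_startup_trials=5))
study.optimize(objective, n_trials=10)
print(study.best_params)
```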


## Visualization
@@ -59,18 +60,23 @@ poetry run optuna-dashboard sqlite:///cleanrl_hpopt.db
You can use a different database by passing `Tuner(..., storage="mysql://root@localhost/example")`, for example.




## Work w/ multiple environments

`Tuner` supports finding a set of hyper parameters of that works well against multiple environments by extending `target_scores`. In the following example, each trial uses a set of hyperparameters to run experiments with 3 random seeds for each environment in `["CartPole-v1","Acrobot-v1"]`, totalling `2*3=6` experiments per trial.
Often it is useful to find a single set of hyperparameters that works well with multiple environments. This is challenging **because each environment may have a different reward scale**, so the scores need to be normalized.

This is where `target_scores` comes in. We can use it to specify a lower and an upper reward threshold for each environment (they don't have to be exact boundaries and can be just ballpark estimates). For example, if we want to find a set of hyperparameters that works well with both `CartPole-v1` and `Acrobot-v1`, we can set the `target_scores` and `aggregation_type` as follows:


```python title="tuner_example_multi_env.py" hl_lines="8 9"
```python title="tuner_example_multi_env.py" hl_lines="7-11"
from cleanrl_utils.tuner import Tuner
tuner = Tuner(
    script="cleanrl/ppo.py",
    metric="charts/episodic_return",
    metric_last_n_average_window=50,
    direction="maximize",
    aggregation_type="average",
    target_scores={
        "CartPole-v1": [0, 500],
        "Acrobot-v1": [-500, 0],
@@ -92,17 +98,23 @@ tuner.tune(
)
```

Then each experiment will calculate the normalized scores $z_{i_0} = (x_{i_0} - 0) / (500 - 0)$ and $z_{i_1} = (x_{i_1} - (-500)) / (0 - (-500))$, and the tuner will aggregate the two scores according to `aggregation_type="average"` to get the normalized score $z_i = (z_{i_0} + z_{i_1}) / 2$.

Note that we are using 3 random seeds for each environment in `["CartPole-v1","Acrobot-v1"]`, totalling `2*3=6` experiments per trial.
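
As a quick sanity check of this arithmetic, here is a small sketch with made-up returns (400 for `CartPole-v1` and -100 for `Acrobot-v1`); the numbers are hypothetical, not results from an actual run:

```python
import numpy as np

target_scores = {"CartPole-v1": [0, 500], "Acrobot-v1": [-500, 0]}
raw_returns = {"CartPole-v1": 400.0, "Acrobot-v1": -100.0}  # hypothetical x_{i_0}, x_{i_1}

normalized_scores = [
    (raw_returns[env_id] - low) / (high - low)
    for env_id, (low, high) in target_scores.items()
]
# CartPole-v1: (400 - 0) / (500 - 0) = 0.8
# Acrobot-v1:  (-100 - (-500)) / (0 - (-500)) = 0.8
print(np.average(normalized_scores))  # aggregation_type="average" -> z_i = 0.8
```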



???+ info

When optimizing Atari games, you can use `target_scores` as the human normalized scores in (Mnih et al., 2015, Extended Data Table 2)[^1], such as
When optimizing Atari games, you can put the human normalized scores from (Mnih et al., 2015, Extended Data Table 2)[^1] in `target_scores`, as done in the following example. The first number for each environment is the score obtained by random play and the second is the score obtained by professional game testers.

```python
tuner = Tuner(
    script="cleanrl/ppo_atari.py",
    metric="charts/episodic_return",
    metric_last_n_average_window=50,
    direction="maximize",
    aggregation_type="average",
    target_scores={
        "Alien-v5": [227.8, 6875],
        "Amidar-v5": [5.8, 1676],
@@ -120,14 +132,15 @@

You can use `Tuner` with any [pruner](https://optuna.readthedocs.io/en/stable/reference/pruners.html) from `optuna` to prune less promising experiments or [samplers](https://optuna.readthedocs.io/en/stable/reference/samplers.html) to sample new hyperparameters. If you don't specify them explicitly, the script will use the [default ones](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.create_study.html).

```python title="tuner_example_pruner.py" hl_lines="1 22 23"
```python title="tuner_example_pruner.py" hl_lines="1 23 24"
import optuna
from cleanrl_utils.tuner import Tuner
tuner = Tuner(
    script="cleanrl/ppo.py",
    metric="charts/episodic_return",
    metric_last_n_average_window=50,
    direction="maximize",
    aggregation_type="average",
    target_scores={
        "CartPole-v1": [0, 500],
        "Acrobot-v1": [-500, 0],
@@ -157,14 +170,15 @@ tuner.tune(
The `Tuner` can track all the experiments in [Weights and Biases](https://wandb.ai) to help you visualize the progress of the tuning.


```python title="tuner_example.py" hl_lines="24"
```python title="tuner_example.py" hl_lines="25"
import optuna
from cleanrl_utils.tuner import Tuner
tuner = Tuner(
    script="cleanrl/ppo.py",
    metric="charts/episodic_return",
    metric_last_n_average_window=50,
    direction="maximize",
    aggregation_type="average",
    target_scores={
        "CartPole-v1": [0, 500],
        "Acrobot-v1": [-500, 0],
Binary file added docs/advanced/optuna-dashboard.png
3 changes: 2 additions & 1 deletion tuner_example.py
@@ -22,7 +22,8 @@
"num-envs": 16,
},
pruner=optuna.pruners.MedianPruner(n_startup_trials=5),
wandb_kwargs={"project": "cleanrl"},
sampler=optuna.samplers.TPESampler(),
# wandb_kwargs={"project": "cleanrl"},
)
tuner.tune(
num_trials=10,