Hyperparameter optimization #228

Merged · 22 commits · Aug 26, 2022

Changes from 1 commit
66 changes: 49 additions & 17 deletions cleanrl_utils/tuner.py
@@ -2,7 +2,7 @@
import runpy
import sys
import time
from typing import Callable
from typing import Callable, Dict, List, Optional

import numpy as np
import optuna
@@ -26,22 +26,42 @@ def __init__(
        self,
        script: str,
        metric: str,
        target_scores: dict[str, list[int]],
        params_fn: Callable[[optuna.Trial], dict],
        target_scores: Dict[str, Optional[List[float]]],
        params_fn: Callable[[optuna.Trial], Dict],
        direction: str = "maximize",
        aggregation_type: str = "average",
        metric_last_n_average_window: int = 50,
        pruner: optuna.pruners.BasePruner = None,
        sampler: Optional[optuna.samplers.BaseSampler] = None,
        pruner: Optional[optuna.pruners.BasePruner] = None,
        storage: str = "sqlite:///cleanrl_hpopt.db",
        study_name: str = "",
        wandb_kwargs: dict[str, any] = {},
        wandb_kwargs: Dict[str, any] = {},
    ) -> None:
        self.script = script
        self.metric = metric
        self.target_scores = target_scores
        if len(self.target_scores) > 1:
            if None in self.target_scores.values():
                raise ValueError(
                    "If there are multiple environments, the target scores must be specified for each environment."
                )

        self.params_fn = params_fn
        self.direction = direction
        self.aggregation_type = aggregation_type
        if self.aggregation_type == "average":
            self.aggregation_fn = np.average
        elif self.aggregation_type == "median":
            self.aggregation_fn = np.median
        elif self.aggregation_type == "max":
            self.aggregation_fn = np.max
        elif self.aggregation_type == "min":
            self.aggregation_fn = np.min
        else:
            raise ValueError(f"Unknown aggregation type {self.aggregation_type}")
        self.metric_last_n_average_window = metric_last_n_average_window
        self.pruner = pruner
        self.sampler = sampler
        self.storage = storage
        self.study_name = study_name
        if len(self.study_name) == 0:
@@ -63,8 +83,9 @@ def objective(trial: optuna.Trial):
                )

            algo_command = [f"--{key}={value}" for key, value in params.items()]
            normalized_scores = []
            normalized_scoress = []
            for seed in range(num_seeds):
                normalized_scores = []
                for env_id in self.target_scores.keys():
                    sys.argv = algo_command + [f"--env-id={env_id}", f"--seed={seed}", "--track=False"]
                    with HiddenPrints():
@@ -81,28 +102,39 @@
                    print(
                        f"The average episodic return on {env_id} is {np.average(metric_values)} averaged over the last {self.metric_last_n_average_window} episodes."
                    )
                    normalized_scores += [
                        (np.average(metric_values) - self.target_scores[env_id][0])
                        / (self.target_scores[env_id][1] - self.target_scores[env_id][0])
                    ]
                    if self.target_scores[env_id] is not None:
                        normalized_scores += [
                            (np.average(metric_values) - self.target_scores[env_id][0])
                            / (self.target_scores[env_id][1] - self.target_scores[env_id][0])
                        ]
                    else:
                        normalized_scores += [np.average(metric_values)]
                    if run:
                        run.log({f"{env_id}_return": np.average(metric_values)})
                print(f"The normalized score is {np.average(normalized_scores)} with num_seeds={seed}")
                trial.report(np.average(normalized_scores), step=seed)

                normalized_scoress += [normalized_scores]
                aggregated_normalized_score = self.aggregation_fn(normalized_scores)
                print(f"The {self.aggregation_type} normalized score is {aggregated_normalized_score} with num_seeds={seed}")
                trial.report(aggregated_normalized_score, step=seed)
                if run:
                    run.log({"aggregated_normalized_score": aggregated_normalized_score})
                if trial.should_prune():
                    run.finish(quiet=True)
                    if run:
                        run.finish(quiet=True)
                    raise optuna.TrialPruned()
            if run:
                run.log({"normalized_scores": np.average(normalized_scores)})

                run.finish(quiet=True)
            return np.average(normalized_scores)
            if run:
                run.finish(quiet=True)
            return np.average(
                self.aggregation_fn(normalized_scoress, axis=1)
            )  # we always return the average of the aggregated normalized scores

        study = optuna.create_study(
            study_name=self.study_name,
            direction=self.direction,
            storage=self.storage,
            pruner=self.pruner,
            sampler=self.sampler,
        )
        print("==========================================================================================")
        print("run another tuner with the following command:")
34 changes: 24 additions & 10 deletions docs/advanced/hyperparameter-tuning.md
@@ -13,8 +13,9 @@ tuner = Tuner(
    metric="charts/episodic_return",
    metric_last_n_average_window=50,
    direction="maximize",
    aggregation_type="average",
    target_scores={
        "CartPole-v1": [0, 500],
        "CartPole-v1": None,
    },
    params_fn=lambda trial: {
        "learning-rate": trial.suggest_loguniform("learning-rate", 0.0003, 0.003),
@@ -36,16 +37,16 @@ tuner.tune(
Then you can run the tuner with

```bash
poetry install optuna
poetry install -E optuna
python tuner_example.py
```

Here is what happened:

1. The `tuner_example.py` launches `num_trials=100` *trials* to find the best single set of hyperparameters for `CartPole-v1` in `script="cleanrl/ppo.py"`.
1. Each *trial* uses a set of hyperparameters sampled from the `params_fn` to run `num_seeds=3` *experiments* with different random seeds, mitigating the impact of randomness on the results.
* In each *experiment*, `tuner_example.py` averages the last `metric_last_n_average_window=50` reported `metric="charts/episodic_return"` to a number $x_i$ and calculate a normalized score $z_i = (x_i - 0) / (500 - 0)$ according to `target_scores`, where $0$ and $500$ are the minimum and maximum values of the `metric` for `CartPole-v1`.
1. Each *trial* then averages the normalized scores $z_i$ of the three *experiments* to a number $z$ and the tuner optimizes $z$ according `direction="maximize"`.
1. Each *trial* samples a set of hyperparameters from the `params_fn` to run `num_seeds=3` *experiments* with different random seeds, mitigating the impact of randomness on the results.
    * In each *experiment*, `tuner_example.py` averages the last `metric_last_n_average_window=50` reported `metric="charts/episodic_return"` values to a number $x_i$ and calculates a normalized score $z_i$ according to `target_scores`. We will cover `target_scores`, which is useful when working with multiple environments, in the next section; here we simply set it to `None` for `CartPole-v1`, which means the normalized score is just $z_i = x_i$.
1. Each *trial* then averages the normalized scores $z_i$ of the `num_seeds=3` *experiments* to a number $z$, and the tuner optimizes $z$ according to `direction="maximize"` (see the sketch below).
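
To make the bookkeeping above concrete, here is a minimal, self-contained sketch of the per-trial logic for the single-environment `CartPole-v1` setup with `target_scores` set to `None`. It is not the actual `Tuner` implementation: `run_experiment`, its toy scoring formula, the single `learning-rate` search space, and the pruner choice are all hypothetical stand-ins for launching `cleanrl/ppo.py` and averaging the last 50 episodic returns.

```python
# A hedged sketch of one trial: sample hyperparameters, run a few seeds,
# report the running average to Optuna, and let the pruner stop bad trials early.
import numpy as np
import optuna


def run_experiment(learning_rate: float, seed: int) -> float:
    """Hypothetical stand-in for running `cleanrl/ppo.py` and averaging the
    last 50 episodic returns; replace with a real training run."""
    rng = np.random.default_rng(seed)
    return 500 * min(learning_rate / 0.003, 1.0) + rng.normal(scale=10.0)


def objective(trial: optuna.Trial) -> float:
    params = {"learning-rate": trial.suggest_float("learning-rate", 0.0003, 0.003, log=True)}
    normalized_scores = []
    for seed in range(3):  # num_seeds=3
        x_i = run_experiment(params["learning-rate"], seed)
        z_i = x_i  # target_scores["CartPole-v1"] is None, so z_i = x_i
        normalized_scores.append(z_i)
        trial.report(float(np.average(normalized_scores)), step=seed)
        if trial.should_prune():
            raise optuna.TrialPruned()
    return float(np.average(normalized_scores))  # the z that the tuner maximizes


study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner(n_startup_trials=5))
study.optimize(objective, n_trials=10)
print(study.best_params)
```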


## Visualization
@@ -59,18 +60,23 @@ poetry run optuna-dashboard sqlite:///cleanrl_hpopt.db
You can use a different database by passing `Tuner(..., storage="mysql://root@localhost/example")`, for example.




## Work w/ multiple environments

`Tuner` supports finding a set of hyper parameters of that works well against multiple environments by extending `target_scores`. In the following example, each trial uses a set of hyperparameters to run experiments with 3 random seeds for each environment in `["CartPole-v1","Acrobot-v1"]`, totalling `2*3=6` experiments per trial.
Often it is useful to find a single set of hyperparameters that works well with multiple environments. This is challenging **because each environment may have a different reward scale**, so the scores need to be normalized.

This is where `target_scores` comes in. We can use it to specify a lower and an upper reward threshold for each environment (they don't have to be exact boundaries and can be just ballpark estimates). For example, if we want to find a set of hyperparameters that works well with both `CartPole-v1` and `Acrobot-v1`, we can set the `target_scores` and `aggregation_type` as follows:


```python title="tuner_example_multi_env.py" hl_lines="8 9"
```python title="tuner_example_multi_env.py" hl_lines="7-11"
from cleanrl_utils.tuner import Tuner
tuner = Tuner(
    script="cleanrl/ppo.py",
    metric="charts/episodic_return",
    metric_last_n_average_window=50,
    direction="maximize",
    aggregation_type="average",
    target_scores={
        "CartPole-v1": [0, 500],
        "Acrobot-v1": [-500, 0],
@@ -92,17 +98,23 @@ tuner.tune(
)
```

Then each experiment will calculate the normalized scores $z_{i_0} = (x_{i_0} - 0) / (500 - 0)$ and $z_{i_1} = (x_{i_1} - (-500)) / (0 - (-500))$, and the tuner will aggregate the two scores according to `aggregation_type="average"` to get the normalized score $z_i = (z_{i_0} + z_{i_1}) / 2$.

Note that we are using 3 random seeds for each environment in `["CartPole-v1","Acrobot-v1"]`, totalling `2*3=6` experiments per trial.
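
As a quick sanity check of this arithmetic, here is a small sketch with made-up returns (400 for `CartPole-v1` and -100 for `Acrobot-v1`); the numbers are hypothetical, not results from an actual run:

```python
import numpy as np

target_scores = {"CartPole-v1": [0, 500], "Acrobot-v1": [-500, 0]}
raw_returns = {"CartPole-v1": 400.0, "Acrobot-v1": -100.0}  # hypothetical x_{i_0}, x_{i_1}

normalized_scores = [
    (raw_returns[env_id] - low) / (high - low)
    for env_id, (low, high) in target_scores.items()
]
# CartPole-v1: (400 - 0) / (500 - 0) = 0.8
# Acrobot-v1:  (-100 - (-500)) / (0 - (-500)) = 0.8
print(np.average(normalized_scores))  # aggregation_type="average" -> z_i = 0.8
```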



???+ info

When optimizing Atari games, you can use `target_scores` as the human normalized scores in (Mnih et al., 2015, Extended Data Table 2)[^1], such as
When optimizing Atari games, you can put the human normalized scores from (Mnih et al., 2015, Extended Data Table 2)[^1] in `target_scores`, as done in the following example. The first number for each environment is the score obtained by random play and the second is the score obtained by professional game testers.

```python
tuner = Tuner(
    script="cleanrl/ppo_atari.py",
    metric="charts/episodic_return",
    metric_last_n_average_window=50,
    direction="maximize",
    aggregation_type="average",
    target_scores={
        "Alien-v5": [227.8, 6875],
        "Amidar-v5": [5.8, 1676],
@@ -120,14 +132,15 @@

You can use `Tuner` with any [pruner](https://optuna.readthedocs.io/en/stable/reference/pruners.html) from `optuna` to prune less promising experiments or [samplers](https://optuna.readthedocs.io/en/stable/reference/samplers.html) to sample new hyperparameters. If you don't specify them explicitly, the script will use the [default ones](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.create_study.html).

```python title="tuner_example_pruner.py" hl_lines="1 22 23"
```python title="tuner_example_pruner.py" hl_lines="1 23 24"
import optuna
from cleanrl_utils.tuner import Tuner
tuner = Tuner(
    script="cleanrl/ppo.py",
    metric="charts/episodic_return",
    metric_last_n_average_window=50,
    direction="maximize",
    aggregation_type="average",
    target_scores={
        "CartPole-v1": [0, 500],
        "Acrobot-v1": [-500, 0],
@@ -157,14 +170,15 @@ tuner.tune(
The `Tuner` can track all the experiments in [Weights and Biases](https://wandb.ai) to help you visualize the progress of the tuning.


```python title="tuner_example.py" hl_lines="24"
```python title="tuner_example.py" hl_lines="25"
import optuna
from cleanrl_utils.tuner import Tuner
tuner = Tuner(
    script="cleanrl/ppo.py",
    metric="charts/episodic_return",
    metric_last_n_average_window=50,
    direction="maximize",
    aggregation_type="average",
    target_scores={
        "CartPole-v1": [0, 500],
        "Acrobot-v1": [-500, 0],
Binary file added docs/advanced/optuna-dashboard.png
3 changes: 2 additions & 1 deletion tuner_example.py
@@ -22,7 +22,8 @@
"num-envs": 16,
},
pruner=optuna.pruners.MedianPruner(n_startup_trials=5),
wandb_kwargs={"project": "cleanrl"},
sampler=optuna.samplers.TPESampler(),
# wandb_kwargs={"project": "cleanrl"},
)
tuner.tune(
num_trials=10,