Feature/a2c benchmarks (#266)
* feat: added benchmarks

* feat: update benchmarks
michele-milesi authored Apr 17, 2024
1 parent 31a8e9b commit e692606
Showing 15 changed files with 154 additions and 86 deletions.
25 changes: 25 additions & 0 deletions README.md
@@ -91,6 +91,7 @@ The training times of our implementations compared to the ones of Stable Baselines3
<th>SheepRL v0.4.0</th>
<th>SheepRL v0.4.9</th>
<th>SheepRL v0.5.2<br />(Numpy Buffers)</th>
<th>SheepRL v0.5.5<br />(Numpy Buffers)</th>
<th>StableBaselines3<sup>1</sup></th>
</tr>
</thead>
@@ -101,13 +102,32 @@ The training times of our implementations compared to the ones of Stable Baselines3
<td>192.31s &plusmn; 1.11</td>
<td>138.3s &plusmn; 0.16</td>
<td>80.81s &plusmn; 0.68</td>
<td>81.27s &plusmn; 0.47</td>
<td>77.21s &plusmn; 0.36</td>
</tr>
<tr>
<td><i>2 devices</i></td>
<td>85.42s &plusmn; 2.27</td>
<td>59.53s &plusmn; 0.78</td>
<td>46.09s &plusmn; 0.59</td>
<td>36.88s &plusmn; 0.30</td>
<td>N.D.</td>
</tr>
<tr>
<td rowspan="2"><b>A2C</b></td>
<td><i>1 device</i></td>
<td>N.D.</td>
<td>N.D.</td>
<td>N.D.</td>
<td>84.76s &plusmn; 0.37</td>
<td>84.22s &plusmn; 0.99</td>
</tr>
<tr>
<td><i>2 devices</i></td>
<td>N.D.</td>
<td>N.D.</td>
<td>N.D.</td>
<td>28.95s &plusmn; 0.75</td>
<td>N.D.</td>
</tr>
<tr>
@@ -116,13 +136,15 @@ The training times of our implementations compared to the ones of Stable Baselines3
<td>421.37s &plusmn; 5.27</td>
<td>363.74s &plusmn; 3.44</td>
<td>318.06s &plusmn; 4.46</td>
<td>320.21s &plusmn; 6.29</td>
<td>336.06s &plusmn; 12.26</td>
</tr>
<tr>
<td><i>2 devices</i></td>
<td>264.29s &plusmn; 1.81</td>
<td>238.88s &plusmn; 4.97</td>
<td>210.07s &plusmn; 27</td>
<td>225.95s &plusmn; 3.65</td>
<td>N.D.</td>
</tr>
<tr>
@@ -131,6 +153,7 @@ The training times of our implementations compared to the ones of Stable Baselines3
<td>4201.23s</td>
<td>N.D.</td>
<td>2921.38s</td>
<td>2207.13s</td>
<td>N.D.</td>
</tr>
<tr>
@@ -139,6 +162,7 @@ The training times of our implementations compared to the ones of Stable Baselines3
<td>1874.62s</td>
<td>N.D.</td>
<td>1148.1s</td>
<td>906.42s</td>
<td>N.D.</td>
</tr>
<tr>
@@ -147,6 +171,7 @@ The training times of our implementations compared to the ones of Stable Baselines3
<td>2022.99s</td>
<td>N.D.</td>
<td>1378.01s</td>
<td>1589.30s</td>
<td>N.D.</td>
</tr>
</tbody>
11 changes: 11 additions & 0 deletions benchmarks/benchmark.py
@@ -17,6 +17,17 @@
# "algo.per_rank_batch_size=128"
]

# A2C Arguments
# args = [
# os.path.join(ROOT_DIR, "__main__.py"),
# "exp=a2c_benchmarks",
# # Decomment below to run with 2 devices
# # "fabric.devices=2",
# # "env.num_envs=2",
# # "algo.per_rank_batch_size=10",
# # "algo.rollout_steps=20",
# ]
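
For context, a minimal sketch of how an argument list like the one above could be turned into a timed run. This is not the commit's code: the entry-point path, the use of `subprocess`, and the timing logic are illustrative assumptions only.

```python
# Hypothetical illustration only: run the sheeprl entry point with the A2C
# benchmark overrides and time the whole process. The real benchmark.py may
# launch and time runs differently.
import os
import subprocess
import sys
import time

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))  # assumption: sheeprl package root

args = [
    os.path.join(ROOT_DIR, "__main__.py"),
    "exp=a2c_benchmarks",
    # "fabric.devices=2",
    # "env.num_envs=2",
    # "algo.per_rank_batch_size=10",
    # "algo.rollout_steps=20",
]

start = time.perf_counter()
subprocess.run([sys.executable, *args], check=True)
print(f"run_time: {time.perf_counter() - start:.2f}s")
```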

# SAC Arguments
# args = [
# os.path.join(ROOT_DIR, "__main__.py"),
16 changes: 14 additions & 2 deletions benchmarks/benchmark_sb3.py
@@ -1,6 +1,6 @@
import gymnasium as gym
import stable_baselines3 as sb3
from stable_baselines3 import PPO, SAC # noqa: F401
from stable_baselines3 import A2C, PPO, SAC # noqa: F401
from torchmetrics import SumMetric

from sheeprl.utils.timer import timer
@@ -15,6 +15,18 @@
print(sb3.common.evaluation.evaluate_policy(model.policy, env))


# Stable Baselines3 - A2C - CartPole-v1
# Decomment below to run A2C benchmarks

# if __name__ == "__main__":
# with timer("run_time", SumMetric, sync_on_compute=False):
# env = gym.make("CartPole-v1", render_mode="rgb_array")
# model = A2C("MlpPolicy", env, verbose=0, device="cpu", vf_coef=1.0)
# model.learn(total_timesteps=1024 * 64, log_interval=None)
# print(timer.compute())
# print(sb3.common.evaluation.evaluate_policy(model.policy, env))


# Stable Baselines3 SAC - LunarLanderContinuous-v2
# Decomment below to run SAC benchmarks

@@ -23,7 +35,7 @@
# env = sb3.common.vec_env.DummyVecEnv(
# [lambda: gym.make("LunarLanderContinuous-v2", render_mode="rgb_array") for _ in range(4)]
# )
# model = SAC("MlpPolicy", env, verbose=0, device="cpu", ent_coef=1.0)
# model = SAC("MlpPolicy", env, verbose=0, device="cpu")
# model.learn(total_timesteps=1024 * 64, log_interval=None)
# print(timer.compute())
# print(sb3.common.evaluation.evaluate_policy(model.policy, env.envs[0]))
52 changes: 3 additions & 49 deletions examples/ratio.py
@@ -1,55 +1,9 @@
import warnings
from typing import Any, Dict, Mapping


class Ratio:
    """Directly taken from Hafner et al. (2023) implementation:
    https://github.com/danijar/dreamerv3/blob/8fa35f83eee1ce7e10f3dee0b766587d0a713a60/dreamerv3/embodied/core/when.py#L26
    """

    def __init__(self, ratio: float, pretrain_steps: int = 0):
        if pretrain_steps < 0:
            raise ValueError(f"'pretrain_steps' must be non-negative, got {pretrain_steps}")
        if ratio < 0:
            raise ValueError(f"'ratio' must be non-negative, got {ratio}")
        self._pretrain_steps = pretrain_steps
        self._ratio = ratio
        self._prev = None

    def __call__(self, step: int) -> int:
        if self._ratio == 0:
            return 0
        if self._prev is None:
            self._prev = step
            repeats = 1
            if self._pretrain_steps > 0:
                if step < self._pretrain_steps:
                    warnings.warn(
                        "The number of pretrain steps is greater than the number of current steps. This could lead to "
                        f"a higher ratio than the one specified ({self._ratio}). Setting the 'pretrain_steps' equal to "
                        "the number of current steps."
                    )
                    self._pretrain_steps = step
                repeats = round(self._pretrain_steps * self._ratio)
            return repeats
        repeats = round((step - self._prev) * self._ratio)
        self._prev += repeats / self._ratio
        return repeats

    def state_dict(self) -> Dict[str, Any]:
        return {"_ratio": self._ratio, "_prev": self._prev, "_pretrain_steps": self._pretrain_steps}

    def load_state_dict(self, state_dict: Mapping[str, Any]):
        self._ratio = state_dict["_ratio"]
        self._prev = state_dict["_prev"]
        self._pretrain_steps = state_dict["_pretrain_steps"]
        return self

from sheeprl.utils.utils import Ratio

if __name__ == "__main__":
    num_envs = 1
    world_size = 1
    replay_ratio = 0.5
    replay_ratio = 0.0625
    per_rank_batch_size = 16
    per_rank_sequence_length = 64
    replayed_steps = world_size * per_rank_batch_size * per_rank_sequence_length
@@ -62,7 +16,7 @@ def load_state_dict(self, state_dict: Mapping[str, Any]):
    for i in range(0, total_policy_steps, policy_steps):
        if i >= 128:
            per_rank_repeats = r(i / world_size)
            if per_rank_repeats > 0 and not printed:
            if per_rank_repeats > 0:  # and not printed:
                print(
                    f"Training the agent with {per_rank_repeats} repeats on every rank "
                    f"({per_rank_repeats * world_size} global repeats) at global iteration {i}"
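
For reference, a small usage sketch of the `Ratio` helper that `examples/ratio.py` now imports from `sheeprl.utils.utils`; the expected values in the comments follow the implementation shown in the removed block above.

```python
from sheeprl.utils.utils import Ratio  # location used by the updated example above

# With replay_ratio = 0.0625, one gradient repeat is granted about every 16 policy steps.
r = Ratio(ratio=0.0625)
print(r(0))   # 1 -> the first call anchors the internal counter and grants a single repeat
print(r(16))  # 1 -> round((16 - 0) * 0.0625)
print(r(32))  # 1 -> another 16 steps elapsed since the last granted repeat
print(r(40))  # 0 -> only 8 steps elapsed, not enough for a new repeat yet
```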
4 changes: 2 additions & 2 deletions sheeprl/algos/a2c/a2c.py
@@ -358,7 +358,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):
        if (
            (cfg.checkpoint.every > 0 and policy_step - last_checkpoint >= cfg.checkpoint.every)
            or cfg.dry_run
            or update == num_updates
            or (update == num_updates and cfg.checkpoint.save_last)
        ):
            last_checkpoint = policy_step
            state = {
@@ -370,7 +370,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):
            fabric.call("on_checkpoint_coupled", fabric=fabric, ckpt_path=ckpt_path, state=state)

    envs.close()
    if fabric.is_global_zero:
    if fabric.is_global_zero and cfg.algo.run_test:
        test(player, fabric, cfg, log_dir)

    if not cfg.model_manager.disabled and fabric.is_global_zero:
30 changes: 17 additions & 13 deletions sheeprl/algos/a2c/agent.py
@@ -30,7 +30,7 @@ def __init__(
        super().__init__()
        self.keys = keys
        self.input_dim = input_dim
        self.output_dim = features_dim
        self.output_dim = features_dim if features_dim else dense_units
        self.model = MLP(
            input_dim,
            features_dim,
@@ -96,18 +96,22 @@ def __init__(
        )

        # Actor
        actor_backbone = MLP(
            input_dims=features_dim,
            output_dim=None,
            hidden_sizes=[actor_cfg.dense_units] * actor_cfg.mlp_layers,
            activation=hydra.utils.get_class(actor_cfg.dense_act),
            flatten_dim=None,
            norm_layer=[nn.LayerNorm] * actor_cfg.mlp_layers if actor_cfg.layer_norm else None,
            norm_args=(
                [{"normalized_shape": actor_cfg.dense_units} for _ in range(actor_cfg.mlp_layers)]
                if actor_cfg.layer_norm
                else None
            ),
        actor_backbone = (
            MLP(
                input_dims=features_dim,
                output_dim=None,
                hidden_sizes=[actor_cfg.dense_units] * actor_cfg.mlp_layers,
                activation=hydra.utils.get_class(actor_cfg.dense_act),
                flatten_dim=None,
                norm_layer=[nn.LayerNorm] * actor_cfg.mlp_layers if actor_cfg.layer_norm else None,
                norm_args=(
                    [{"normalized_shape": actor_cfg.dense_units} for _ in range(actor_cfg.mlp_layers)]
                    if actor_cfg.layer_norm
                    else None
                ),
            )
            if actor_cfg.mlp_layers > 0
            else nn.Identity()
        )
        if is_continuous:
            # Output is a tuple of two elements: mean and log_std, one for every action
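
A minimal sketch of the pattern introduced above: when `actor_cfg.mlp_layers` is 0 (as in the new A2C benchmark config), the backbone collapses to `nn.Identity()` and the policy head consumes the encoder features directly. The layer sizes below are made up for illustration.

```python
import torch
from torch import nn

# Illustrative only: mirror the "MLP if layers > 0 else Identity" choice made in the agent.
mlp_layers = 0       # e.g. algo.actor.mlp_layers in a2c_benchmarks.yaml
features_dim = 64    # hypothetical encoder output size

backbone = (
    nn.Sequential(nn.Linear(features_dim, features_dim), nn.Tanh())
    if mlp_layers > 0
    else nn.Identity()
)

x = torch.randn(8, features_dim)
print(backbone(x).shape)  # torch.Size([8, 64]) -> features pass through unchanged
```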
2 changes: 1 addition & 1 deletion sheeprl/algos/sac/sac.py
@@ -297,7 +297,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):

        # Train the agent
        if update >= learning_starts:
            per_rank_gradient_steps = ratio(policy_step / world_size)
            per_rank_gradient_steps = ratio(policy_step / world_size) if not cfg.run_benchmarks else 1
            if per_rank_gradient_steps > 0:
                # We sample one time to reduce the communications between processes
                sample = rb.sample_tensors(
59 changes: 59 additions & 0 deletions sheeprl/configs/exp/a2c_benchmarks.yaml
@@ -0,0 +1,59 @@
# @package _global_

defaults:
  - override /algo: a2c
  - override /env: gym
  - _self_

# Environment
env:
  capture_video: False
  num_envs: 1
  sync_env: True

# Algorithm
algo:
  name: a2c
  rollout_steps: 5
  loss_reduction: mean
  normalize_advantages: False
  max_grad_norm: 0.5
  encoder:
    mlp_layers: 2
    mlp_features_dim: null
  actor:
    mlp_layers: 0
  critic:
    mlp_layers: 0
  optimizer:
    lr: 7e-4
    eps: 1e-5
    alpha: 0.99
  per_rank_batch_size: 5
  # If you want to run this benchmark with older versions,
  # you need to comment the test function in the `./sheeprl/algos/ppo/ppo.py` file.
  run_test: False
  # If you want to run this benchmark with older versions,
  # you need to move the `total_steps` and the `mlp_keys` config from `algo` to the root.
  total_steps: 65536
  mlp_keys:
    encoder: [state]

# Buffer
buffer:
  share_data: False
  size: ${algo.rollout_steps}
  memmap: False

fabric:
  devices: 1
  accelerator: cpu

checkpoint:
  every: 70000
  save_last: False

metric:
  log_every: 70000
  log_level: 0
  disable_timer: True
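
As an optional sanity check (not part of this commit), the composed experiment can be inspected with Hydra before launching a run. The config module path and primary config name below are assumptions about the repository layout.

```python
# Hypothetical snippet: compose the a2c_benchmarks experiment with Hydra and
# print a few resolved values. Assumes the primary config is `config.yaml`
# inside the `sheeprl.configs` package; adjust if the layout differs.
from hydra import compose, initialize_config_module

with initialize_config_module(config_module="sheeprl.configs", version_base="1.3"):
    cfg = compose(config_name="config", overrides=["exp=a2c_benchmarks"])

print(cfg.algo.name, cfg.algo.rollout_steps, cfg.algo.optimizer.lr)  # e.g. a2c 5 0.0007
print(cfg.buffer.size, cfg.fabric.devices, cfg.fabric.accelerator)   # e.g. 5 1 cpu
```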
1 change: 1 addition & 0 deletions sheeprl/configs/exp/default.yaml
@@ -0,0 +1 @@
run_benchmarks: False
4 changes: 1 addition & 3 deletions sheeprl/configs/exp/dreamer_v1_benchmarks.yaml
@@ -5,9 +5,6 @@ defaults:
  - override /env: atari
  - _self_

# Experiment
seed: 5

# Environment
env:
  num_envs: 1
@@ -26,6 +23,7 @@ buffer:
# Algorithm
algo:
  learning_starts: 1024
  replay_ratio: 0.0625

  dense_units: 8
  mlp_layers: 1
8 changes: 3 additions & 5 deletions sheeprl/configs/exp/dreamer_v2_benchmarks.yaml
@@ -5,9 +5,6 @@ defaults:
  - override /env: atari
  - _self_

# Experiment
seed: 5

# Environment
env:
  num_envs: 1
@@ -26,10 +23,11 @@ buffer:
# Algorithm
algo:
  learning_starts: 1024
  per_rank_pretrain_steps: 1
  per_rank_pretrain_steps: 0
  replay_ratio: 0.0625

  dense_units: 8
  mlp_layers:
  mlp_layers: 1
  world_model:
    discrete_size: 4
    stochastic_size: 4
5 changes: 1 addition & 4 deletions sheeprl/configs/exp/dreamer_v3_benchmarks.yaml
@@ -5,9 +5,6 @@ defaults:
  - override /env: atari
  - _self_

# Experiment
seed: 5

# Environment
env:
  num_envs: 1
@@ -26,7 +23,7 @@ buffer:
# Algorithm
algo:
  learning_starts: 1024
  replay_ratio: 1
  replay_ratio: 0.0625
  dense_units: 8
  mlp_layers: 1
  world_model: