From bb584f12d8d53a82adef1343df7fe992e053579f Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 14 Sep 2023 09:19:05 +0200 Subject: [PATCH 01/17] add gaussian noise option --- examples/ddpg/config.yaml | 26 +++++----- examples/ddpg/ddpg.py | 106 ++++++++++++++++++++++---------------- examples/ddpg/utils.py | 43 +++++++++++----- 3 files changed, 105 insertions(+), 70 deletions(-) diff --git a/examples/ddpg/config.yaml b/examples/ddpg/config.yaml index 464632f8bf3..db360291fa1 100644 --- a/examples/ddpg/config.yaml +++ b/examples/ddpg/config.yaml @@ -1,17 +1,17 @@ -# Environment +# environment and task env: name: HalfCheetah-v3 task: "" - exp_name: "HalfCheetah-DDPG" + exp_name: "HalfCheetah-DDPG-ICLR" library: gym frame_skip: 1 seed: 1 -# Collection +# collector collector: - total_frames: 1000000 - init_random_frames: 10000 - frames_per_batch: 1000 + total_frames: 3_000_000 + init_random_frames: 25_000 + frames_per_batch: 1 max_frames_per_traj: 1000 init_env_steps: 1000 async_collection: 1 @@ -19,27 +19,29 @@ collector: env_per_collector: 1 num_workers: 1 -# Replay Buffer +# replay buffer replay_buffer: size: 1000000 prb: 0 # use prioritized experience replay -# Optimization -optimization: +# optimization +optim: utd_ratio: 1.0 gamma: 0.99 - loss_function: smooth_l1 + loss_function: l2 lr: 3e-4 - weight_decay: 2e-4 + weight_decay: 0.0 batch_size: 256 target_update_polyak: 0.995 +# network network: hidden_sizes: [256, 256] activation: relu device: "cuda:0" + noise_type: "gaussian" # ou or gaussian -# Logging +# logging logger: backend: wandb mode: online diff --git a/examples/ddpg/ddpg.py b/examples/ddpg/ddpg.py index b77494bc52f..7e35f03ac9c 100644 --- a/examples/ddpg/ddpg.py +++ b/examples/ddpg/ddpg.py @@ -11,6 +11,8 @@ The helper functions are coded in the utils.py associated with this script. 
""" +import time + import hydra import numpy as np @@ -33,6 +35,7 @@ def main(cfg: "DictConfig"): # noqa: F821 device = torch.device(cfg.network.device) + # Create Logger exp_name = generate_exp_name("DDPG", cfg.env.exp_name) logger = None if cfg.logger.backend: @@ -43,6 +46,7 @@ def main(cfg: "DictConfig"): # noqa: F821 wandb_kwargs={"mode": cfg.logger.mode, "config": cfg}, ) + # Set seeds torch.manual_seed(cfg.env.seed) np.random.seed(cfg.env.seed) @@ -55,56 +59,55 @@ def main(cfg: "DictConfig"): # noqa: F821 # Create Loss Module and Target Updater loss_module, target_net_updater = make_loss_module(cfg, model) - # Make Off-Policy Collector + # Create Off-Policy Collector collector = make_collector(cfg, train_env, exploration_policy) - # Make Replay Buffer + # Create Replay Buffer replay_buffer = make_replay_buffer( - batch_size=cfg.optimization.batch_size, + batch_size=cfg.optim.batch_size, prb=cfg.replay_buffer.prb, buffer_size=cfg.replay_buffer.size, device=device, ) - # Make Optimizers + # Create Optimizers optimizer_actor, optimizer_critic = make_optimizer(cfg, loss_module) - rewards = [] - rewards_eval = [] - # Main loop + start_time = time.time() collected_frames = 0 pbar = tqdm.tqdm(total=cfg.collector.total_frames) - r0 = None - q_loss = None init_random_frames = cfg.collector.init_random_frames num_updates = int( cfg.collector.env_per_collector * cfg.collector.frames_per_batch - * cfg.optimization.utd_ratio + * cfg.optim.utd_ratio ) prb = cfg.replay_buffer.prb - env_per_collector = cfg.collector.env_per_collector frames_per_batch, frame_skip = cfg.collector.frames_per_batch, cfg.env.frame_skip eval_iter = cfg.logger.eval_iter eval_rollout_steps = cfg.collector.max_frames_per_traj // frame_skip - for i, tensordict in enumerate(collector): + sampling_start = time.time() + for _, tensordict in enumerate(collector): + sampling_time = time.time() - sampling_start + # update exploration policy exploration_policy.step(tensordict.numel()) + # update weights of the inference policy collector.update_policy_weights_() - if r0 is None: - r0 = tensordict["next", "reward"].sum(-1).mean().item() pbar.update(tensordict.numel()) tensordict = tensordict.reshape(-1) current_frames = tensordict.numel() + # add to replay buffer replay_buffer.extend(tensordict.cpu()) collected_frames += current_frames # optimization steps + training_start = time.time() if collected_frames >= init_random_frames: ( actor_losses, @@ -114,19 +117,23 @@ def main(cfg: "DictConfig"): # noqa: F821 # sample from replay buffer sampled_tensordict = replay_buffer.sample().clone() + # compute loss loss_td = loss_module(sampled_tensordict) - optimizer_critic.zero_grad() - optimizer_actor.zero_grad() - actor_loss = loss_td["loss_actor"] q_loss = loss_td["loss_value"] - (actor_loss + q_loss).backward() + # update critic + optimizer_critic.zero_grad() + q_loss.backward() optimizer_critic.step() - q_losses.append(q_loss.item()) + # update actor + optimizer_actor.zero_grad() + actor_loss.backward() optimizer_actor.step() + + q_losses.append(q_loss.item()) actor_losses.append(actor_loss.item()) # update qnet_target params @@ -136,44 +143,53 @@ def main(cfg: "DictConfig"): # noqa: F821 if prb: replay_buffer.update_priority(sampled_tensordict) - rewards.append( - (i, tensordict["next", "reward"].sum().item() / env_per_collector) - ) - train_log = { - "train_reward": rewards[-1][1], - "collected_frames": collected_frames, - } - if q_loss is not None: - train_log.update( - { - "actor_loss": np.mean(actor_losses), - "q_loss": 
np.mean(q_losses), - } + training_time = time.time() - training_start + episode_rewards = tensordict["next", "episode_reward"][ + tensordict["next", "done"] + ] + + # logging + if len(episode_rewards) > 0: + episode_length = tensordict["next", "step_count"][ + tensordict["next", "done"] + ] + logger.log_scalar( + "train/reward", episode_rewards.mean().item(), collected_frames ) - if logger is not None: - for key, value in train_log.items(): - logger.log_scalar(key, value, step=collected_frames) + logger.log_scalar( + "train/episode_length", + episode_length.sum().item() / len(episode_length), + collected_frames, + ) + if collected_frames >= init_random_frames: + logger.log_scalar("train/q_loss", np.mean(q_losses), step=collected_frames) + logger.log_scalar( + "train/a_loss", np.mean(actor_losses), step=collected_frames + ) + logger.log_scalar("train/sampling_time", sampling_time, collected_frames) + logger.log_scalar("train/training_time", training_time, collected_frames) + + # evaluation if abs(collected_frames % eval_iter) < frames_per_batch * frame_skip: with set_exploration_type(ExplorationType.MODE), torch.no_grad(): + eval_start = time.time() eval_rollout = eval_env.rollout( eval_rollout_steps, exploration_policy, auto_cast_to_device=True, break_when_any_done=True, ) + eval_time = time.time() - eval_start eval_reward = eval_rollout["next", "reward"].sum(-2).mean().item() - rewards_eval.append((i, eval_reward)) - eval_str = f"eval cumulative reward: {rewards_eval[-1][1]: 4.4f} (init: {rewards_eval[0][1]: 4.4f})" - if logger is not None: - logger.log_scalar( - "evaluation_reward", rewards_eval[-1][1], step=collected_frames - ) - if len(rewards_eval): - pbar.set_description( - f"reward: {rewards[-1][1]: 4.4f} (r0 = {r0: 4.4f})," + eval_str - ) + logger.log_scalar("eval/reward", eval_reward, step=collected_frames) + logger.log_scalar("eval/time", eval_time, step=collected_frames) + + sampling_start = time.time() collector.shutdown() + end_time = time.time() + execution_time = end_time - start_time + print(f"Training took {execution_time:.2f} seconds to finish") if __name__ == "__main__": diff --git a/examples/ddpg/utils.py b/examples/ddpg/utils.py index ab4083fff28..18f2c85efae 100644 --- a/examples/ddpg/utils.py +++ b/examples/ddpg/utils.py @@ -10,12 +10,14 @@ EnvCreator, InitTracker, ParallelEnv, + RewardSum, TransformedEnv, ) from torchrl.envs.libs.gym import GymEnv from torchrl.envs.transforms import RewardScaling from torchrl.envs.utils import ExplorationType, set_exploration_type from torchrl.modules import ( + AdditiveGaussianWrapper, MLP, OrnsteinUhlenbeckProcessWrapper, SafeModule, @@ -43,7 +45,8 @@ def apply_env_transforms(env, reward_scaling=1.0): Compose( InitTracker(), RewardScaling(loc=0.0, scale=reward_scaling), - DoubleToFloat(), + DoubleToFloat("observation"), + RewardSum(), ), ) return transformed_env @@ -80,6 +83,7 @@ def make_collector(cfg, train_env, actor_model_explore): train_env, actor_model_explore, frames_per_batch=cfg.collector.frames_per_batch, + init_random_frames=cfg.collector.init_random_frames, max_frames_per_traj=cfg.collector.max_frames_per_traj, total_frames=cfg.collector.total_frames, device=cfg.collector.collector_device, @@ -199,10 +203,23 @@ def make_ddpg_agent(cfg, train_env, eval_env, device): eval_env.close() # Exploration wrappers: - actor_model_explore = OrnsteinUhlenbeckProcessWrapper( - model[0], - annealing_num_steps=1_000_000, - ).to(device) + if cfg.network.noise_type == "ou": + actor_model_explore = OrnsteinUhlenbeckProcessWrapper( + 
model[0], + annealing_num_steps=1_000_000, + ).to(device) + elif cfg.network.noise_type == "gaussian": + actor_model_explore = AdditiveGaussianWrapper( + model[0], + sigma_end=1.0, + sigma_init=1.0, + mean=0.0, + std=0.1, + scale=cfg.network.noise_scale, + ).to(device) + else: + raise NotImplementedError + return model, actor_model_explore @@ -217,14 +234,14 @@ def make_loss_module(cfg, model): loss_module = DDPGLoss( actor_network=model[0], value_network=model[1], - loss_function=cfg.optimization.loss_function, + loss_function=cfg.optim.loss_function, + delay_actor=True, + delay_value=True, ) - loss_module.make_value_estimator(gamma=cfg.optimization.gamma) + loss_module.make_value_estimator(gamma=cfg.optim.gamma) # Define Target Network Updater - target_net_updater = SoftUpdate( - loss_module, eps=cfg.optimization.target_update_polyak - ) + target_net_updater = SoftUpdate(loss_module, eps=cfg.optim.target_update_polyak) return loss_module, target_net_updater @@ -233,11 +250,11 @@ def make_optimizer(cfg, loss_module): actor_params = list(loss_module.actor_network_params.flatten_keys().values()) optimizer_actor = optim.Adam( - actor_params, lr=cfg.optimization.lr, weight_decay=cfg.optimization.weight_decay + actor_params, lr=cfg.optim.lr, weight_decay=cfg.optim.weight_decay ) optimizer_critic = optim.Adam( critic_params, - lr=cfg.optimization.lr, - weight_decay=cfg.optimization.weight_decay, + lr=cfg.optim.lr, + weight_decay=cfg.optim.weight_decay, ) return optimizer_actor, optimizer_critic From 892a0fd3be32311497b0c9fee2a3398a647ea3b4 Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 14 Sep 2023 09:21:57 +0200 Subject: [PATCH 02/17] fix --- examples/ddpg/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/ddpg/utils.py b/examples/ddpg/utils.py index 18f2c85efae..fabad2de049 100644 --- a/examples/ddpg/utils.py +++ b/examples/ddpg/utils.py @@ -215,7 +215,6 @@ def make_ddpg_agent(cfg, train_env, eval_env, device): sigma_init=1.0, mean=0.0, std=0.1, - scale=cfg.network.noise_scale, ).to(device) else: raise NotImplementedError From 81c2d99aa9fb8042a3af1d02cce11e8e6f8fa99c Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 14 Sep 2023 15:32:26 +0200 Subject: [PATCH 03/17] fix naming --- examples/ddpg/ddpg.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/ddpg/ddpg.py b/examples/ddpg/ddpg.py index 7e35f03ac9c..5680819812c 100644 --- a/examples/ddpg/ddpg.py +++ b/examples/ddpg/ddpg.py @@ -35,7 +35,7 @@ def main(cfg: "DictConfig"): # noqa: F821 device = torch.device(cfg.network.device) - # Create Logger + # Create logger exp_name = generate_exp_name("DDPG", cfg.env.exp_name) logger = None if cfg.logger.backend: @@ -50,19 +50,19 @@ def main(cfg: "DictConfig"): # noqa: F821 torch.manual_seed(cfg.env.seed) np.random.seed(cfg.env.seed) - # Create Environments + # Create environments train_env, eval_env = make_environment(cfg) - # Create Agent + # Create agent model, exploration_policy = make_ddpg_agent(cfg, train_env, eval_env, device) - # Create Loss Module and Target Updater + # Create DDPG loss loss_module, target_net_updater = make_loss_module(cfg, model) - # Create Off-Policy Collector + # Create off-policy collector collector = make_collector(cfg, train_env, exploration_policy) - # Create Replay Buffer + # Create replay buffer replay_buffer = make_replay_buffer( batch_size=cfg.optim.batch_size, prb=cfg.replay_buffer.prb, @@ -70,7 +70,7 @@ def main(cfg: "DictConfig"): # noqa: F821 device=device, ) - # Create Optimizers 
+ # Create optimizers optimizer_actor, optimizer_critic = make_optimizer(cfg, loss_module) # Main loop @@ -92,21 +92,21 @@ def main(cfg: "DictConfig"): # noqa: F821 sampling_start = time.time() for _, tensordict in enumerate(collector): sampling_time = time.time() - sampling_start - # update exploration policy + # Update exploration policy exploration_policy.step(tensordict.numel()) - # update weights of the inference policy + # Update weights of the inference policy collector.update_policy_weights_() pbar.update(tensordict.numel()) tensordict = tensordict.reshape(-1) current_frames = tensordict.numel() - # add to replay buffer + # Add to replay buffer replay_buffer.extend(tensordict.cpu()) collected_frames += current_frames - # optimization steps + # Optimization steps training_start = time.time() if collected_frames >= init_random_frames: ( @@ -114,21 +114,21 @@ def main(cfg: "DictConfig"): # noqa: F821 q_losses, ) = ([], []) for _ in range(num_updates): - # sample from replay buffer + # Sample from replay buffer sampled_tensordict = replay_buffer.sample().clone() - # compute loss + # Compute loss loss_td = loss_module(sampled_tensordict) actor_loss = loss_td["loss_actor"] q_loss = loss_td["loss_value"] - # update critic + # Update critic optimizer_critic.zero_grad() q_loss.backward() optimizer_critic.step() - # update actor + # Update actor optimizer_actor.zero_grad() actor_loss.backward() optimizer_actor.step() @@ -136,10 +136,10 @@ def main(cfg: "DictConfig"): # noqa: F821 q_losses.append(q_loss.item()) actor_losses.append(actor_loss.item()) - # update qnet_target params + # Update qnet_target params target_net_updater.step() - # update priority + # Update priority if prb: replay_buffer.update_priority(sampled_tensordict) @@ -148,7 +148,7 @@ def main(cfg: "DictConfig"): # noqa: F821 tensordict["next", "done"] ] - # logging + # Logging if len(episode_rewards) > 0: episode_length = tensordict["next", "step_count"][ tensordict["next", "done"] @@ -169,7 +169,7 @@ def main(cfg: "DictConfig"): # noqa: F821 logger.log_scalar("train/sampling_time", sampling_time, collected_frames) logger.log_scalar("train/training_time", training_time, collected_frames) - # evaluation + # Evaluation if abs(collected_frames % eval_iter) < frames_per_batch * frame_skip: with set_exploration_type(ExplorationType.MODE), torch.no_grad(): eval_start = time.time() From 998f034259497f77d4ce8506281fe694099e91b1 Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 14 Sep 2023 15:33:24 +0200 Subject: [PATCH 04/17] update config --- examples/ddpg/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ddpg/config.yaml b/examples/ddpg/config.yaml index db360291fa1..30b219122d9 100644 --- a/examples/ddpg/config.yaml +++ b/examples/ddpg/config.yaml @@ -39,7 +39,7 @@ network: hidden_sizes: [256, 256] activation: relu device: "cuda:0" - noise_type: "gaussian" # ou or gaussian + noise_type: "ou" # ou or gaussian # logging logger: From 771137be21332f0cd1fdac251c33edeb84d44a4a Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 15 Sep 2023 08:59:32 +0200 Subject: [PATCH 05/17] update logging --- examples/ddpg/config.yaml | 4 ++-- examples/ddpg/ddpg.py | 29 ++++++++++++++--------------- examples/ddpg/utils.py | 32 +++++++++++++++++++++----------- 3 files changed, 37 insertions(+), 28 deletions(-) diff --git a/examples/ddpg/config.yaml b/examples/ddpg/config.yaml index 30b219122d9..f30dc39182c 100644 --- a/examples/ddpg/config.yaml +++ b/examples/ddpg/config.yaml @@ -2,7 +2,7 @@ env: name: HalfCheetah-v3 
task: "" - exp_name: "HalfCheetah-DDPG-ICLR" + exp_name: "HalfCheetah-DDPG" library: gym frame_skip: 1 seed: 1 @@ -11,7 +11,7 @@ env: collector: total_frames: 3_000_000 init_random_frames: 25_000 - frames_per_batch: 1 + frames_per_batch: 1000 max_frames_per_traj: 1000 init_env_steps: 1000 async_collection: 1 diff --git a/examples/ddpg/ddpg.py b/examples/ddpg/ddpg.py index 5680819812c..1a41dc9adb5 100644 --- a/examples/ddpg/ddpg.py +++ b/examples/ddpg/ddpg.py @@ -19,9 +19,11 @@ import torch import torch.cuda import tqdm + from torchrl.envs.utils import ExplorationType, set_exploration_type from torchrl.record.loggers import generate_exp_name, get_logger from utils import ( + log_metrics, make_collector, make_ddpg_agent, make_environment, @@ -149,25 +151,21 @@ def main(cfg: "DictConfig"): # noqa: F821 ] # Logging + metrics_to_log = {} if len(episode_rewards) > 0: episode_length = tensordict["next", "step_count"][ tensordict["next", "done"] ] - logger.log_scalar( - "train/reward", episode_rewards.mean().item(), collected_frames - ) - logger.log_scalar( - "train/episode_length", - episode_length.sum().item() / len(episode_length), - collected_frames, + metrics_to_log["train/reward"] = episode_rewards.mean().item() + metrics_to_log["train/episode_length"] = episode_length.sum().item() / len( + episode_length ) + if collected_frames >= init_random_frames: - logger.log_scalar("train/q_loss", np.mean(q_losses), step=collected_frames) - logger.log_scalar( - "train/a_loss", np.mean(actor_losses), step=collected_frames - ) - logger.log_scalar("train/sampling_time", sampling_time, collected_frames) - logger.log_scalar("train/training_time", training_time, collected_frames) + metrics_to_log["train/q_loss"] = np.mean(q_losses) + metrics_to_log["train/a_loss"] = np.mean(actor_losses) + metrics_to_log["train/sampling_time"] = sampling_time + metrics_to_log["train/training_time"] = training_time # Evaluation if abs(collected_frames % eval_iter) < frames_per_batch * frame_skip: @@ -181,9 +179,10 @@ def main(cfg: "DictConfig"): # noqa: F821 ) eval_time = time.time() - eval_start eval_reward = eval_rollout["next", "reward"].sum(-2).mean().item() - logger.log_scalar("eval/reward", eval_reward, step=collected_frames) - logger.log_scalar("eval/time", eval_time, step=collected_frames) + metrics_to_log["eval/reward"] = eval_reward + metrics_to_log["eval/time"] = eval_time + log_metrics(logger, metrics_to_log, collected_frames) sampling_start = time.time() collector.shutdown() diff --git a/examples/ddpg/utils.py b/examples/ddpg/utils.py index fabad2de049..14a2f79ec69 100644 --- a/examples/ddpg/utils.py +++ b/examples/ddpg/utils.py @@ -132,17 +132,6 @@ def make_replay_buffer( # ----- -def get_activation(cfg): - if cfg.network.activation == "relu": - return nn.ReLU - elif cfg.network.activation == "tanh": - return nn.Tanh - elif cfg.network.activation == "leaky_relu": - return nn.LeakyReLU - else: - raise NotImplementedError - - def make_ddpg_agent(cfg, train_env, eval_env, device): """Make DDPG agent.""" # Define Actor Network @@ -257,3 +246,24 @@ def make_optimizer(cfg, loss_module): weight_decay=cfg.optim.weight_decay, ) return optimizer_actor, optimizer_critic + + +# ==================================================================== +# General utils +# --------- + + +def log_metrics(logger, metrics, step): + for metric_name, metric_value in metrics.items(): + logger.log_scalar(metric_name, metric_value, step) + + +def get_activation(cfg): + if cfg.network.activation == "relu": + return nn.ReLU + elif 
cfg.network.activation == "tanh": + return nn.Tanh + elif cfg.network.activation == "leaky_relu": + return nn.LeakyReLU + else: + raise NotImplementedError From c1baebf95a8c181b9b7ec35f3ae42eebb864ac1f Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 19 Sep 2023 11:35:56 +0200 Subject: [PATCH 06/17] update tests --- .github/unittest/linux_examples/scripts/run_test.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/unittest/linux_examples/scripts/run_test.sh b/.github/unittest/linux_examples/scripts/run_test.sh index d81e90fdd42..bb7bc30a81b 100755 --- a/.github/unittest/linux_examples/scripts/run_test.sh +++ b/.github/unittest/linux_examples/scripts/run_test.sh @@ -63,13 +63,13 @@ python .github/unittest/helpers/coverage_run_parallel.py examples/ppo/ppo.py \ python .github/unittest/helpers/coverage_run_parallel.py examples/ddpg/ddpg.py \ collector.total_frames=48 \ collector.init_random_frames=10 \ - optimization.batch_size=10 \ + optim.batch_size=10 \ collector.frames_per_batch=16 \ collector.num_workers=4 \ collector.env_per_collector=2 \ collector.collector_device=cuda:0 \ network.device=cuda:0 \ - optimization.utd_ratio=1 \ + optim.utd_ratio=1 \ replay_buffer.size=120 \ env.name=Pendulum-v1 \ logger.backend= @@ -175,13 +175,13 @@ python .github/unittest/helpers/coverage_run_parallel.py examples/dreamer/dreame python .github/unittest/helpers/coverage_run_parallel.py examples/ddpg/ddpg.py \ collector.total_frames=48 \ collector.init_random_frames=10 \ - optimization.batch_size=10 \ + optim.batch_size=10 \ collector.frames_per_batch=16 \ collector.num_workers=2 \ collector.env_per_collector=1 \ collector.collector_device=cuda:0 \ network.device=cuda:0 \ - optimization.utd_ratio=1 \ + optim.utd_ratio=1 \ replay_buffer.size=120 \ env.name=Pendulum-v1 \ logger.backend= From 88ac2073a2c5c5b11e3ed71649231ae6839c3951 Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 21 Sep 2023 11:31:20 +0200 Subject: [PATCH 07/17] update config and tests --- .github/unittest/linux_examples/scripts/run_test.sh | 2 -- examples/ddpg/config.yaml | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/unittest/linux_examples/scripts/run_test.sh b/.github/unittest/linux_examples/scripts/run_test.sh index bb7bc30a81b..1b04ba1cde1 100755 --- a/.github/unittest/linux_examples/scripts/run_test.sh +++ b/.github/unittest/linux_examples/scripts/run_test.sh @@ -65,7 +65,6 @@ python .github/unittest/helpers/coverage_run_parallel.py examples/ddpg/ddpg.py \ collector.init_random_frames=10 \ optim.batch_size=10 \ collector.frames_per_batch=16 \ - collector.num_workers=4 \ collector.env_per_collector=2 \ collector.collector_device=cuda:0 \ network.device=cuda:0 \ @@ -177,7 +176,6 @@ python .github/unittest/helpers/coverage_run_parallel.py examples/ddpg/ddpg.py \ collector.init_random_frames=10 \ optim.batch_size=10 \ collector.frames_per_batch=16 \ - collector.num_workers=2 \ collector.env_per_collector=1 \ collector.collector_device=cuda:0 \ network.device=cuda:0 \ diff --git a/examples/ddpg/config.yaml b/examples/ddpg/config.yaml index f30dc39182c..c23bff59524 100644 --- a/examples/ddpg/config.yaml +++ b/examples/ddpg/config.yaml @@ -14,10 +14,9 @@ collector: frames_per_batch: 1000 max_frames_per_traj: 1000 init_env_steps: 1000 - async_collection: 1 collector_device: cpu env_per_collector: 1 - num_workers: 1 + # replay buffer replay_buffer: From a5a27ac7bf80496956da0f98cb26e61df8168147 Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 21 Sep 2023 11:41:39 +0200 Subject: [PATCH 
08/17] update set_gym_backend header --- examples/ddpg/utils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/ddpg/utils.py b/examples/ddpg/utils.py index 14a2f79ec69..fe4ed042749 100644 --- a/examples/ddpg/utils.py +++ b/examples/ddpg/utils.py @@ -1,3 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import torch from torch import nn, optim @@ -13,7 +17,7 @@ RewardSum, TransformedEnv, ) -from torchrl.envs.libs.gym import GymEnv +from torchrl.envs.libs.gym import GymEnv, set_gym_backend from torchrl.envs.transforms import RewardScaling from torchrl.envs.utils import ExplorationType, set_exploration_type from torchrl.modules import ( @@ -36,7 +40,10 @@ def env_maker(task, frame_skip=1, device="cpu", from_pixels=False): - return GymEnv(task, device=device, frame_skip=frame_skip, from_pixels=from_pixels) + with set_gym_backend("gym"): + return GymEnv( + task, device=device, frame_skip=frame_skip, from_pixels=from_pixels + ) def apply_env_transforms(env, reward_scaling=1.0): From b7458979d52311c04f1cbfcab1855bf07d4deaf6 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 22 Sep 2023 15:05:26 +0200 Subject: [PATCH 09/17] fix max_episode_steps --- examples/ddpg/config.yaml | 1 + examples/ddpg/ddpg.py | 11 ++++++----- examples/ddpg/utils.py | 22 ++++++++++++++++++---- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/examples/ddpg/config.yaml b/examples/ddpg/config.yaml index c23bff59524..9c42c28a38c 100644 --- a/examples/ddpg/config.yaml +++ b/examples/ddpg/config.yaml @@ -5,6 +5,7 @@ env: exp_name: "HalfCheetah-DDPG" library: gym frame_skip: 1 + max_episode_steps: 1_000_000 seed: 1 # collector diff --git a/examples/ddpg/ddpg.py b/examples/ddpg/ddpg.py index 1a41dc9adb5..23b384fe053 100644 --- a/examples/ddpg/ddpg.py +++ b/examples/ddpg/ddpg.py @@ -146,16 +146,17 @@ def main(cfg: "DictConfig"): # noqa: F821 replay_buffer.update_priority(sampled_tensordict) training_time = time.time() - training_start - episode_rewards = tensordict["next", "episode_reward"][ + episode_end = ( tensordict["next", "done"] - ] + if tensordict["next", "done"].any() + else tensordict["next", "truncated"] + ) + episode_rewards = tensordict["next", "episode_reward"][episode_end] # Logging metrics_to_log = {} if len(episode_rewards) > 0: - episode_length = tensordict["next", "step_count"][ - tensordict["next", "done"] - ] + episode_length = tensordict["next", "step_count"][episode_end] metrics_to_log["train/reward"] = episode_rewards.mean().item() metrics_to_log["train/episode_length"] = episode_length.sum().item() / len( episode_length diff --git a/examples/ddpg/utils.py b/examples/ddpg/utils.py index fe4ed042749..66882d5492c 100644 --- a/examples/ddpg/utils.py +++ b/examples/ddpg/utils.py @@ -39,10 +39,16 @@ # ----------------- -def env_maker(task, frame_skip=1, device="cpu", from_pixels=False): +def env_maker( + task, frame_skip=1, device="cpu", from_pixels=False, max_episode_steps=1000 +): with set_gym_backend("gym"): return GymEnv( - task, device=device, frame_skip=frame_skip, from_pixels=from_pixels + task, + device=device, + frame_skip=frame_skip, + from_pixels=from_pixels, + max_episode_steps=max_episode_steps, ) @@ -63,7 +69,11 @@ def make_environment(cfg): """Make environments for training and evaluation.""" parallel_env = ParallelEnv( cfg.collector.env_per_collector, - EnvCreator(lambda: env_maker(task=cfg.env.name)), + 
EnvCreator( + lambda: env_maker( + task=cfg.env.name, max_episode_steps=cfg.env.max_episode_steps + ) + ), ) parallel_env.set_seed(cfg.env.seed) @@ -72,7 +82,11 @@ def make_environment(cfg): eval_env = TransformedEnv( ParallelEnv( cfg.collector.env_per_collector, - EnvCreator(lambda: env_maker(task=cfg.env.name)), + EnvCreator( + lambda: env_maker( + task=cfg.env.name, max_episode_steps=cfg.env.max_episode_steps + ) + ), ), train_env.transform.clone(), ) From 58fd4824d9dd9cdf0d77c487d75e15f5672d7751 Mon Sep 17 00:00:00 2001 From: BY571 Date: Wed, 27 Sep 2023 09:48:22 +0200 Subject: [PATCH 10/17] benchmark changes --- examples/ddpg/config.yaml | 9 +++++---- examples/ddpg/utils.py | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/ddpg/config.yaml b/examples/ddpg/config.yaml index 9c42c28a38c..353011fd1e4 100644 --- a/examples/ddpg/config.yaml +++ b/examples/ddpg/config.yaml @@ -1,20 +1,21 @@ # environment and task env: - name: HalfCheetah-v3 + name: Hopper-v3 task: "" - exp_name: "HalfCheetah-DDPG" + exp_name: "Hopper-DDPG" library: gym frame_skip: 1 - max_episode_steps: 1_000_000 + max_episode_steps: 1000 seed: 1 # collector collector: - total_frames: 3_000_000 + total_frames: 1_000_000 init_random_frames: 25_000 frames_per_batch: 1000 max_frames_per_traj: 1000 init_env_steps: 1000 + reset_at_each_iter: False collector_device: cpu env_per_collector: 1 diff --git a/examples/ddpg/utils.py b/examples/ddpg/utils.py index 66882d5492c..ef82bb805b5 100644 --- a/examples/ddpg/utils.py +++ b/examples/ddpg/utils.py @@ -106,6 +106,7 @@ def make_collector(cfg, train_env, actor_model_explore): frames_per_batch=cfg.collector.frames_per_batch, init_random_frames=cfg.collector.init_random_frames, max_frames_per_traj=cfg.collector.max_frames_per_traj, + reset_at_each_iter=cfg.collector.reset_at_each_iter, total_frames=cfg.collector.total_frames, device=cfg.collector.collector_device, ) From c612f17097451f2445b5bf33eb245afd52056eb8 Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 28 Sep 2023 16:08:05 +0200 Subject: [PATCH 11/17] frameskip, buffer scratch dir --- examples/ddpg/config.yaml | 12 ++++++------ examples/ddpg/ddpg.py | 7 ++++--- examples/ddpg/utils.py | 5 +---- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/examples/ddpg/config.yaml b/examples/ddpg/config.yaml index 353011fd1e4..d77dd112f4a 100644 --- a/examples/ddpg/config.yaml +++ b/examples/ddpg/config.yaml @@ -1,12 +1,11 @@ # environment and task env: - name: Hopper-v3 + name: HalfCheetah-v3 task: "" - exp_name: "Hopper-DDPG" + exp_name: ${env.name}_DDPG library: gym - frame_skip: 1 - max_episode_steps: 1000 - seed: 1 + max_episode_steps: 5000 + seed: 6 # collector collector: @@ -24,13 +23,14 @@ collector: replay_buffer: size: 1000000 prb: 0 # use prioritized experience replay + scratch_dir: ${env.exp_name}_${env.seed} # optimization optim: utd_ratio: 1.0 gamma: 0.99 loss_function: l2 - lr: 3e-4 + lr: 1.0e-3 weight_decay: 0.0 batch_size: 256 target_update_polyak: 0.995 diff --git a/examples/ddpg/ddpg.py b/examples/ddpg/ddpg.py index 23b384fe053..49acf1f5cbe 100644 --- a/examples/ddpg/ddpg.py +++ b/examples/ddpg/ddpg.py @@ -69,6 +69,7 @@ def main(cfg: "DictConfig"): # noqa: F821 batch_size=cfg.optim.batch_size, prb=cfg.replay_buffer.prb, buffer_size=cfg.replay_buffer.size, + buffer_scratch_dir="/tmp/" + cfg.replay_buffer.scratch_dir, device=device, ) @@ -87,9 +88,9 @@ def main(cfg: "DictConfig"): # noqa: F821 * cfg.optim.utd_ratio ) prb = cfg.replay_buffer.prb - frames_per_batch, frame_skip = 
cfg.collector.frames_per_batch, cfg.env.frame_skip + frames_per_batch = cfg.collector.frames_per_batch eval_iter = cfg.logger.eval_iter - eval_rollout_steps = cfg.collector.max_frames_per_traj // frame_skip + eval_rollout_steps = cfg.collector.max_frames_per_traj sampling_start = time.time() for _, tensordict in enumerate(collector): @@ -169,7 +170,7 @@ def main(cfg: "DictConfig"): # noqa: F821 metrics_to_log["train/training_time"] = training_time # Evaluation - if abs(collected_frames % eval_iter) < frames_per_batch * frame_skip: + if abs(collected_frames % eval_iter) < frames_per_batch: with set_exploration_type(ExplorationType.MODE), torch.no_grad(): eval_start = time.time() eval_rollout = eval_env.rollout( diff --git a/examples/ddpg/utils.py b/examples/ddpg/utils.py index ef82bb805b5..7105ee5cdb6 100644 --- a/examples/ddpg/utils.py +++ b/examples/ddpg/utils.py @@ -39,14 +39,11 @@ # ----------------- -def env_maker( - task, frame_skip=1, device="cpu", from_pixels=False, max_episode_steps=1000 -): +def env_maker(task, device="cpu", from_pixels=False, max_episode_steps=1000): with set_gym_backend("gym"): return GymEnv( task, device=device, - frame_skip=frame_skip, from_pixels=from_pixels, max_episode_steps=max_episode_steps, ) From d6297c4876f7f0f3e970f23ff22d30753e0b822a Mon Sep 17 00:00:00 2001 From: BY571 Date: Mon, 2 Oct 2023 10:33:11 +0200 Subject: [PATCH 12/17] update config fix --- examples/ddpg/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ddpg/config.yaml b/examples/ddpg/config.yaml index d77dd112f4a..5739bf55a54 100644 --- a/examples/ddpg/config.yaml +++ b/examples/ddpg/config.yaml @@ -30,8 +30,8 @@ optim: utd_ratio: 1.0 gamma: 0.99 loss_function: l2 - lr: 1.0e-3 - weight_decay: 0.0 + lr: 3.0e-4 + weight_decay: 1e-4 batch_size: 256 target_update_polyak: 0.995 From 99c9bcd739bd46b9e309372e91bbf482d09be6c3 Mon Sep 17 00:00:00 2001 From: BY571 Date: Mon, 2 Oct 2023 11:09:58 +0200 Subject: [PATCH 13/17] stepcount --- examples/ddpg/config.yaml | 1 - examples/ddpg/ddpg.py | 2 +- examples/ddpg/utils.py | 24 +++++++++--------------- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/examples/ddpg/config.yaml b/examples/ddpg/config.yaml index 5739bf55a54..2b829db0a52 100644 --- a/examples/ddpg/config.yaml +++ b/examples/ddpg/config.yaml @@ -12,7 +12,6 @@ collector: total_frames: 1_000_000 init_random_frames: 25_000 frames_per_batch: 1000 - max_frames_per_traj: 1000 init_env_steps: 1000 reset_at_each_iter: False collector_device: cpu diff --git a/examples/ddpg/ddpg.py b/examples/ddpg/ddpg.py index 49acf1f5cbe..b0032fd4c53 100644 --- a/examples/ddpg/ddpg.py +++ b/examples/ddpg/ddpg.py @@ -90,7 +90,7 @@ def main(cfg: "DictConfig"): # noqa: F821 prb = cfg.replay_buffer.prb frames_per_batch = cfg.collector.frames_per_batch eval_iter = cfg.logger.eval_iter - eval_rollout_steps = cfg.collector.max_frames_per_traj + eval_rollout_steps = cfg.env.max_episode_steps sampling_start = time.time() for _, tensordict in enumerate(collector): diff --git a/examples/ddpg/utils.py b/examples/ddpg/utils.py index 7105ee5cdb6..b20084b9222 100644 --- a/examples/ddpg/utils.py +++ b/examples/ddpg/utils.py @@ -15,6 +15,7 @@ InitTracker, ParallelEnv, RewardSum, + StepCounter, TransformedEnv, ) from torchrl.envs.libs.gym import GymEnv, set_gym_backend @@ -39,21 +40,21 @@ # ----------------- -def env_maker(task, device="cpu", from_pixels=False, max_episode_steps=1000): +def env_maker(task, device="cpu", from_pixels=False): with set_gym_backend("gym"): return 
GymEnv( task, device=device, from_pixels=from_pixels, - max_episode_steps=max_episode_steps, ) -def apply_env_transforms(env, reward_scaling=1.0): +def apply_env_transforms(env, reward_scaling=1.0, max_episode_steps=1000): transformed_env = TransformedEnv( env, Compose( InitTracker(), + StepCounter(max_episode_steps), RewardScaling(loc=0.0, scale=reward_scaling), DoubleToFloat("observation"), RewardSum(), @@ -66,24 +67,18 @@ def make_environment(cfg): """Make environments for training and evaluation.""" parallel_env = ParallelEnv( cfg.collector.env_per_collector, - EnvCreator( - lambda: env_maker( - task=cfg.env.name, max_episode_steps=cfg.env.max_episode_steps - ) - ), + EnvCreator(lambda: env_maker(task=cfg.env.name)), ) parallel_env.set_seed(cfg.env.seed) - train_env = apply_env_transforms(parallel_env) + train_env = apply_env_transforms( + parallel_env, max_episode_steps=cfg.env.max_episode_steps + ) eval_env = TransformedEnv( ParallelEnv( cfg.collector.env_per_collector, - EnvCreator( - lambda: env_maker( - task=cfg.env.name, max_episode_steps=cfg.env.max_episode_steps - ) - ), + EnvCreator(lambda: env_maker(task=cfg.env.name)), ), train_env.transform.clone(), ) @@ -102,7 +97,6 @@ def make_collector(cfg, train_env, actor_model_explore): actor_model_explore, frames_per_batch=cfg.collector.frames_per_batch, init_random_frames=cfg.collector.init_random_frames, - max_frames_per_traj=cfg.collector.max_frames_per_traj, reset_at_each_iter=cfg.collector.reset_at_each_iter, total_frames=cfg.collector.total_frames, device=cfg.collector.collector_device, From c0a6433d0c9f8f504918939697f6f5dcf5ea687b Mon Sep 17 00:00:00 2001 From: BY571 Date: Mon, 2 Oct 2023 18:21:45 +0200 Subject: [PATCH 14/17] update config --- examples/ddpg/config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ddpg/config.yaml b/examples/ddpg/config.yaml index 2b829db0a52..5997ccb8fb3 100644 --- a/examples/ddpg/config.yaml +++ b/examples/ddpg/config.yaml @@ -3,9 +3,9 @@ env: name: HalfCheetah-v3 task: "" exp_name: ${env.name}_DDPG - library: gym - max_episode_steps: 5000 - seed: 6 + library: gymnasium + max_episode_steps: 1000 + seed: 42 # collector collector: From 22b120b5c83782e294100445fca2a2b72332f06d Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 3 Oct 2023 09:29:14 +0200 Subject: [PATCH 15/17] small fix --- examples/ddpg/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/ddpg/utils.py b/examples/ddpg/utils.py index b20084b9222..86a5c35c2d0 100644 --- a/examples/ddpg/utils.py +++ b/examples/ddpg/utils.py @@ -19,7 +19,6 @@ TransformedEnv, ) from torchrl.envs.libs.gym import GymEnv, set_gym_backend -from torchrl.envs.transforms import RewardScaling from torchrl.envs.utils import ExplorationType, set_exploration_type from torchrl.modules import ( AdditiveGaussianWrapper, @@ -49,13 +48,12 @@ def env_maker(task, device="cpu", from_pixels=False): ) -def apply_env_transforms(env, reward_scaling=1.0, max_episode_steps=1000): +def apply_env_transforms(env, max_episode_steps=1000): transformed_env = TransformedEnv( env, Compose( InitTracker(), StepCounter(max_episode_steps), - RewardScaling(loc=0.0, scale=reward_scaling), DoubleToFloat("observation"), RewardSum(), ), From 096d6221454683a8c141556ea489e1c4078fc4f6 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 3 Oct 2023 09:15:30 -0400 Subject: [PATCH 16/17] amend --- examples/ddpg/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ddpg/utils.py b/examples/ddpg/utils.py 
index 86a5c35c2d0..5709c3ff59e 100644 --- a/examples/ddpg/utils.py +++ b/examples/ddpg/utils.py @@ -54,7 +54,7 @@ def apply_env_transforms(env, max_episode_steps=1000): Compose( InitTracker(), StepCounter(max_episode_steps), - DoubleToFloat("observation"), + DoubleToFloat(), RewardSum(), ), ) From d7bb73185b0b0c2ad61b85c03c2ec2b9d86079ba Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 3 Oct 2023 09:36:20 -0400 Subject: [PATCH 17/17] amend --- examples/ddpg/ddpg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ddpg/ddpg.py b/examples/ddpg/ddpg.py index b0032fd4c53..273947569be 100644 --- a/examples/ddpg/ddpg.py +++ b/examples/ddpg/ddpg.py @@ -183,8 +183,8 @@ def main(cfg: "DictConfig"): # noqa: F821 eval_reward = eval_rollout["next", "reward"].sum(-2).mean().item() metrics_to_log["eval/reward"] = eval_reward metrics_to_log["eval/time"] = eval_time - - log_metrics(logger, metrics_to_log, collected_frames) + if logger is not None: + log_metrics(logger, metrics_to_log, collected_frames) sampling_start = time.time() collector.shutdown()
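
Note on the main feature of this series: patches 01–02 add a `noise_type` switch to `make_ddpg_agent` so exploration can use either Ornstein-Uhlenbeck or additive Gaussian noise. Below is a minimal sketch of the selection logic the example ends up with, using the `OrnsteinUhlenbeckProcessWrapper` / `AdditiveGaussianWrapper` constructors exactly as they appear in the diff; the `wrap_exploration` helper name and its standalone form are illustrative, not part of the PR.

from torchrl.modules import AdditiveGaussianWrapper, OrnsteinUhlenbeckProcessWrapper


def wrap_exploration(policy, noise_type="ou", device="cpu"):
    """Pick the exploration wrapper from cfg.network.noise_type ("ou" or "gaussian")."""
    if noise_type == "ou":
        # Temporally correlated noise, annealed over 1M exploration steps.
        return OrnsteinUhlenbeckProcessWrapper(
            policy,
            annealing_num_steps=1_000_000,
        ).to(device)
    if noise_type == "gaussian":
        # Uncorrelated additive Gaussian noise; sigma_init == sigma_end keeps the
        # noise scale constant over training, so only std=0.1 shapes the perturbation.
        return AdditiveGaussianWrapper(
            policy,
            sigma_init=1.0,
            sigma_end=1.0,
            mean=0.0,
            std=0.1,
        ).to(device)
    raise NotImplementedError(f"unknown noise_type: {noise_type}")


# In the example script this is applied to the deterministic actor, e.g.
# exploration_policy = wrap_exploration(model[0], cfg.network.noise_type, device).
# Both wrappers expose .step(), so the training loop's
# exploration_policy.step(tensordict.numel()) call works unchanged for either choice.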
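
A second behavioural change worth calling out: after patch 09 the training loop detects episode boundaries from "done" flags but falls back to "truncated" when no trajectory terminated, so episodes cut off by the StepCounter time limit still produce train/reward and train/episode_length statistics. The sketch below reproduces that selection and aggregation on a hand-built tensordict; the synthetic `next_td` values are assumptions for illustration, while the key names and the logging arithmetic come from the diff.

import torch
from tensordict import TensorDict

# Synthetic "next" sub-tensordict for a 4-transition batch: nothing terminated,
# the last step hit the StepCounter time limit.
next_td = TensorDict(
    {
        "done": torch.zeros(4, 1, dtype=torch.bool),
        "truncated": torch.tensor([[False], [False], [False], [True]]),
        "episode_reward": torch.tensor([[1.0], [2.0], [3.0], [4.0]]),
        "step_count": torch.tensor([[250], [500], [750], [1000]]),
    },
    batch_size=[4],
)

# Same fallback as the training loop: prefer terminal "done" flags, otherwise use
# time-limit truncations so episode statistics are still logged.
episode_end = next_td["done"] if next_td["done"].any() else next_td["truncated"]
episode_rewards = next_td["episode_reward"][episode_end]

metrics_to_log = {}
if len(episode_rewards) > 0:
    episode_length = next_td["step_count"][episode_end]
    metrics_to_log["train/reward"] = episode_rewards.mean().item()
    metrics_to_log["train/episode_length"] = episode_length.sum().item() / len(
        episode_length
    )
print(metrics_to_log)  # {'train/reward': 4.0, 'train/episode_length': 1000.0}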