From 557431b6c9f353ae87bf6ad2c2e2746a59f3bb52 Mon Sep 17 00:00:00 2001 From: Jyotirmay Khavasi Date: Wed, 12 Jul 2023 20:23:05 +0530 Subject: [PATCH 1/5] Adding the Reinforcement Learning Template - Added 2 algorithms for Reinforcement Learning: REINFORCE and Advantage Actor Critic (A2C) - The Algorithms are implemented for the OpenAI Gym env: CarRacing-v2 --- .../advantage_actor_critic_a2c.py | 234 ++++++++++++++++++ .../config.yaml | 9 + .../reinforce.py | 199 +++++++++++++++ .../template-reinforcement-learning/utils.py | 1 + 4 files changed, 443 insertions(+) create mode 100644 src/templates/template-reinforcement-learning/advantage_actor_critic_a2c.py create mode 100644 src/templates/template-reinforcement-learning/config.yaml create mode 100644 src/templates/template-reinforcement-learning/reinforce.py create mode 100644 src/templates/template-reinforcement-learning/utils.py diff --git a/src/templates/template-reinforcement-learning/advantage_actor_critic_a2c.py b/src/templates/template-reinforcement-learning/advantage_actor_critic_a2c.py new file mode 100644 index 00000000..e817bbab --- /dev/null +++ b/src/templates/template-reinforcement-learning/advantage_actor_critic_a2c.py @@ -0,0 +1,234 @@ +from collections import deque, namedtuple + +from shutil import copy + +import ignite.distributed as idist +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +from ignite.engine import Engine, Events + +from ignite.utils import manual_seed + +from torch.distributions import Categorical + +from utils import * + +from typing import Any + +import numpy as np + +# from matplotlib import pyplot as plt + +try: + import gymnasium as gym +except ImportError: + raise ModuleNotFoundError("Please install opengym: pip install gymnasium[box2d]") + +SavedAction = namedtuple("SavedAction", ["log_prob", "value"]) + +eps = np.finfo(np.float32).eps.item() + + +class ActorCriticNetwork(nn.Module): + def __init__(self, n_actions): + super(ActorCriticNetwork, self).__init__() + self.LeakyReLU = nn.LeakyReLU() + self.Sigmoid = nn.Sigmoid() + self.Softplus = nn.Softplus() + + # REVIEW: + # OPTIMIZE: + self.conv1 = nn.Conv2d(3, 8, kernel_size=7, stride=4, padding=0) + self.conv2 = nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=2) + self.pool = nn.MaxPool2d(kernel_size=2, stride=2) + + self.fc1 = nn.Linear(576, 512) + self.fc_critic2 = nn.Linear(512, 1) + self.fc_actor2 = nn.Linear(512, 256) + self.fc_actor3 = nn.Linear(256, n_actions) + + self.flatten = nn.Flatten() + + self.saved_actions = [] + self.rewards = [] + self.saved_log_probs = [] + + def forward(self, observation): + # state = torch.Tensor(observation).to(self.device) + + # Shared weights + x = self.LeakyReLU(self.conv1(observation)) + x = self.pool(x) + x = self.LeakyReLU(self.conv2(x)) + x = self.pool(x) + x = self.flatten(x) + x = self.fc1(x) + + # actor and critic + # actor + dist = self.LeakyReLU(self.fc_actor2(x)) + dist = self.Softplus(self.fc_actor3(dist)) + + actor = F.softmax(dist, dim=1) + + # critic + critic = self.fc_critic2(x) + + return actor, critic + + +# choose an action for the discrete actions +def choose_action(policy, observation): + observation = observation.float().unsqueeze(0) + state = torch.transpose(observation, 1, 3) + probabilities, value = policy(state) + # probabilities = F.softmax(probabilities) + + action_probs = Categorical(probabilities) + action = action_probs.sample() + + log_probs = action_probs.log_prob(action) + policy.saved_actions.append(SavedAction(log_probs, 
value)) + policy.saved_log_probs.append(log_probs) + + return action.item() + + +def learn(policy, optimizer, gamma): + R = 0 + saved_actions = policy.saved_actions + policy_losses = [] # list to save actor (policy) loss + value_losses = [] # list to save critic (value) loss + returns = deque() # list to save the true values + + for r in policy.rewards[::-1]: + # calculate the discounted value + R = r + gamma * R + returns.appendleft(R) + + returns = torch.tensor(returns) + returns = (returns - returns.mean()) / (returns.std() + eps) + + for (log_prob, value), R in zip(saved_actions, returns): + advantage = R - value.item() + + # calculate actor (policy) loss + policy_losses.append(-log_prob * advantage) + + # calculate critic (value) loss using L1 smooth loss + value_losses.append(F.smooth_l1_loss(value, torch.tensor([R]))) + + # reset gradients + optimizer.zero_grad() + + # sum up all the values of policy_losses and value_losses + loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum() + + # perform backprop + loss.backward() + optimizer.step() + # reset rewards and action buffer + del policy.rewards[:] + del policy.saved_actions[:] + + +EPISODE_STARTED = Events.EPOCH_STARTED +EPISODE_COMPLETED = Events.EPOCH_COMPLETED + + +def run(local_rank: int, env: Any, config: Any): + # make seed + rank = idist.get_rank() + manual_seed(config.seed + rank) + + # create output folder and copy config file to ouput dir + config.output_dir = setup_output_dir(config, rank) + if rank == 0: + copy(config.config, f"{config.output_dir}/config-lock.yaml") + + # create wrapper for saving video + if config.render: + + def trigger(episode): + return episode % config.save_every_episodes == 0 + + env = gym.wrappers.RecordVideo(env, config.recordings_path, trigger) + + # device, policy, optimizer + device = idist.device() + actor_critic = ActorCriticNetwork(env.action_space.n).to(device) + optimizer = idist.auto_optim(optim.Adam(actor_critic.parameters(), lr=config.lr, betas=(0.9, 0.999))) + + # device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + # self.to(self.device) + timesteps = range(10000) + + def run_single_timestep(engine, timestep): + observation = engine.state.observation + + # select action from the policy + observation = torch.Tensor(observation).to(device) + action = choose_action(actor_critic, observation) + + engine.state.observation, reward, done, _, _ = env.step(action) + + if config.render: + env.render() + + actor_critic.rewards.append(reward) + engine.state.ep_reward += reward + if done: + engine.terminate_epoch() + engine.state.timestep = timestep + + trainer = Engine(run_single_timestep) + trainer.state.running_reward = 10 + + @trainer.on(EPISODE_STARTED) + def reset_environment_state(): + # reset environment and episode reward + torch.manual_seed(config.seed + trainer.state.epoch) + trainer.state.observation, _ = env.reset(seed=config.seed + trainer.state.epoch) + trainer.state.ep_reward = 0 + + @trainer.on(EPISODE_COMPLETED) + def update_model(): + # update cumulative reward + trainer.state.running_reward = 0.05 * trainer.state.ep_reward + (1 - 0.05) * trainer.state.running_reward + # perform backprop + learn(actor_critic, optimizer, config.gamma) + + @trainer.on(EPISODE_COMPLETED(every=config.log_every_episodes)) + def log_episode(): + i_episode = trainer.state.epoch + print( + f"Episode {i_episode}\tLast reward: {trainer.state.ep_reward:.2f}" + f"\tAverage reward: {trainer.state.running_reward:.2f}" + ) + + @trainer.on(EPISODE_COMPLETED) + def 
should_finish_training(): + # check if we have "solved" the cart pole problem + running_reward = trainer.state.running_reward + if running_reward > env.spec.reward_threshold: + print( + f"Solved! Running reward is now {running_reward} and " + f"the last episode runs to {trainer.state.timestep} time steps!" + ) + trainer.should_terminate = True + + trainer.run(timesteps, max_epochs=config.max_episodes) + + +def main(): + config = setup_config() + env = gym.make("CarRacing-v2", continuous=False, render_mode="rgb_array" if config.render else None) + with idist.Parallel(config.backend) as p: + p.run(run, env=env, config=config) + + +if __name__ == "__main__": + main() diff --git a/src/templates/template-reinforcement-learning/config.yaml b/src/templates/template-reinforcement-learning/config.yaml new file mode 100644 index 00000000..f0ce3224 --- /dev/null +++ b/src/templates/template-reinforcement-learning/config.yaml @@ -0,0 +1,9 @@ +seed: 666 +render: true +gamma: 0.99 +recordings_path: ./recordings +lr: 0.0003 +max_episodes: 10000 +log_every_episodes: 1 +save_every_episodes: 10 +output_dir: ./logs diff --git a/src/templates/template-reinforcement-learning/reinforce.py b/src/templates/template-reinforcement-learning/reinforce.py new file mode 100644 index 00000000..2df96153 --- /dev/null +++ b/src/templates/template-reinforcement-learning/reinforce.py @@ -0,0 +1,199 @@ +from collections import deque + +from shutil import copy + +import ignite.distributed as idist +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +from ignite.engine import Engine, Events + +from ignite.utils import manual_seed + +from torch.distributions import Categorical + +from utils import * + +from typing import Any + +import numpy as np + +try: + import gymnasium as gym +except ImportError: + raise ModuleNotFoundError("Please install opengym: pip install gymnasium[box2d]") + + +eps = np.finfo(np.float32).eps.item() + + +class Policy(nn.Module): + def __init__(self, state_dim, output_actions) -> None: + super(Policy, self).__init__() + + self.conv = nn.Sequential( + nn.Conv2d(state_dim, 32, kernel_size=3, stride=4), + nn.ReLU(), + nn.Conv2d(32, 64, kernel_size=1, stride=2), + nn.ReLU(), + nn.Conv2d(64, 64, kernel_size=1, stride=1), + nn.ReLU(), + nn.Flatten(), + ) + + conv_out_size = self._get_conv_out(state_dim) + + self.fc1 = nn.Linear(conv_out_size, 512) + self.fc2 = nn.Linear(512, 128) + self.fc3 = nn.Linear(128, output_actions) + + self.relu = nn.ReLU() + + self.saved_log_probs = [] + self.rewards = [] + + def forward(self, x): + x = self.conv(x) + # x = self.dp(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.relu(x) + action_scores = self.fc3(x) + return F.softmax(action_scores, dim=1) + + def _get_conv_out(self, shape): + x = torch.zeros(1, *shape) + x = self.conv(x) + + return int(np.prod(x.size())) + + +def choose_action(policy, observation): + state = torch.from_numpy(observation).float().unsqueeze(0) + probs = policy(state) + m = Categorical(probs) + action = m.sample() + policy.saved_log_probs.append(m.log_prob(action)) + return action.item() + + +def learn(policy, optimizer, gamma): + R = 0 + policy_loss = [] + returns = deque() + for r in policy.rewards[::-1]: + R = r + gamma * R + returns.appendleft(R) + returns = torch.tensor(returns) + returns = (returns - returns.mean()) / (returns.std() + eps) + for log_prob, R in zip(policy.saved_log_probs, returns): + policy_loss.append(-log_prob * R) + optimizer.zero_grad() + policy_loss = 
torch.cat(policy_loss).sum() + policy_loss.backward() + optimizer.step() + del policy.rewards[:] + del policy.saved_log_probs[:] + + +EPISODE_STARTED = Events.EPOCH_STARTED +EPISODE_COMPLETED = Events.EPOCH_COMPLETED + + +def run(local_rank: int, env: Any, config: Any): + # make seed + rank = idist.get_rank() + manual_seed(config.seed + rank) + + # create output folder and copy config file to output dir + config.output_dir = setup_output_dir(config, rank) + if rank == 0: + copy(config.config, f"{config.output_dir}/config-lock.yaml") + + # create wrapper for saving video + if config.render: + + def trigger(episode): + return episode % config.save_every_episode == 0 + + env = gym.wrappers.RecordVideo(env, config.recordings_path, trigger) + + # device, policy, optimizer + device = idist.device() + policy = Policy(env.observation_space.shape[0], env.action_space.n).to(device) + + optimizer = idist.auto_optim(optim.Adam(actor_critic.parameters(), lr=config.lr, betas=(0.9, 0.999))) + + # device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + # self.to(self.device) + timesteps = range(10000) + + def run_single_timestep(engine, timestep): + observation = engine.state.observation + + # select action from the policy + observation = torch.Tensor(observation).to(device) + action = choose_action(policy, observation) + + engine.state.observation, reward, done, _, _ = env.step(action) + + if config.render: + env.render() + + policy.rewards.append(reward) + engine.state.ep_reward += reward + if done: + engine.terminate_epoch() + engine.state.timestep = timestep + + trainer = Engine(run_single_timestep) + trainer.state.running_reward = 10 + + @trainer.on(EPISODE_STARTED) + def reset_environment_state(): + # reset environment and episode reward + torch.manual_seed(config.seed + trainer.state.epoch) + trainer.state.observation, _ = env.reset(seed=config.seed + trainer.state.epoch) + trainer.state.ep_reward = 0 + + @trainer.on(EPISODE_COMPLETED) + def update_model(): + # update cumulative reward + trainer.state.running_reward = 0.05 * trainer.state.ep_reward + (1 - 0.05) * trainer.state.running_reward + # perform backprop + learn(policy, optimizer, config.gamma) + + @trainer.on(EPISODE_COMPLETED(every=config.log_every_episodes)) + def log_episode(): + i_episode = trainer.state.epoch + print( + f"Episode {i_episode}\tLast reward: {trainer.state.ep_reward:.2f}" + f"\tAverage reward: {trainer.state.running_reward:.2f}" + ) + + @trainer.on(EPISODE_COMPLETED) + def should_finish_training(): + # check if we have "solved" the cart pole problem + running_reward = trainer.state.running_reward + if running_reward > env.spec.reward_threshold: + print( + f"Solved! Running reward is now {running_reward} and " + f"the last episode runs to {trainer.state.timestep} time steps!" 
+ ) + trainer.should_terminate = True + + trainer.run(timesteps, max_epochs=config.max_episodes) + + +def main(): + config = setup_config() + env = gym.make("CarRacing-v2", continuous=False, render_mode="rgb_array" if config.render else None) + with idist.Parallel(config.backend) as p: + p.run(run, env=env, config=config) + + +if __name__ == "__main__": + main() diff --git a/src/templates/template-reinforcement-learning/utils.py b/src/templates/template-reinforcement-learning/utils.py new file mode 100644 index 00000000..aec8eaac --- /dev/null +++ b/src/templates/template-reinforcement-learning/utils.py @@ -0,0 +1 @@ +#::= from_template_common ::# From 1c268b100b0224add4dc948e45809e89694a8daf Mon Sep 17 00:00:00 2001 From: Jyotirmay Khavasi Date: Thu, 27 Jul 2023 14:53:06 +0530 Subject: [PATCH 2/5] DQN CarRacing-v2 Template - DQN Template for CarRacing-v2 - Supports Rendering of video - Starts Giving Good results after ~100 Episodes --- .../DQN_CarRacing.py | 220 ++++++++++++++++++ .../dqn_config.yaml | 14 ++ 2 files changed, 234 insertions(+) create mode 100644 src/templates/template-reinforcement-learning/DQN_CarRacing.py create mode 100644 src/templates/template-reinforcement-learning/dqn_config.yaml diff --git a/src/templates/template-reinforcement-learning/DQN_CarRacing.py b/src/templates/template-reinforcement-learning/DQN_CarRacing.py new file mode 100644 index 00000000..c24c2365 --- /dev/null +++ b/src/templates/template-reinforcement-learning/DQN_CarRacing.py @@ -0,0 +1,220 @@ +import random +from collections import deque + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from utils import * + +from shutil import copy + +import ignite.distributed as idist + +from ignite.engine import Engine, Events + +from ignite.utils import manual_seed + +from utils import * + +try: + import gymnasium as gym +except ImportError: + raise ModuleNotFoundError("Please install opengym: pip install gymnasium[box2d]") + +import numpy as np + + +class DQNetwork(nn.Module): + def __init__(self, n_actions): + super(DQNetwork, self).__init__() + self.conv1 = nn.Conv2d(3, 8, kernel_size=4, stride=2) # 3 * 96 * 96 + self.conv2 = nn.Conv2d(8, 16, kernel_size=3, stride=2) # 32 * 47 * 47 + self.conv3 = nn.Conv2d(16, 32, kernel_size=3, stride=2) # 64 * 22 * 22 + self.conv4 = nn.Conv2d(32, 64, kernel_size=3, stride=2) # 128 * 10 * 10 + self.conv5 = nn.Conv2d(64, 128, kernel_size=3, stride=1) # 256 * 6 * 6 + self.conv6 = nn.Conv2d(128, 256, kernel_size=3, stride=1) # 256 * 6 * 6 + + self.fc1 = nn.Linear(256, 100) + self.fc2 = nn.Linear(100, 100) + self.fc3 = nn.Linear(100, n_actions) + self.float() + + def forward(self, x): + x = torch.permute(x, (0, 3, 1, 2)) + + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.conv4(x)) + x = F.relu(self.conv5(x)) + x = F.relu(self.conv6(x)) + + x = x.reshape(-1, 256 * 1 * 1) + + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + + return self.fc3(x) + + +EPISODE_STARTED = Events.EPOCH_STARTED +EPISODE_COMPLETED = Events.EPOCH_COMPLETED + + +def choose_action(engine, policy, observation): + observation = observation.float().unsqueeze(0) + + policy.eval() + with torch.no_grad(): + action = policy(observation) + policy.train() + + sample = random.random() + if sample > engine.epsilon: + action = np.argmax(action.cpu().data.numpy()) + return action + else: + action = random.choice(np.arange(5)) + return action + + # return action + + +def learn(batch, batch_size, dqn, dqn_target, 
device, config, optimizer): + criterion = torch.nn.MSELoss() + + states = np.zeros((batch_size, 96, 96, 3)) + + next_states = np.zeros((batch_size, 96, 96, 3)) + + actions, rewards, dones = [], [], [] + + for i in range(batch_size): + state_i, action_i, reward_i, next_state_i, done_i = batch[i] + states[i] = state_i.cpu() + next_states[i] = next_state_i + actions.append(action_i) + rewards.append(reward_i) + dones.append(done_i) + + actions = np.vstack(actions).astype(int) + actions = torch.from_numpy(actions).to(device) + + rewards = np.vstack(rewards).astype(float) + rewards = torch.from_numpy(rewards).to(device) + + dones = np.vstack(dones).astype(int) + dones = torch.from_numpy(dones).to(device) + + dqn.train() + dqn_target.eval() + + predictions = dqn(torch.from_numpy(states).float().to(device)).gather(1, actions) + + with torch.no_grad(): + q_next = dqn_target(torch.from_numpy(next_states).float().to(device)).detach().max(1)[0].unsqueeze(1) + + targets = (rewards + (config.gamma * q_next * (1 - dones))).float() + + loss = criterion(predictions, targets).to(device) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +def main(): + config = setup_config() + try: + env = gym.make("CarRacing-v2", continuous=False, render_mode="rgb_array" if config.render else None) + except ImportError: + raise ModuleNotFoundError("Please install the 2D env: pip install gymnasium[box2d]") + + # make seed + rank = idist.get_rank() + manual_seed(config.seed + rank) + + config.output_dir = setup_output_dir(config, rank) + if rank == 0: + copy(config.config, f"{config.output_dir}/config-lock.yaml") + + buffer = deque(maxlen=10000) + + # Create wrapper for saving video + if config.render: + + def trigger(episode): + return episode % config.save_every_episodes == 0 + + env = gym.wrappers.RecordVideo(env, config.recordings_path, trigger) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + dqn = DQNetwork(env.action_space.n).to(device) + dqn_target = DQNetwork(env.action_space.n).to(device) + + optimizer = optim.Adam(dqn.parameters(), lr=config.lr) + + rewards = [] + + def run_single_timestep(engine, timestep): + observation = torch.Tensor(engine.state.observation).to(device) + action = choose_action(engine, dqn, observation) + + next_observation, reward, done, _, _ = env.step(action) + + buffer.append((observation, action, reward, next_observation, done)) + + engine.state.ep_reward += reward + engine.state.next_observation = next_observation + + if done: + engine.terminate_epoch() + engine.state.timestep = timestep + + trainer = Engine(run_single_timestep) + + trainer.start_training = 3000 + trainer.epsilon = config.epsilon + trainer.epsilon_min = config.epsilon_min + trainer.epsilon_decay = config.epsilon_decay + trainer.batch_size = config.batch_size + + trainer.state.cumulative_reward = [] + + @trainer.on(Events.ITERATION_COMPLETED(every=500)) + def update_target_network(): + dqn_target.load_state_dict(dqn.state_dict()) + + @trainer.on(Events.ITERATION_COMPLETED(every=4)) + def perform_learning(): + if len(buffer) >= trainer.start_training: + if trainer.epsilon > trainer.epsilon_min: + trainer.epsilon *= trainer.epsilon_decay + minibatch = random.sample(buffer, min(len(buffer), trainer.batch_size)) + learn(minibatch, trainer.batch_size, dqn, dqn_target, device, config, optimizer) + + @trainer.on(Events.ITERATION_COMPLETED(every=1)) + def update_observations(): + trainer.state.observation = trainer.state.next_observation + + @trainer.on(EPISODE_STARTED) + def 
reset_environment_state(): + trainer.state.observation, _ = env.reset() + trainer.state.ep_reward = 0 + + @trainer.on(EPISODE_COMPLETED) + def update_rewards(): + rewards.append(trainer.state.ep_reward) + + @trainer.on(EPISODE_COMPLETED(every=config.log_every_episodes)) + def log_episode(): + print(f"Episode {trainer.state.epoch}; Reward for Episode: {trainer.state.ep_reward}, Current epsilon: {trainer.epsilon}") + + timesteps = range(config.steps_per_episode) + trainer.run(timesteps, max_epochs=config.max_episodes) + + +if __name__ == "__main__": + main() diff --git a/src/templates/template-reinforcement-learning/dqn_config.yaml b/src/templates/template-reinforcement-learning/dqn_config.yaml new file mode 100644 index 00000000..9a49144b --- /dev/null +++ b/src/templates/template-reinforcement-learning/dqn_config.yaml @@ -0,0 +1,14 @@ +seed: 666 +render: true +gamma: 0.975 +recordings_path: ./recordings +lr: 0.001 +max_episodes: 500 +log_every_episodes: 1 +save_every_episodes: 10 +output_dir: ./logs +epsilon: 1.0 +epsilon_min: 0.01 +epsilon_decay: 0.9999 +batch_size: 512 +steps_per_episode: 1500 \ No newline at end of file From 9490220e9148e5720001974a8e3cb3391e6929c5 Mon Sep 17 00:00:00 2001 From: Jyotirmay Khavasi Date: Fri, 25 Aug 2023 19:32:00 +0530 Subject: [PATCH 3/5] Modified RL Template - Uses Advantage Actor Critic model using torchrl - Gym Environment: CarRacing-v2 --- .../DQN_CarRacing.py | 220 ---------------- .../template-reinforcement-learning/README.md | 7 + .../template-reinforcement-learning/a2c.py | 109 ++++++++ .../a2c_model_env.py | 249 ++++++++++++++++++ .../advantage_actor_critic_a2c.py | 234 ---------------- .../config.yaml | 9 - .../config_a2c.yaml | 37 +++ .../dqn_config.yaml | 14 - .../reinforce.py | 199 -------------- .../requirements.txt | 11 + src/templates/templates.json | 8 + 11 files changed, 421 insertions(+), 676 deletions(-) delete mode 100644 src/templates/template-reinforcement-learning/DQN_CarRacing.py create mode 100644 src/templates/template-reinforcement-learning/README.md create mode 100644 src/templates/template-reinforcement-learning/a2c.py create mode 100644 src/templates/template-reinforcement-learning/a2c_model_env.py delete mode 100644 src/templates/template-reinforcement-learning/advantage_actor_critic_a2c.py delete mode 100644 src/templates/template-reinforcement-learning/config.yaml create mode 100644 src/templates/template-reinforcement-learning/config_a2c.yaml delete mode 100644 src/templates/template-reinforcement-learning/dqn_config.yaml delete mode 100644 src/templates/template-reinforcement-learning/reinforce.py create mode 100644 src/templates/template-reinforcement-learning/requirements.txt diff --git a/src/templates/template-reinforcement-learning/DQN_CarRacing.py b/src/templates/template-reinforcement-learning/DQN_CarRacing.py deleted file mode 100644 index c24c2365..00000000 --- a/src/templates/template-reinforcement-learning/DQN_CarRacing.py +++ /dev/null @@ -1,220 +0,0 @@ -import random -from collections import deque - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from utils import * - -from shutil import copy - -import ignite.distributed as idist - -from ignite.engine import Engine, Events - -from ignite.utils import manual_seed - -from utils import * - -try: - import gymnasium as gym -except ImportError: - raise ModuleNotFoundError("Please install opengym: pip install gymnasium[box2d]") - -import numpy as np - - -class DQNetwork(nn.Module): - def __init__(self, 
n_actions): - super(DQNetwork, self).__init__() - self.conv1 = nn.Conv2d(3, 8, kernel_size=4, stride=2) # 3 * 96 * 96 - self.conv2 = nn.Conv2d(8, 16, kernel_size=3, stride=2) # 32 * 47 * 47 - self.conv3 = nn.Conv2d(16, 32, kernel_size=3, stride=2) # 64 * 22 * 22 - self.conv4 = nn.Conv2d(32, 64, kernel_size=3, stride=2) # 128 * 10 * 10 - self.conv5 = nn.Conv2d(64, 128, kernel_size=3, stride=1) # 256 * 6 * 6 - self.conv6 = nn.Conv2d(128, 256, kernel_size=3, stride=1) # 256 * 6 * 6 - - self.fc1 = nn.Linear(256, 100) - self.fc2 = nn.Linear(100, 100) - self.fc3 = nn.Linear(100, n_actions) - self.float() - - def forward(self, x): - x = torch.permute(x, (0, 3, 1, 2)) - - x = F.relu(self.conv1(x)) - x = F.relu(self.conv2(x)) - x = F.relu(self.conv3(x)) - x = F.relu(self.conv4(x)) - x = F.relu(self.conv5(x)) - x = F.relu(self.conv6(x)) - - x = x.reshape(-1, 256 * 1 * 1) - - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - - return self.fc3(x) - - -EPISODE_STARTED = Events.EPOCH_STARTED -EPISODE_COMPLETED = Events.EPOCH_COMPLETED - - -def choose_action(engine, policy, observation): - observation = observation.float().unsqueeze(0) - - policy.eval() - with torch.no_grad(): - action = policy(observation) - policy.train() - - sample = random.random() - if sample > engine.epsilon: - action = np.argmax(action.cpu().data.numpy()) - return action - else: - action = random.choice(np.arange(5)) - return action - - # return action - - -def learn(batch, batch_size, dqn, dqn_target, device, config, optimizer): - criterion = torch.nn.MSELoss() - - states = np.zeros((batch_size, 96, 96, 3)) - - next_states = np.zeros((batch_size, 96, 96, 3)) - - actions, rewards, dones = [], [], [] - - for i in range(batch_size): - state_i, action_i, reward_i, next_state_i, done_i = batch[i] - states[i] = state_i.cpu() - next_states[i] = next_state_i - actions.append(action_i) - rewards.append(reward_i) - dones.append(done_i) - - actions = np.vstack(actions).astype(int) - actions = torch.from_numpy(actions).to(device) - - rewards = np.vstack(rewards).astype(float) - rewards = torch.from_numpy(rewards).to(device) - - dones = np.vstack(dones).astype(int) - dones = torch.from_numpy(dones).to(device) - - dqn.train() - dqn_target.eval() - - predictions = dqn(torch.from_numpy(states).float().to(device)).gather(1, actions) - - with torch.no_grad(): - q_next = dqn_target(torch.from_numpy(next_states).float().to(device)).detach().max(1)[0].unsqueeze(1) - - targets = (rewards + (config.gamma * q_next * (1 - dones))).float() - - loss = criterion(predictions, targets).to(device) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - -def main(): - config = setup_config() - try: - env = gym.make("CarRacing-v2", continuous=False, render_mode="rgb_array" if config.render else None) - except ImportError: - raise ModuleNotFoundError("Please install the 2D env: pip install gymnasium[box2d]") - - # make seed - rank = idist.get_rank() - manual_seed(config.seed + rank) - - config.output_dir = setup_output_dir(config, rank) - if rank == 0: - copy(config.config, f"{config.output_dir}/config-lock.yaml") - - buffer = deque(maxlen=10000) - - # Create wrapper for saving video - if config.render: - - def trigger(episode): - return episode % config.save_every_episodes == 0 - - env = gym.wrappers.RecordVideo(env, config.recordings_path, trigger) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - dqn = DQNetwork(env.action_space.n).to(device) - dqn_target = DQNetwork(env.action_space.n).to(device) - - optimizer = 
optim.Adam(dqn.parameters(), lr=config.lr) - - rewards = [] - - def run_single_timestep(engine, timestep): - observation = torch.Tensor(engine.state.observation).to(device) - action = choose_action(engine, dqn, observation) - - next_observation, reward, done, _, _ = env.step(action) - - buffer.append((observation, action, reward, next_observation, done)) - - engine.state.ep_reward += reward - engine.state.next_observation = next_observation - - if done: - engine.terminate_epoch() - engine.state.timestep = timestep - - trainer = Engine(run_single_timestep) - - trainer.start_training = 3000 - trainer.epsilon = config.epsilon - trainer.epsilon_min = config.epsilon_min - trainer.epsilon_decay = config.epsilon_decay - trainer.batch_size = config.batch_size - - trainer.state.cumulative_reward = [] - - @trainer.on(Events.ITERATION_COMPLETED(every=500)) - def update_target_network(): - dqn_target.load_state_dict(dqn.state_dict()) - - @trainer.on(Events.ITERATION_COMPLETED(every=4)) - def perform_learning(): - if len(buffer) >= trainer.start_training: - if trainer.epsilon > trainer.epsilon_min: - trainer.epsilon *= trainer.epsilon_decay - minibatch = random.sample(buffer, min(len(buffer), trainer.batch_size)) - learn(minibatch, trainer.batch_size, dqn, dqn_target, device, config, optimizer) - - @trainer.on(Events.ITERATION_COMPLETED(every=1)) - def update_observations(): - trainer.state.observation = trainer.state.next_observation - - @trainer.on(EPISODE_STARTED) - def reset_environment_state(): - trainer.state.observation, _ = env.reset() - trainer.state.ep_reward = 0 - - @trainer.on(EPISODE_COMPLETED) - def update_rewards(): - rewards.append(trainer.state.ep_reward) - - @trainer.on(EPISODE_COMPLETED(every=config.log_every_episodes)) - def log_episode(): - print(f"Episode {trainer.state.epoch}; Reward for Episode: {trainer.state.ep_reward}, Current epsilon: {trainer.epsilon}") - - timesteps = range(config.steps_per_episode) - trainer.run(timesteps, max_epochs=config.max_episodes) - - -if __name__ == "__main__": - main() diff --git a/src/templates/template-reinforcement-learning/README.md b/src/templates/template-reinforcement-learning/README.md new file mode 100644 index 00000000..17945f75 --- /dev/null +++ b/src/templates/template-reinforcement-learning/README.md @@ -0,0 +1,7 @@ +[![Code-Generator](https://badgen.net/badge/Template%20by/Code-Generator/ee4c2c?labelColor=eaa700)](https://github.com/pytorch-ignite/code-generator) + +# Reinforcement Learning Template + +This is the Reinforcement Learning template by Code-Generator using OpenAI Gym for the environment CarRacing-v2. 
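+
+The template trains an Advantage Actor-Critic (A2C) agent built from TorchRL components (`a2c.py` and `a2c_model_env.py`). A typical invocation, assuming the dependencies in `requirements.txt` are installed, is:
+
+```sh
+python a2c.py config_a2c.yaml
+```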
+ +#::= from_template_common ::# diff --git a/src/templates/template-reinforcement-learning/a2c.py b/src/templates/template-reinforcement-learning/a2c.py new file mode 100644 index 00000000..39314980 --- /dev/null +++ b/src/templates/template-reinforcement-learning/a2c.py @@ -0,0 +1,109 @@ +from pprint import pformat +from shutil import copy +from typing import Any + +import ignite.distributed as idist +import torch +from ignite.engine import Events +from ignite.handlers import LRScheduler + +from ignite.utils import manual_seed + +from utils import * + +from a2c_model_env import make_a2c_models, make_collector, make_loss, make_optim, make_test_env + + +def main(): + config = setup_config() + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + config.device = device + + rank = idist.get_rank() + manual_seed(config.seed + rank) + config.output_dir = setup_output_dir(config, rank) + if rank == 0: + copy(config.config, f"{config.output_dir}/config-lock.yaml") + + actor, critic = make_a2c_models(config) + actor = actor.to(device) + critic = critic.to(device) + + collector = make_collector(config, policy=actor) + loss_module, adv_module = make_loss(config, actor_network=actor, value_network=critic) + optim = make_optim(config, actor_network=actor, value_network=critic) + + batch_size = config.total_frames * config.num_envs + total_network_updates = config.total_frames // batch_size + + scheduler = None + if config.lr_scheduler: + scheduler = torch.optim.lr_scheduler.LinearLR(optim, total_iters=total_network_updates) + scheduler = LRScheduler(scheduler) + + test_env = make_test_env(config) + + def run_single_timestep(engine, _): + frames_in_batch = engine.state.data.numel() + trainer.state.collected_frames += frames_in_batch * config.frame_skip + data_view = engine.state.data.reshape(-1) + + with torch.no_grad(): + batch = adv_module(data_view) + + # Normalize advantage + adv = batch.get("advantage") + + # mean of the advantage values + loc = adv.mean().item() + # standard deviation of the advantage values + scale = adv.std().clamp_min(1e-6).item() + # normalizing the advantage values + adv = (adv - loc) / scale + batch.set("advantage", adv) + + # Forward pass A2C loss + batch = batch.to(device) + loss = loss_module(batch) + loss_sum = loss["loss_critic"] + loss["loss_objective"] + loss["loss_entropy"] + + # Backward pass + learning step + loss_sum.backward() + grad_norm = torch.nn.utils.clip_grad_norm_(list(actor.parameters()) + list(critic.parameters()), max_norm=0.5) + engine.state.metrics = { + "loss_sum": loss_sum.item(), + } + optim.step() + optim.zero_grad() + + trainer = Engine(run_single_timestep) + + logger = setup_logging(config) + logger.info("Configuration: \n%s", pformat(vars(config))) + trainer.logger = logger + + if config.lr_scheduler: + trainer.add_event_handler(Events.ITERATION_COMPLETED, scheduler) + + trainer.add_event_handler( + Events.ITERATION_COMPLETED(every=config.log_every_episodes), + log_metrics, + tag="train", + ) + + @trainer.on(Events.ITERATION_STARTED) + def update_data(): + # print(f"New iteration started") + trainer.state.data = next(iter(collector)) + trainer.state.collected_frames = 0 + + @trainer.on(Events.ITERATION_COMPLETED) + def log2(): + collector.update_policy_weights_() + + # timesteps = range(config.steps_per_episode) + trainer.run(epoch_length=int(config.total_frames / config.frames_per_batch), max_epochs=1) + + +if __name__ == "__main__": + main() diff --git a/src/templates/template-reinforcement-learning/a2c_model_env.py 
b/src/templates/template-reinforcement-learning/a2c_model_env.py new file mode 100644 index 00000000..afcdadeb --- /dev/null +++ b/src/templates/template-reinforcement-learning/a2c_model_env.py @@ -0,0 +1,249 @@ +import gymnasium as gym + +import torch + +import torch.nn +import torch.optim +from ignite.contrib.engines import common +from ignite.engine import Engine + +from ignite.engine.events import Events + +from ignite.utils import setup_logger + +from tensordict.nn import TensorDictModule + +from torchrl.collectors import SyncDataCollector +from torchrl.data import CompositeSpec + +from torchrl.data.tensor_specs import DiscreteBox +from torchrl.envs import ( + EnvCreator, + ExplorationType, + ObservationNorm, + ParallelEnv, + StepCounter, + ToTensorImage, + TransformedEnv, +) + +from torchrl.envs.libs.gym import GymWrapper +from torchrl.modules import ActorValueOperator, ConvNet, MLP, OneHotCategorical, ProbabilisticActor, ValueOperator +from torchrl.objectives import A2CLoss +from torchrl.objectives.value import GAE +from torchrl.objectives.value.advantages import GAE + +# from torchrl.trainers.helpers.envs import get_norm_state_dict + +from utils import * + + +def make_base_env(config): + env_kwargs = {"id": "CarRacing-v2", "continuous": False, "render_mode": "rgb_array"} + + env = gym.make(**env_kwargs) + + if config.render: + + def trigger_recording(episode): + return episode % config.save_every_episodes == 0 + + env = gym.wrappers.RecordVideo(env, config.recordings_path, episode_trigger=trigger_recording, video_length=0) + + env_kwargs2 = { + "device": config.device, + "from_pixels": True, + "pixels_only": True, + } + + env = GymWrapper(env, **env_kwargs2) + print("Base Env Created") + return env + + +def get_stats(config): + env = make_transformed_env_pixels(make_base_env(config), config) + return get_norm_state_dict(env) + + +def make_transformed_env_pixels(base_env, config): + env = TransformedEnv(base_env) + env.append_transform(ToTensorImage()) + env.append_transform(StepCounter()) + + return env + + +def make_parallel_env(config, state_dict): + num_envs = config.num_envs + env = make_transformed_env_pixels(ParallelEnv(num_envs, EnvCreator(lambda: make_base_env(config))), config) + for t in env.transform: + if isinstance(t, ObservationNorm): + t.init_stats(3, cat_dim=1, reduce_dim=[0, 1]) + env.load_state_dict(state_dict, strict=False) + return env + + +def make_a2c_models(config): + base_env = make_transformed_env_pixels(make_base_env(config), config) + + common_module, policy_module, value_module = make_a2c_models_pixels(base_env, config) + + actor_critic = ActorValueOperator( + common_operator=common_module, + policy_operator=policy_module, + value_operator=value_module, + ) + + actor = actor_critic.get_policy_operator() + critic = actor_critic.get_value_head() # to avoid + + with torch.no_grad(): + td = base_env.rollout(max_steps=100, break_when_any_done=False) + td = actor(td) + td = critic(td) + del td + + return actor, critic + + +def make_a2c_models_pixels(base_env, config): + env = base_env + + # define the input shape + input_shape = env.observation_spec["pixels"].shape + + # defining the distribution class and kwargs, in this case, the action space is DiscreteBox + if isinstance(env.action_spec.space, DiscreteBox): + num_outputs = env.action_spec.space.n + distribution_class = OneHotCategorical + distribution_kwargs = {} + + # Define the input keys + in_keys = ["pixels"] + + # Define a shared Module and TensorDictModule (CNN + MLP) + common_cnn = ConvNet( + 
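+        # Shared feature extractor over the CarRacing pixel observations;
+        # its output ("common_features") feeds both the policy head and the value head defined below.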
activation_class=torch.nn.ReLU, + num_cells=[32, 64, 64], + kernel_sizes=[3, 1, 1], + strides=[2, 2, 1], + device=config.device, + ) + common_cnn_output = common_cnn(torch.ones(input_shape).to(config.device)) + common_mlp = MLP( + in_features=common_cnn_output.shape[-1], + activation_class=torch.nn.ReLU, + activate_last_layer=True, + out_features=512, + num_cells=[], + device=config.device, + ) + common_mlp_output = common_mlp(common_cnn_output).to(config.device) + + # Define shared net as TensorDictModule + common_module = TensorDictModule( + module=torch.nn.Sequential(common_cnn, common_mlp), + in_keys=in_keys, + out_keys=["common_features"], + ) + + # Define on head for the policy + policy_net = MLP( + in_features=common_mlp_output.shape[-1], + out_features=num_outputs, + num_cells=[256], + device=config.device, + ) + policy_module = TensorDictModule( + module=policy_net, + in_keys=["common_features"], + out_keys=["logits"], + ) + + # Add probabilistic sampling of the actions + policy_module = ProbabilisticActor( + policy_module, + in_keys=["logits"], + spec=CompositeSpec(action=env.action_spec), + safe=True, + distribution_class=distribution_class, + distribution_kwargs=distribution_kwargs, + return_log_prob=True, + default_interaction_type=ExplorationType.RANDOM, + ) + + # Define another head for the value + value_net = MLP( + in_features=common_mlp_output.shape[-1], + out_features=1, + num_cells=[256], + device=config.device, + ) + value_module = ValueOperator( + value_net, + in_keys=["common_features"], + ) + + return common_module, policy_module, value_module + + +def make_collector(config, policy): + collector_class = SyncDataCollector + state_dict = get_stats(config) + collector = collector_class( + make_parallel_env(config, state_dict), + policy=policy, + frames_per_batch=config.frames_per_batch, + total_frames=config.total_frames, + device=config.device, + max_frames_per_traj=config.max_frames_per_traj, + ) + return collector + + +def make_advantage_module(config, value_network): + advantage_module = GAE( + gamma=config.gamma, + lmbda=config.gae_lambda, + value_network=value_network, + average_gae=True, + ) + return advantage_module + + +def make_test_env(config): + num_envs = 1 + state_dict = get_stats(config) + env = make_parallel_env(config, state_dict) + return env + + +def make_loss(config, actor_network, value_network): + advantage_module = make_advantage_module(config, value_network) + loss_module = A2CLoss( + actor=actor_network, + critic=value_network, + loss_critic_type=config.loss_critic_type, + entropy_coef=config.entropy_coef, + critic_coef=config.critic_coef, + entropy_bonus=True, + ) + loss_module.make_value_estimator(gamma=config.gamma) + return loss_module, advantage_module + + +def make_optim(config, actor_network, value_network): + optim = torch.optim.Adam( + list(actor_network.parameters()) + list(value_network.parameters()), + lr=config.lr, + weight_decay=config.weight_decay, + ) + return optim + + +def get_norm_state_dict(env): + """Gets the normalization loc and scale from the env state_dict.""" + sd = env.state_dict() + sd = {key: val for key, val in sd.items() if key.endswith("loc") or key.endswith("scale")} + return sd diff --git a/src/templates/template-reinforcement-learning/advantage_actor_critic_a2c.py b/src/templates/template-reinforcement-learning/advantage_actor_critic_a2c.py deleted file mode 100644 index e817bbab..00000000 --- a/src/templates/template-reinforcement-learning/advantage_actor_critic_a2c.py +++ /dev/null @@ -1,234 +0,0 @@ -from 
collections import deque, namedtuple - -from shutil import copy - -import ignite.distributed as idist -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim - -from ignite.engine import Engine, Events - -from ignite.utils import manual_seed - -from torch.distributions import Categorical - -from utils import * - -from typing import Any - -import numpy as np - -# from matplotlib import pyplot as plt - -try: - import gymnasium as gym -except ImportError: - raise ModuleNotFoundError("Please install opengym: pip install gymnasium[box2d]") - -SavedAction = namedtuple("SavedAction", ["log_prob", "value"]) - -eps = np.finfo(np.float32).eps.item() - - -class ActorCriticNetwork(nn.Module): - def __init__(self, n_actions): - super(ActorCriticNetwork, self).__init__() - self.LeakyReLU = nn.LeakyReLU() - self.Sigmoid = nn.Sigmoid() - self.Softplus = nn.Softplus() - - # REVIEW: - # OPTIMIZE: - self.conv1 = nn.Conv2d(3, 8, kernel_size=7, stride=4, padding=0) - self.conv2 = nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=2) - self.pool = nn.MaxPool2d(kernel_size=2, stride=2) - - self.fc1 = nn.Linear(576, 512) - self.fc_critic2 = nn.Linear(512, 1) - self.fc_actor2 = nn.Linear(512, 256) - self.fc_actor3 = nn.Linear(256, n_actions) - - self.flatten = nn.Flatten() - - self.saved_actions = [] - self.rewards = [] - self.saved_log_probs = [] - - def forward(self, observation): - # state = torch.Tensor(observation).to(self.device) - - # Shared weights - x = self.LeakyReLU(self.conv1(observation)) - x = self.pool(x) - x = self.LeakyReLU(self.conv2(x)) - x = self.pool(x) - x = self.flatten(x) - x = self.fc1(x) - - # actor and critic - # actor - dist = self.LeakyReLU(self.fc_actor2(x)) - dist = self.Softplus(self.fc_actor3(dist)) - - actor = F.softmax(dist, dim=1) - - # critic - critic = self.fc_critic2(x) - - return actor, critic - - -# choose an action for the discrete actions -def choose_action(policy, observation): - observation = observation.float().unsqueeze(0) - state = torch.transpose(observation, 1, 3) - probabilities, value = policy(state) - # probabilities = F.softmax(probabilities) - - action_probs = Categorical(probabilities) - action = action_probs.sample() - - log_probs = action_probs.log_prob(action) - policy.saved_actions.append(SavedAction(log_probs, value)) - policy.saved_log_probs.append(log_probs) - - return action.item() - - -def learn(policy, optimizer, gamma): - R = 0 - saved_actions = policy.saved_actions - policy_losses = [] # list to save actor (policy) loss - value_losses = [] # list to save critic (value) loss - returns = deque() # list to save the true values - - for r in policy.rewards[::-1]: - # calculate the discounted value - R = r + gamma * R - returns.appendleft(R) - - returns = torch.tensor(returns) - returns = (returns - returns.mean()) / (returns.std() + eps) - - for (log_prob, value), R in zip(saved_actions, returns): - advantage = R - value.item() - - # calculate actor (policy) loss - policy_losses.append(-log_prob * advantage) - - # calculate critic (value) loss using L1 smooth loss - value_losses.append(F.smooth_l1_loss(value, torch.tensor([R]))) - - # reset gradients - optimizer.zero_grad() - - # sum up all the values of policy_losses and value_losses - loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum() - - # perform backprop - loss.backward() - optimizer.step() - # reset rewards and action buffer - del policy.rewards[:] - del policy.saved_actions[:] - - -EPISODE_STARTED = Events.EPOCH_STARTED 
-EPISODE_COMPLETED = Events.EPOCH_COMPLETED - - -def run(local_rank: int, env: Any, config: Any): - # make seed - rank = idist.get_rank() - manual_seed(config.seed + rank) - - # create output folder and copy config file to ouput dir - config.output_dir = setup_output_dir(config, rank) - if rank == 0: - copy(config.config, f"{config.output_dir}/config-lock.yaml") - - # create wrapper for saving video - if config.render: - - def trigger(episode): - return episode % config.save_every_episodes == 0 - - env = gym.wrappers.RecordVideo(env, config.recordings_path, trigger) - - # device, policy, optimizer - device = idist.device() - actor_critic = ActorCriticNetwork(env.action_space.n).to(device) - optimizer = idist.auto_optim(optim.Adam(actor_critic.parameters(), lr=config.lr, betas=(0.9, 0.999))) - - # device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - # self.to(self.device) - timesteps = range(10000) - - def run_single_timestep(engine, timestep): - observation = engine.state.observation - - # select action from the policy - observation = torch.Tensor(observation).to(device) - action = choose_action(actor_critic, observation) - - engine.state.observation, reward, done, _, _ = env.step(action) - - if config.render: - env.render() - - actor_critic.rewards.append(reward) - engine.state.ep_reward += reward - if done: - engine.terminate_epoch() - engine.state.timestep = timestep - - trainer = Engine(run_single_timestep) - trainer.state.running_reward = 10 - - @trainer.on(EPISODE_STARTED) - def reset_environment_state(): - # reset environment and episode reward - torch.manual_seed(config.seed + trainer.state.epoch) - trainer.state.observation, _ = env.reset(seed=config.seed + trainer.state.epoch) - trainer.state.ep_reward = 0 - - @trainer.on(EPISODE_COMPLETED) - def update_model(): - # update cumulative reward - trainer.state.running_reward = 0.05 * trainer.state.ep_reward + (1 - 0.05) * trainer.state.running_reward - # perform backprop - learn(actor_critic, optimizer, config.gamma) - - @trainer.on(EPISODE_COMPLETED(every=config.log_every_episodes)) - def log_episode(): - i_episode = trainer.state.epoch - print( - f"Episode {i_episode}\tLast reward: {trainer.state.ep_reward:.2f}" - f"\tAverage reward: {trainer.state.running_reward:.2f}" - ) - - @trainer.on(EPISODE_COMPLETED) - def should_finish_training(): - # check if we have "solved" the cart pole problem - running_reward = trainer.state.running_reward - if running_reward > env.spec.reward_threshold: - print( - f"Solved! Running reward is now {running_reward} and " - f"the last episode runs to {trainer.state.timestep} time steps!" 
- ) - trainer.should_terminate = True - - trainer.run(timesteps, max_epochs=config.max_episodes) - - -def main(): - config = setup_config() - env = gym.make("CarRacing-v2", continuous=False, render_mode="rgb_array" if config.render else None) - with idist.Parallel(config.backend) as p: - p.run(run, env=env, config=config) - - -if __name__ == "__main__": - main() diff --git a/src/templates/template-reinforcement-learning/config.yaml b/src/templates/template-reinforcement-learning/config.yaml deleted file mode 100644 index f0ce3224..00000000 --- a/src/templates/template-reinforcement-learning/config.yaml +++ /dev/null @@ -1,9 +0,0 @@ -seed: 666 -render: true -gamma: 0.99 -recordings_path: ./recordings -lr: 0.0003 -max_episodes: 10000 -log_every_episodes: 1 -save_every_episodes: 10 -output_dir: ./logs diff --git a/src/templates/template-reinforcement-learning/config_a2c.yaml b/src/templates/template-reinforcement-learning/config_a2c.yaml new file mode 100644 index 00000000..aceb6e2b --- /dev/null +++ b/src/templates/template-reinforcement-learning/config_a2c.yaml @@ -0,0 +1,37 @@ +# task and env +frame_skip: 2 +num_envs: 1 +reward_scaling: 1.0 +from_pixels: True +render_mode: rgb_array +continuous: False +pixels_only: True + +# collector: +frames_per_batch: 64 +total_frames: 1_000_000 +max_frames_per_traj: -1 + +# logger: +log_interval: 10000 + +# optim: +lr: 0.0005 +weight_decay: 0.0 +lr_scheduler: True + +# loss: +gamma: 0.99 +gae_lambda: 0.95 +critic_coef: 0.5 +entropy_coef: 0.01 +loss_critic_type: l2 + +seed: 666 +render: true +recordings_path: ./recordings +max_episodes: 10000 +log_every_episodes: 50 +save_every_episodes: 50 +output_dir: ./logs +debug: false diff --git a/src/templates/template-reinforcement-learning/dqn_config.yaml b/src/templates/template-reinforcement-learning/dqn_config.yaml deleted file mode 100644 index 9a49144b..00000000 --- a/src/templates/template-reinforcement-learning/dqn_config.yaml +++ /dev/null @@ -1,14 +0,0 @@ -seed: 666 -render: true -gamma: 0.975 -recordings_path: ./recordings -lr: 0.001 -max_episodes: 500 -log_every_episodes: 1 -save_every_episodes: 10 -output_dir: ./logs -epsilon: 1.0 -epsilon_min: 0.01 -epsilon_decay: 0.9999 -batch_size: 512 -steps_per_episode: 1500 \ No newline at end of file diff --git a/src/templates/template-reinforcement-learning/reinforce.py b/src/templates/template-reinforcement-learning/reinforce.py deleted file mode 100644 index 2df96153..00000000 --- a/src/templates/template-reinforcement-learning/reinforce.py +++ /dev/null @@ -1,199 +0,0 @@ -from collections import deque - -from shutil import copy - -import ignite.distributed as idist -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim - -from ignite.engine import Engine, Events - -from ignite.utils import manual_seed - -from torch.distributions import Categorical - -from utils import * - -from typing import Any - -import numpy as np - -try: - import gymnasium as gym -except ImportError: - raise ModuleNotFoundError("Please install opengym: pip install gymnasium[box2d]") - - -eps = np.finfo(np.float32).eps.item() - - -class Policy(nn.Module): - def __init__(self, state_dim, output_actions) -> None: - super(Policy, self).__init__() - - self.conv = nn.Sequential( - nn.Conv2d(state_dim, 32, kernel_size=3, stride=4), - nn.ReLU(), - nn.Conv2d(32, 64, kernel_size=1, stride=2), - nn.ReLU(), - nn.Conv2d(64, 64, kernel_size=1, stride=1), - nn.ReLU(), - nn.Flatten(), - ) - - conv_out_size = self._get_conv_out(state_dim) - - self.fc1 = 
nn.Linear(conv_out_size, 512) - self.fc2 = nn.Linear(512, 128) - self.fc3 = nn.Linear(128, output_actions) - - self.relu = nn.ReLU() - - self.saved_log_probs = [] - self.rewards = [] - - def forward(self, x): - x = self.conv(x) - # x = self.dp(x) - x = self.fc1(x) - x = self.relu(x) - x = self.fc2(x) - x = self.relu(x) - action_scores = self.fc3(x) - return F.softmax(action_scores, dim=1) - - def _get_conv_out(self, shape): - x = torch.zeros(1, *shape) - x = self.conv(x) - - return int(np.prod(x.size())) - - -def choose_action(policy, observation): - state = torch.from_numpy(observation).float().unsqueeze(0) - probs = policy(state) - m = Categorical(probs) - action = m.sample() - policy.saved_log_probs.append(m.log_prob(action)) - return action.item() - - -def learn(policy, optimizer, gamma): - R = 0 - policy_loss = [] - returns = deque() - for r in policy.rewards[::-1]: - R = r + gamma * R - returns.appendleft(R) - returns = torch.tensor(returns) - returns = (returns - returns.mean()) / (returns.std() + eps) - for log_prob, R in zip(policy.saved_log_probs, returns): - policy_loss.append(-log_prob * R) - optimizer.zero_grad() - policy_loss = torch.cat(policy_loss).sum() - policy_loss.backward() - optimizer.step() - del policy.rewards[:] - del policy.saved_log_probs[:] - - -EPISODE_STARTED = Events.EPOCH_STARTED -EPISODE_COMPLETED = Events.EPOCH_COMPLETED - - -def run(local_rank: int, env: Any, config: Any): - # make seed - rank = idist.get_rank() - manual_seed(config.seed + rank) - - # create output folder and copy config file to output dir - config.output_dir = setup_output_dir(config, rank) - if rank == 0: - copy(config.config, f"{config.output_dir}/config-lock.yaml") - - # create wrapper for saving video - if config.render: - - def trigger(episode): - return episode % config.save_every_episode == 0 - - env = gym.wrappers.RecordVideo(env, config.recordings_path, trigger) - - # device, policy, optimizer - device = idist.device() - policy = Policy(env.observation_space.shape[0], env.action_space.n).to(device) - - optimizer = idist.auto_optim(optim.Adam(actor_critic.parameters(), lr=config.lr, betas=(0.9, 0.999))) - - # device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - # self.to(self.device) - timesteps = range(10000) - - def run_single_timestep(engine, timestep): - observation = engine.state.observation - - # select action from the policy - observation = torch.Tensor(observation).to(device) - action = choose_action(policy, observation) - - engine.state.observation, reward, done, _, _ = env.step(action) - - if config.render: - env.render() - - policy.rewards.append(reward) - engine.state.ep_reward += reward - if done: - engine.terminate_epoch() - engine.state.timestep = timestep - - trainer = Engine(run_single_timestep) - trainer.state.running_reward = 10 - - @trainer.on(EPISODE_STARTED) - def reset_environment_state(): - # reset environment and episode reward - torch.manual_seed(config.seed + trainer.state.epoch) - trainer.state.observation, _ = env.reset(seed=config.seed + trainer.state.epoch) - trainer.state.ep_reward = 0 - - @trainer.on(EPISODE_COMPLETED) - def update_model(): - # update cumulative reward - trainer.state.running_reward = 0.05 * trainer.state.ep_reward + (1 - 0.05) * trainer.state.running_reward - # perform backprop - learn(policy, optimizer, config.gamma) - - @trainer.on(EPISODE_COMPLETED(every=config.log_every_episodes)) - def log_episode(): - i_episode = trainer.state.epoch - print( - f"Episode {i_episode}\tLast reward: 
{trainer.state.ep_reward:.2f}" - f"\tAverage reward: {trainer.state.running_reward:.2f}" - ) - - @trainer.on(EPISODE_COMPLETED) - def should_finish_training(): - # check if we have "solved" the cart pole problem - running_reward = trainer.state.running_reward - if running_reward > env.spec.reward_threshold: - print( - f"Solved! Running reward is now {running_reward} and " - f"the last episode runs to {trainer.state.timestep} time steps!" - ) - trainer.should_terminate = True - - trainer.run(timesteps, max_epochs=config.max_episodes) - - -def main(): - config = setup_config() - env = gym.make("CarRacing-v2", continuous=False, render_mode="rgb_array" if config.render else None) - with idist.Parallel(config.backend) as p: - p.run(run, env=env, config=config) - - -if __name__ == "__main__": - main() diff --git a/src/templates/template-reinforcement-learning/requirements.txt b/src/templates/template-reinforcement-learning/requirements.txt new file mode 100644 index 00000000..a4380b48 --- /dev/null +++ b/src/templates/template-reinforcement-learning/requirements.txt @@ -0,0 +1,11 @@ +#::= from_template_common ::# + +torchrl +swig +gymnasium[box2d] +moviepy + +# for colab: +# !pip install -q swig +# !pip install gymnasium[box2d] +# !pip install torchrl \ No newline at end of file diff --git a/src/templates/templates.json b/src/templates/templates.json index 47b4257a..2690ac9d 100644 --- a/src/templates/templates.json +++ b/src/templates/templates.json @@ -43,5 +43,13 @@ "utils.py", "test_all.py", "requirements.txt" + ], + "template-reinforcement-learning": [ + "README.md", + "config_a2c.yaml", + "a2c_model_env.py", + "a2c.py", + "utils.py", + "requirements.txt" ] } From ef1a873b1088c0701c7d94ee41c1db63a44c7f2e Mon Sep 17 00:00:00 2001 From: Jyotirmay Khavasi Date: Fri, 25 Aug 2023 19:55:21 +0530 Subject: [PATCH 4/5] Update README and requirements --- .../template-reinforcement-learning/README.md | 30 ++++++++++++++++++- .../requirements.txt | 8 +---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/templates/template-reinforcement-learning/README.md b/src/templates/template-reinforcement-learning/README.md index 17945f75..ab78fd32 100644 --- a/src/templates/template-reinforcement-learning/README.md +++ b/src/templates/template-reinforcement-learning/README.md @@ -4,4 +4,32 @@ This is the Reinforcement Learning template by Code-Generator using OpenAI Gym for the environment CarRacing-v2. 
-#::= from_template_common ::# +## Getting Started + +Install the dependencies with `pip`: + +```sh +pip install -r requirements.txt --progress-bar off -U +``` + +### Code structure + +``` +| +|- README.md +| +|- a2c.py : main script to run +|- a2c_model_env.py : Utility functions for the reinforcement learning template for various tasks +|- utils.py : module with various helper functions +|- requirements.txt : dependencies to install with pip +| +|- config_a2c.yaml : global configuration YAML file +``` + +## Training + +### 1 GPU Training + +```sh +python a2c.py config_a2c.yaml +``` diff --git a/src/templates/template-reinforcement-learning/requirements.txt b/src/templates/template-reinforcement-learning/requirements.txt index a4380b48..e7650b4c 100644 --- a/src/templates/template-reinforcement-learning/requirements.txt +++ b/src/templates/template-reinforcement-learning/requirements.txt @@ -1,11 +1,5 @@ #::= from_template_common ::# - torchrl swig gymnasium[box2d] -moviepy - -# for colab: -# !pip install -q swig -# !pip install gymnasium[box2d] -# !pip install torchrl \ No newline at end of file +moviepy \ No newline at end of file From 7619b245859037d101d60cba57eee0cdc703a747 Mon Sep 17 00:00:00 2001 From: Jyotirmay Khavasi Date: Fri, 25 Aug 2023 20:41:42 +0530 Subject: [PATCH 5/5] Modify in the colab, remove from requirements --- functions/colab.js | 7 +++++++ src/templates/template-reinforcement-learning/a2c.py | 4 ++-- .../template-reinforcement-learning/requirements.txt | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/functions/colab.js b/functions/colab.js index 193d5b3f..9fd88551 100644 --- a/functions/colab.js +++ b/functions/colab.js @@ -37,6 +37,13 @@ exports.handler = async function (event, _) { ) } + if (title === 'Template Reinforcement Learning') { + specific_commands.push( + '!pip install swig\n', + '!pip install gymnasium[box2d]' + ) + } + const md_cell = [ `# ${title} by PyTorch-Ignite Code-Generator\n\n`, 'Please, run the cell below to execute your code.' diff --git a/src/templates/template-reinforcement-learning/a2c.py b/src/templates/template-reinforcement-learning/a2c.py index 39314980..13710548 100644 --- a/src/templates/template-reinforcement-learning/a2c.py +++ b/src/templates/template-reinforcement-learning/a2c.py @@ -17,13 +17,13 @@ def main(): config = setup_config() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - config.device = device + config.device = f"{device}" rank = idist.get_rank() manual_seed(config.seed + rank) config.output_dir = setup_output_dir(config, rank) if rank == 0: - copy(config.config, f"{config.output_dir}/config-lock.yaml") + save_config(config, config.output_dir) actor, critic = make_a2c_models(config) actor = actor.to(device) diff --git a/src/templates/template-reinforcement-learning/requirements.txt b/src/templates/template-reinforcement-learning/requirements.txt index e7650b4c..af9d9f2f 100644 --- a/src/templates/template-reinforcement-learning/requirements.txt +++ b/src/templates/template-reinforcement-learning/requirements.txt @@ -1,5 +1,3 @@ #::= from_template_common ::# torchrl -swig -gymnasium[box2d] moviepy \ No newline at end of file