
add Atari #106

Merged · 4 commits · Jun 8, 2023
29 changes: 19 additions & 10 deletions Gallery.md
@@ -13,6 +13,12 @@ Users are also welcome to contribute their own training examples and demos to th
![offline](https://img.shields.io/badge/-offlineRL-darkblue)
![selfplay](https://img.shields.io/badge/-selfplay-blue)
![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue)
![image](https://img.shields.io/badge/-image-red)

![value](https://img.shields.io/badge/-value-orange) (Value-based RL)

![offpolicy](https://img.shields.io/badge/-offpolicy-blue) (Off-policy RL)


![discrete](https://img.shields.io/badge/-discrete-brightgreen) (Discrete Action Space)

@@ -26,14 +32,15 @@ Users are also welcome to contribute their own training examples and demos to th

<div align="center">

| Algorithm | Tags | Refs |
|:-------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------:|:-------------------------------:|
| [PPO](https://arxiv.org/abs/1707.06347) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [code](./examples/cartpole/) |
| [PPO-continuous](https://arxiv.org/abs/1707.06347) | ![continuous](https://img.shields.io/badge/-continous-green) | [code](./examples/mujoco/) |
| [Dual-clip PPO](https://arxiv.org/abs/1912.09729) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [code](./examples/cartpole/) |
| [MAPPO](https://arxiv.org/abs/2103.01955) | ![MARL](https://img.shields.io/badge/-MARL-yellow) | [code](./examples/mpe/) |
| [JRPO](https://arxiv.org/abs/2302.07515) | ![MARL](https://img.shields.io/badge/-MARL-yellow) | [code](./examples/mpe/) |
| [MAT](https://arxiv.org/abs/2205.14953) | ![MARL](https://img.shields.io/badge/-MARL-yellow) ![Transformer](https://img.shields.io/badge/-Transformer-blue) | [code](./examples/mpe/) |
| Algorithm | Tags | Refs |
|:--------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------:|:-----------------------------:|
| [PPO](https://arxiv.org/abs/1707.06347) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [code](./examples/cartpole/) |
| [PPO-continuous](https://arxiv.org/abs/1707.06347) | ![continuous](https://img.shields.io/badge/-continous-green) | [code](./examples/mujoco/) |
| [Dual-clip PPO](https://arxiv.org/abs/1912.09729) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [code](./examples/cartpole/) |
| [MAPPO](https://arxiv.org/abs/2103.01955) | ![MARL](https://img.shields.io/badge/-MARL-yellow) | [code](./examples/mpe/) |
| [JRPO](https://arxiv.org/abs/2302.07515) | ![MARL](https://img.shields.io/badge/-MARL-yellow) | [code](./examples/mpe/) |
| [DQN](https://arxiv.org/abs/1312.5602) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![value](https://img.shields.io/badge/-value-orange) ![offpolicy](https://img.shields.io/badge/-offpolicy-blue) | [code](./examples/gridworld/) |
| [MAT](https://arxiv.org/abs/2205.14953) | ![MARL](https://img.shields.io/badge/-MARL-yellow) ![Transformer](https://img.shields.io/badge/-Transformer-blue) | [code](./examples/mpe/) |
</div>

## Demo List
@@ -46,6 +53,8 @@ Users are also welcome to contribute their own training examples and demos to th
| [CartPole](https://gymnasium.farama.org/environments/classic_control/cart_pole/)<br> <img width="300px" height="auto" src="./docs/images/cartpole.png"> | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [code](./examples/cartpole/) |
| [MPE: Simple Spread](https://pettingzoo.farama.org/environments/mpe/simple_spread/)<br> <img width="300px" height="auto" src="./docs/images/simple_spread_trained.gif"> | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![MARL](https://img.shields.io/badge/-MARL-yellow) | [code](./examples/mpe/) |
| [Chat Bot](https://openrl-docs.readthedocs.io/en/latest/quick_start/train_nlp.html)<br> <img width="300px" height="auto" src="./docs/images/chat.gif"> | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![NLP](https://img.shields.io/badge/-NLP-green) ![Transformer](https://img.shields.io/badge/-Transformer-blue) | [code](./examples/nlp/) |
| [Super Mario Bros](https://github.com/Kautenja/gym-super-mario-bros)<br> <img width="300px" height="auto" src="https://user-images.githubusercontent.com/2184469/40948820-3d15e5c2-6830-11e8-81d4-ecfaffee0a14.png"> | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [code](./examples/super_mario/) |
| [Gym Retro](https://github.com/openai/retro)<br> <img width="300px" height="auto" src="./docs/images/gym-retro.jpg"> | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [code](./examples/retro/) |
| [Atari Pong](https://gymnasium.farama.org/environments/atari/pong/)<br> <img width="300px" height="auto" src="./docs/images/pong.png"> | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![image](https://img.shields.io/badge/-image-red) | [code](./examples/atari/) |
| [GridWorld](./examples/gridworld/)<br> <img width="300px" height="auto" src="./docs/images/gridworld.jpg"> | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [code](./examples/gridworld/) |
| [Super Mario Bros](https://github.com/Kautenja/gym-super-mario-bros)<br> <img width="300px" height="auto" src="https://user-images.githubusercontent.com/2184469/40948820-3d15e5c2-6830-11e8-81d4-ecfaffee0a14.png"> | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![image](https://img.shields.io/badge/-image-red) | [code](./examples/super_mario/) |
| [Gym Retro](https://github.com/openai/retro)<br> <img width="300px" height="auto" src="./docs/images/gym-retro.jpg"> | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![image](https://img.shields.io/badge/-image-red) | [code](./examples/retro/) |
</div>
3 changes: 3 additions & 0 deletions README.md
@@ -74,13 +74,16 @@ Algorithms currently supported by OpenRL (for more details, please refer to [Gal
- [Dual-clip PPO](https://arxiv.org/abs/1912.09729)
- [Multi-agent PPO (MAPPO)](https://arxiv.org/abs/2103.01955)
- [Joint-ratio Policy Optimization (JRPO)](https://arxiv.org/abs/2302.07515)
- [Deep Q-Network (DQN)](https://arxiv.org/abs/1312.5602)
- [Multi-Agent Transformer (MAT)](https://arxiv.org/abs/2205.14953)

Environments currently supported by OpenRL (for more details, please refer to [Gallery](./Gallery.md)):
- [Gymnasium](https://gymnasium.farama.org/)
- [MuJoCo](https://github.com/deepmind/mujoco)
- [MPE](https://github.com/openai/multiagent-particle-envs)
- [Chat Bot](https://openrl-docs.readthedocs.io/en/latest/quick_start/train_nlp.html)
- [Atari](https://gymnasium.farama.org/environments/atari/)
- [GridWorld](./examples/gridworld/)
- [Super Mario Bros](https://github.com/Kautenja/gym-super-mario-bros)
- [Gym Retro](https://github.com/openai/retro)

3 changes: 3 additions & 0 deletions README_zh.md
@@ -59,13 +59,16 @@ OpenRL目前支持的算法(更多详情请参考 [Gallery](Gallery.md)):
- [Dual-clip PPO](https://arxiv.org/abs/1912.09729)
- [Multi-agent PPO (MAPPO)](https://arxiv.org/abs/2103.01955)
- [Joint-ratio Policy Optimization (JRPO)](https://arxiv.org/abs/2302.07515)
- [Deep Q-Network (DQN)](https://arxiv.org/abs/1312.5602)
- [Multi-Agent Transformer (MAT)](https://arxiv.org/abs/2205.14953)

OpenRL目前支持的环境(更多详情请参考 [Gallery](Gallery.md)):
- [Gymnasium](https://gymnasium.farama.org/)
- [MuJoCo](https://github.com/deepmind/mujoco)
- [MPE](https://github.com/openai/multiagent-particle-envs)
- [Chat Bot](https://openrl-docs.readthedocs.io/en/latest/quick_start/train_nlp.html)
- [Atari](https://gymnasium.farama.org/environments/atari/)
- [GridWorld](./examples/gridworld/)
- [Super Mario Bros](https://github.com/Kautenja/gym-super-mario-bros)
- [Gym Retro](https://github.com/openai/retro)

Binary file added docs/images/gridworld.jpg
Binary file added docs/images/pong.png
19 changes: 19 additions & 0 deletions examples/atari/README.md
@@ -0,0 +1,19 @@
## Installation

`pip install "gymnasium[atari]"`

Then install the Atari ROMs and accept the license via AutoROM:
`pip install "gymnasium[accept-rom-license]"`

or:
```shell
pip install autorom

AutoROM --accept-license
```
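
An optional sanity check (not part of this PR, and assuming the Gymnasium Atari extras above are installed) to confirm the ROMs are visible:

```python
import gymnasium as gym

# If the ROM license was accepted, constructing the environment succeeds.
env = gym.make("ALE/Pong-v5")
obs, info = env.reset()
print(obs.shape)  # Pong observations are 210x160x3 RGB frames
env.close()
```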

## Usage

```shell
python train_ppo.py --config atari_ppo.yaml
```
23 changes: 23 additions & 0 deletions examples/atari/atari_ppo.yaml
@@ -0,0 +1,23 @@
seed: 0
lr: 2.5e-4
critic_lr: 2.5e-4
episode_length: 128
ppo_epoch: 4
gain: 0.01
use_linear_lr_decay: true
use_share_model: true
entropy_coef: 0.01
hidden_size: 512
num_mini_batch: 4
clip_param: 0.1
value_loss_coef: 0.5
run_dir: ./run_results/
experiment_name: atari_ppo
log_interval: 1
use_recurrent_policy: false
use_valuenorm: true
use_adv_normalize: true
wandb_entity: openrl-lab

vec_info_class:
id: "EPS_RewardInfo"
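
For reference, the training script reads this file through OpenRL's config parser. A minimal sketch of loading it programmatically, mirroring the pattern used in the gridworld example in this PR (the final print assumes the parsed config exposes the YAML keys as attributes):

```python
from openrl.configs.config import create_config_parser

# Parse the Atari PPO hyperparameters directly from the YAML file above.
cfg_parser = create_config_parser()
cfg = cfg_parser.parse_args(["--config", "atari_ppo.yaml"])

print(cfg.lr, cfg.episode_length)  # 2.5e-4 and 128 if the defaults above are kept
```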
100 changes: 100 additions & 0 deletions examples/atari/train_ppo.py
@@ -0,0 +1,100 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2023 The OpenRL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""""""
import numpy as np

from openrl.configs.config import create_config_parser
from openrl.envs.common import make
from openrl.envs.wrappers.atari_wrappers import (
    ClipRewardEnv,
    FireResetEnv,
    NoopResetEnv,
    WarpFrame,
)
from openrl.envs.wrappers.image_wrappers import TransposeImage
from openrl.envs.wrappers.monitor import Monitor
from openrl.modules.common import PPONet as Net
from openrl.runners.common import PPOAgent as Agent
from openrl.utils.util import get_system_info

env_wrappers = [
    Monitor,
    NoopResetEnv,
    FireResetEnv,
    WarpFrame,
    ClipRewardEnv,
    TransposeImage,
]


def train():
    cfg_parser = create_config_parser()
    cfg = cfg_parser.parse_args()

    # create environment, set environment parallelism to 9
    env = make("ALE/Pong-v5", env_num=9, cfg=cfg, env_wrappers=env_wrappers)

    # create the neural network
    net = Net(
        env, cfg=cfg, device="cuda" if "macOS" not in get_system_info()["OS"] else "cpu"
    )
    # initialize the trainer
    agent = Agent(net, use_wandb=True)
    # start training; set the total number of training steps to 5,000,000
    # agent.train(total_time_steps=1000)
    agent.train(total_time_steps=5000000)
    env.close()
    agent.save("./ppo_agent/")
    return agent


def evaluation(agent):
    # begin to test
    # Create an environment for testing with 3 parallel environments.
    # Render in group_human mode, except on Linux where rendering is disabled.
    env = make(
        "ALE/Pong-v5",
        render_mode=None if "Linux" in get_system_info()["OS"] else "group_human",
        env_num=3,
        asynchronous=False,
        env_wrappers=env_wrappers,
    )

    # Attach the environment the trained agent will interact with.
    agent.set_env(env)
    # Initialize the environment and get the initial observations and info.
    obs, info = env.reset(seed=0)
    done = False
    step = 0
    total_reward = 0
    while not np.any(done):
        # Predict the next action based on the current observation.
        action, _ = agent.act(obs, deterministic=True)
        obs, r, done, info = env.step(action)
        step += 1
        if step % 100 == 0:
            print(f"{step}: reward:{np.mean(r)}")
        total_reward += np.mean(r)
    env.close()
    print(f"total reward: {total_reward}")


if __name__ == "__main__":
    agent = train()
    evaluation(agent)
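
As a side note, the script above logs to Weights & Biases under the `openrl-lab` entity and trains for 5M steps. A shorter smoke-test variant — a sketch that reuses only the calls already shown in the file, with wandb disabled and a reduced step budget — could look like:

```python
def smoke_test():
    # Reuse the config and the env_wrappers list defined in train_ppo.py above.
    cfg_parser = create_config_parser()
    cfg = cfg_parser.parse_args(["--config", "atari_ppo.yaml"])

    # Fewer parallel environments, CPU only, no wandb logging.
    env = make("ALE/Pong-v5", env_num=2, cfg=cfg, env_wrappers=env_wrappers)
    net = Net(env, cfg=cfg, device="cpu")
    agent = Agent(net, use_wandb=False)

    agent.train(total_time_steps=10_000)  # a quick local check instead of a full run
    env.close()
```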
2 changes: 1 addition & 1 deletion examples/cartpole/train_dqn_beta.py
@@ -1,10 +1,10 @@
""""""
import numpy as np

from openrl.configs.config import create_config_parser
from openrl.envs.common import make
from openrl.modules.common import DQNNet as Net
from openrl.runners.common import DQNAgent as Agent
from openrl.configs.config import create_config_parser


def train():
2 changes: 1 addition & 1 deletion examples/cartpole/train_ppo.py
@@ -47,4 +47,4 @@ def evaluation(agent):

if __name__ == "__main__":
agent = train()
evaluation(agent)
evaluation(agent)
7 changes: 2 additions & 5 deletions examples/gridworld/train_dqn.py
@@ -1,10 +1,10 @@
""""""
import numpy as np

from openrl.configs.config import create_config_parser
from openrl.envs.common import make
from openrl.modules.common import DQNNet as Net
from openrl.runners.common import DQNAgent as Agent
from openrl.configs.config import create_config_parser


def train():
@@ -13,10 +13,7 @@ def train():
cfg = cfg_parser.parse_args(["--config", "dqn_gridworld.yaml"])

# create the environment
env = make(
"GridWorldEnv",
env_num=9
)
env = make("GridWorldEnv", env_num=9)
# create the neural network
net = Net(env, cfg=cfg)
# initialize the trainer
2 changes: 1 addition & 1 deletion examples/gridworld/train_ppo.py
@@ -47,4 +47,4 @@ def evaluation(agent):

if __name__ == "__main__":
agent = train()
evaluation(agent)
evaluation(agent)
2 changes: 1 addition & 1 deletion examples/mujoco/README.md
@@ -1,6 +1,6 @@
## Installation

`pip install mujoco`
xxx

## Usage

6 changes: 4 additions & 2 deletions openrl/algorithms/dqn.py
@@ -124,7 +124,9 @@ def dqn_update(self, sample, turn_on=True):

if self.update_count % self.target_update_frequency == 0:
self.update_count = 0
self.algo_module.models["target_q_net"].load_state_dict(self.algo_module.models["q_net"].state_dict())
self.algo_module.models["target_q_net"].load_state_dict(
self.algo_module.models["q_net"].state_dict()
)
return loss

def cal_value_loss(
@@ -228,7 +230,7 @@ def train(self, buffer, turn_on=True):
data_generator = buffer.feed_forward_generator(
None,
num_mini_batch=self.num_mini_batch,
mini_batch_size=self.mini_batch_size
mini_batch_size=self.mini_batch_size,
)

for sample in data_generator:
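
For context, the first hunk above only reflows the periodic hard target-network sync that DQN performs every `target_update_frequency` updates. A standalone sketch of that pattern with plain PyTorch modules (the module shapes and the frequency value are illustrative, not taken from OpenRL's config):

```python
import torch.nn as nn

q_net = nn.Linear(4, 2)         # stand-in for the online Q-network
target_q_net = nn.Linear(4, 2)  # stand-in for the target Q-network

TARGET_UPDATE_FREQUENCY = 100   # assumed value; OpenRL reads this from its config
update_count = 0


def maybe_sync_target() -> None:
    """Copy the online network's weights into the target network every N updates."""
    global update_count
    update_count += 1
    if update_count % TARGET_UPDATE_FREQUENCY == 0:
        update_count = 0
        target_q_net.load_state_dict(q_net.state_dict())
```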
28 changes: 14 additions & 14 deletions openrl/algorithms/vdn.py
@@ -123,12 +123,12 @@ def dqn_update(self, sample, turn_on=True):
return loss

def cal_value_loss(
self,
value_normalizer,
values,
value_preds_batch,
return_batch,
active_masks_batch,
self,
value_normalizer,
values,
value_preds_batch,
return_batch,
active_masks_batch,
):
value_pred_clipped = value_preds_batch + (values - value_preds_batch).clamp(
-self.clip_param, self.clip_param
@@ -137,7 +137,7 @@ def cal_value_loss(
if self._use_popart or self._use_valuenorm:
value_normalizer.update(return_batch)
error_clipped = (
value_normalizer.normalize(return_batch) - value_pred_clipped
value_normalizer.normalize(return_batch) - value_pred_clipped
)
error_original = value_normalizer.normalize(return_batch) - values
else:
@@ -158,8 +158,8 @@ def cal_value_loss(

if self._use_value_active_masks:
value_loss = (
value_loss * active_masks_batch
).sum() / active_masks_batch.sum()
value_loss * active_masks_batch
).sum() / active_masks_batch.sum()
else:
value_loss = value_loss.mean()

@@ -197,9 +197,7 @@ def prepare_loss(
critic_masks_batch=critic_masks_batch,
)

rewards_batch = rewards_batch.reshape(
-1, self.n_agent, 1
)
rewards_batch = rewards_batch.reshape(-1, self.n_agent, 1)
rewards_batch = torch.sum(rewards_batch, dim=2, keepdim=True).view(-1, 1)
q_targets = rewards_batch + self.gamma * max_next_q_values
q_loss = torch.mean(F.mse_loss(q_values, q_targets.detach()))  # mean squared error loss
@@ -227,7 +225,7 @@ def train(self, buffer, turn_on=True):
data_generator = buffer.feed_forward_generator(
None,
num_mini_batch=self.num_mini_batch,
#mini_batch_size=self.mini_batch_size,
# mini_batch_size=self.mini_batch_size,
)

for sample in data_generator:
@@ -253,6 +251,8 @@ def train(self, buffer, turn_on=True):
optimizer.sync_lookahead()

return train_info


'''

#!/usr/bin/env python
@@ -507,4 +507,4 @@ def train(self, buffer, turn_on=True):
optimizer.sync_lookahead()

return train_info
'''
'''
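
The reflowed VDN hunks above center on a joint TD target: per-agent rewards are combined into a team reward and regressed onto a summed Q-value with an MSE loss against a detached target. A small standalone sketch of that idea (tensor shapes and the explicit sum over agents are illustrative, not a line-for-line copy of `prepare_loss`):

```python
import torch
import torch.nn.functional as F

n_env, n_agent, gamma = 4, 3, 0.99

# Per-agent Q-values and rewards for one batch of transitions (shapes assumed).
q_values = torch.randn(n_env, n_agent, 1)
rewards = torch.randn(n_env, n_agent, 1)
max_next_q_values = torch.randn(n_env, 1)  # bootstrapped value of the next state

# VDN sums per-agent quantities into joint, team-level quantities.
joint_q = q_values.sum(dim=1)     # (n_env, 1)
team_reward = rewards.sum(dim=1)  # (n_env, 1)

# One-step TD target; detaching it keeps gradients flowing only into joint_q.
q_targets = team_reward + gamma * max_next_q_values
q_loss = torch.mean(F.mse_loss(joint_q, q_targets.detach()))
```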