introduce recurrent sac-discrete #1

Merged: 6 commits on Mar 1, 2022
38 changes: 38 additions & 0 deletions configs/pomdp/cartpole/f/mlp.yml
@@ -0,0 +1,38 @@
seed: 73
cuda: -1 # GPU id; -1 means CPU
env:
env_type: pomdp
env_name: CartPole-F-v0

num_eval_tasks: 20 # num of eval episodes

train:
# 500*200 = 100k steps
num_iters: 500 # number of training iterations
num_init_rollouts_pool: 5 # before training
num_rollouts_per_iter: 1
buffer_size: 1e6
batch_size: 256

eval:
eval_stochastic: false # also eval stochastic policy
log_interval: 1 # num of iters
save_interval: -1
log_tensorboard: true

policy:
arch: mlp
algo: sacd # only sac-discrete is supported

dqn_layers: [128, 128]
policy_layers: [128, 128]
lr: 0.0003
gamma: 0.99
tau: 0.005

# sac alpha
entropy_alpha: null
automatic_entropy_tuning: true
target_entropy: 0.7 # the ratio: target_entropy = ratio * log(|A|)
alpha_lr: 0.0003
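For reference, `target_entropy` above is a ratio rather than an absolute entropy value. A minimal sketch of the conversion described in the comment (illustrative only; the variable names are not the repo's):
```
import numpy as np

# target_entropy in the config is a ratio in (0, 1]; the actual entropy target
# is ratio * max policy entropy, and the max entropy of a uniform policy over
# |A| discrete actions is log(|A|).
ratio = 0.7
num_actions = 2                                # CartPole has 2 discrete actions
target_entropy = ratio * np.log(num_actions)   # ~0.485 nats
```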

49 changes: 49 additions & 0 deletions configs/pomdp/cartpole/v/rnn.yml
@@ -0,0 +1,49 @@
seed: 73
cuda: 0 # GPU id; -1 means CPU
env:
env_type: pomdp
env_name: CartPole-V-v0

num_eval_tasks: 20 # num of eval episodes

train:
# 50*200 = 10k steps
num_iters: 50 # number of training iterations
num_init_rollouts_pool: 5 # before training
num_rollouts_per_iter: 1

num_updates_per_iter: 1.0
# buffer params
buffer_size: 1e6
batch_size: 32 # to tune based on sampled_seq_len
sampled_seq_len: -1 # -1 is all, or positive integer
sample_weight_baseline: 0.0

eval:
eval_stochastic: false # also eval stochastic policy
log_interval: 1 # num of iters
save_interval: -1
log_tensorboard: true

policy:
separate: True
arch: lstm # [lstm, gru]
algo: sacd # only sac-discrete is supported

action_embedding_size: 8 # no action input
state_embedding_size: 32
reward_embedding_size: 8
rnn_hidden_size: 128

dqn_layers: [128, 128]
policy_layers: [128, 128]
lr: 0.0003
gamma: 0.9
tau: 0.005

# sacd alpha
entropy_alpha: null
automatic_entropy_tuning: true
target_entropy: 0.7 # the ratio: target_entropy = ratio * log(|A|)
alpha_lr: 0.0003
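A rough illustration of how the sequence parameters above interact, under the assumption (not taken from the repo code) that `sampled_seq_len: -1` means full-episode sequences and batches are laid out time-major:
```
# Illustrative shapes only; names are hypothetical, not the repo's API.
batch_size = 32
seq_len = 200          # sampled_seq_len: -1 -> full CartPole episodes (<= 200 steps)
obs_dim = 2            # CartPole-V observes the two velocity dims

transitions_per_update = batch_size * seq_len      # 6400; this is why batch_size stays small
rnn_batch_shape = (seq_len, batch_size, obs_dim)   # typical (T, B, dim) layout for an LSTM
```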

38 changes: 38 additions & 0 deletions configs/pomdp/lunalander/f/mlp.yml
@@ -0,0 +1,38 @@
seed: 73
cuda: -1 # GPU id; -1 means CPU
env:
env_type: pomdp
env_name: LunarLander-F-v0

num_eval_tasks: 20 # num of eval episodes

train:
# 500*1000 = 500k steps
num_iters: 500 # number of training iterations
num_init_rollouts_pool: 5 # before training
num_rollouts_per_iter: 1
buffer_size: 1e6
batch_size: 256

eval:
eval_stochastic: false # also eval stochastic policy
log_interval: 1 # num of iters
save_interval: -1
log_tensorboard: true

policy:
arch: mlp
algo: sacd # only sac-discrete is supported

dqn_layers: [128, 128]
policy_layers: [128, 128]
lr: 0.0003
gamma: 0.99
tau: 0.005

# sac alpha
entropy_alpha: null
automatic_entropy_tuning: true
target_entropy: 0.7 # the ratio: target_entropy = ratio * log(|A|)
alpha_lr: 0.0003

49 changes: 49 additions & 0 deletions configs/pomdp/lunalander/v/rnn.yml
@@ -0,0 +1,49 @@
seed: 73
cuda: 0 # GPU id; -1 means CPU
env:
env_type: pomdp
env_name: LunarLander-V-v0

num_eval_tasks: 20 # num of eval episodes

train:
# 200*1000 = 200k steps
num_iters: 200 # number of training iterations
num_init_rollouts_pool: 5 # before training
num_rollouts_per_iter: 1

num_updates_per_iter: 0.2
# buffer params
buffer_size: 1e6
batch_size: 32 # to tune based on sampled_seq_len
sampled_seq_len: -1 # -1 is all, or positive integer
sample_weight_baseline: 0.0

eval:
eval_stochastic: false # also eval stochastic policy
log_interval: 4 # num of iters
save_interval: -1
log_tensorboard: true

policy:
separate: True
arch: lstm # [lstm, gru]
algo: sacd # only sac-discrete is supported

action_embedding_size: 8 # no action input
state_embedding_size: 32
reward_embedding_size: 8
rnn_hidden_size: 128

dqn_layers: [128, 128]
policy_layers: [128, 128]
lr: 0.0003
gamma: 0.99
tau: 0.005

# sacd alpha
entropy_alpha: null
automatic_entropy_tuning: true
target_entropy: 0.7 # the ratio: target_entropy = ratio * log(|A|)
alpha_lr: 0.0003

72 changes: 0 additions & 72 deletions configs/pomdp/pendulum/v/varibad.yml

This file was deleted.

7 changes: 7 additions & 0 deletions docs/acknowledge.md
@@ -8,6 +8,7 @@ We acknowledge the following repositories that greatly shaped our implementation
- https://github.com/oist-cnru/Variational-Recurrent-Models for providing the pomdp VRM algorithm and environments
- https://github.com/quantumiracle/Popular-RL-Algorithms for inspiring the recurrent policies design
- https://github.com/lmzintgraf/varibad for inspiring the recurrent policies design and providing learning curve data
- https://github.com/ku2482/sac-discrete.pytorch for providing the SAC-discrete code

Please cite their work if you also find their code useful to your project:
```
@@ -58,4 +59,10 @@ Please cite their work if you also find their code useful to your project:
note={\url{http://www.deepreinforcementlearningbook.org}},
year={2020}
}
@article{christodoulou2019soft,
title={Soft actor-critic for discrete action settings},
author={Christodoulou, Petros},
journal={arXiv preprint arXiv:1910.07207},
year={2019}
}
```
8 changes: 7 additions & 1 deletion docs/run_commands.md
@@ -4,7 +4,7 @@

Before running any experiments, we suggest having a good plan of *environment series* based on difficulty level. Since difficulty is hard to analyze and varies from algorithm to algorithm, we provide some rough estimates:

1. Extremely Simple as a Sanity Check: Pendulum-V (also shown in our minimal example jupyter notebook)
1. Extremely Simple as a Sanity Check: Pendulum-V (also shown in our minimal example jupyter notebook) and CartPole-V (for discrete action space)
2. Simple, Fast, yet Non-trivial: Wind (require precise inference and control), Semi-Circle (sparse reward). Both are continuous gridworlds, thus very fast.
3. Medium: Cheetah-Vel (1-dim stationary hidden state), `*`-Robust (2-dim stationary hidden state), `*`-P (could be roughly inferred by 2nd order MDP)
4. Hard: `*`-Dir (relatively complicated dynamics), `*`-V (long-term inference), `*`-Generalize (extrapolation)
@@ -44,6 +44,12 @@ python PPO/main.py --config configs/pomdp/ant_blt/p/ppo_rnn.yml \
python VRM/run_experiment.py configs/pomdp/ant_blt/p/vrm.yml
```

Mar 2022: we support recurrent SAC-discrete for POMDPs with a **discrete action space**. Take CartPole-V as an example:
```
python policies/main.py --cfg configs/pomdp/cartpole/v/rnn.yml --target_entropy 0.7
```
See [this PR](https://github.com/twni2016/pomdp-baselines/pull/1) for detailed instructions.
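For background, SAC-discrete (Christodoulou, 2019; cited in `docs/acknowledge.md`) replaces the reparameterized expectations of continuous SAC with exact expectations over the action distribution. The sketch below shows the actor and temperature losses in that style; it is illustrative pseudocode following that paper's formulation, not the exact implementation in this repo:
```
import torch

def sacd_actor_and_alpha_loss(logits, q1, q2, log_alpha, target_entropy):
    """logits, q1, q2: tensors of shape (batch, num_actions)."""
    probs = torch.softmax(logits, dim=-1)
    log_probs = torch.log_softmax(logits, dim=-1)
    q_min = torch.min(q1, q2).detach()

    # Actor: minimize E_{a~pi}[alpha * log pi(a|s) - Q(s,a)], computed exactly
    # by summing over the discrete actions instead of sampling.
    alpha = log_alpha.exp().detach()
    actor_loss = (probs * (alpha * log_probs - q_min)).sum(dim=-1).mean()

    # Temperature: push the policy entropy toward target_entropy
    # (here target_entropy = ratio * log(|A|), as in the configs).
    entropy = -(probs * log_probs).sum(dim=-1).detach()
    alpha_loss = (log_alpha * (entropy - target_entropy)).mean()
    return actor_loss, alpha_loss
```
In the recurrent variant, `logits` and the Q-values are produced by the LSTM/GRU encoders over the (observation, action, reward) history, as configured in `configs/pomdp/cartpole/v/rnn.yml`.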

### Meta RL

{Semi-Circle, Wind, Cheetah-Vel} in the paper, corresponding to `configs/meta/<point_robot|wind|cheetah_vel|ant_dir>`. Among them, Cheetah-Vel requires MuJoCo, and Semi-Circle can serve as a sanity check. Wind looks simple but is not easy to solve.
46 changes: 46 additions & 0 deletions envs/pomdp/__init__.py
@@ -29,6 +29,52 @@
max_episode_steps=200,
)

register(
"CartPole-F-v0",
entry_point="envs.pomdp.wrappers:POMDPWrapper",
kwargs=dict(
env=gym.make("CartPole-v0"), partially_obs_dims=[0, 1, 2, 3]
), # full observation: position, velocity, angle, angular velocity
max_episode_steps=200, # reward threshold for solving the task: 195
)

register(
"CartPole-P-v0",
entry_point="envs.pomdp.wrappers:POMDPWrapper",
kwargs=dict(env=gym.make("CartPole-v0"), partially_obs_dims=[0, 2]),  # positions only: cart position & pole angle
max_episode_steps=200,
)

register(
"CartPole-V-v0",
entry_point="envs.pomdp.wrappers:POMDPWrapper",
kwargs=dict(env=gym.make("CartPole-v0"), partially_obs_dims=[1, 3]),  # velocities only: cart velocity & pole angular velocity
max_episode_steps=200,
)

register(
"LunarLander-F-v0",
entry_point="envs.pomdp.wrappers:POMDPWrapper",
kwargs=dict(
env=gym.make("LunarLander-v2"), partially_obs_dims=list(range(8))
), # full observation (all 8 dims)
max_episode_steps=1000, # reward threshold for solving the task: 200
)

register(
"LunarLander-P-v0",
entry_point="envs.pomdp.wrappers:POMDPWrapper",
kwargs=dict(env=gym.make("LunarLander-v2"), partially_obs_dims=[0, 1, 4, 6, 7]),  # positions: x, y, angle, leg contacts
max_episode_steps=1000,
)

register(
"LunarLander-V-v0",
entry_point="envs.pomdp.wrappers:POMDPWrapper",
kwargs=dict(env=gym.make("LunarLander-v2"), partially_obs_dims=[2, 3, 5, 6, 7]),  # velocities: vx, vy, angular velocity, leg contacts
max_episode_steps=1000,
)

### Below are pybullet (roboschool) environments, using BLT for Bullet
import pybullet_envs

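A quick sanity check of the newly registered IDs (a usage sketch; the printed shapes assume the standard Gym observation layouts and that `POMDPWrapper` restricts the observation space to `partially_obs_dims`):
```
import gym
import envs.pomdp  # importing the package runs the register() calls above

env = gym.make("CartPole-V-v0")       # velocity-only CartPole: dims [1, 3]
print(env.observation_space.shape)    # (2,)
print(env.action_space)               # Discrete(2)

env = gym.make("LunarLander-P-v0")    # position-only LunarLander: dims [0, 1, 4, 6, 7]
print(env.observation_space.shape)    # (5,)
print(env.action_space)               # Discrete(4)
```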
23 changes: 14 additions & 9 deletions envs/pomdp/wrappers.py
@@ -16,9 +16,13 @@ def __init__(self, env, partially_obs_dims: list):
dtype=np.float32,
)

if self.env.action_space.__class__.__name__ == "Box":
self.act_continuous = True
# if continuous actions, make sure in [-1, 1]
# NOTE: policy won't use action_space.low/high, just set [-1,1]
# this is a bad practice...
else:
self.act_continuous = False

def get_obs(self, state):
return state[self.partially_obs_dims].copy()
@@ -28,12 +32,13 @@ def reset(self):
return self.get_obs(state)

def step(self, action):
if self.act_continuous:
# recover the action
action = np.clip(action, -1, 1) # first clip into [-1, 1]
lb = self.env.action_space.low
ub = self.env.action_space.high
action = lb + (action + 1.0) * 0.5 * (ub - lb)
action = np.clip(action, lb, ub)

state, reward, done, info = self.env.step(action)

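With the branch above, discrete actions (CartPole, LunarLander) are passed to the underlying env unchanged, while continuous actions are expected in [-1, 1] and affinely rescaled to the env's bounds. A small numeric illustration of that rescaling (the bounds here are chosen only for illustration):
```
import numpy as np

# Continuous case: policy output in [-1, 1] mapped onto [lb, ub].
lb, ub = np.array([-2.0]), np.array([2.0])   # e.g. a torque-bounded Box action space
a_policy = np.array([0.5])
a_env = lb + (np.clip(a_policy, -1, 1) + 1.0) * 0.5 * (ub - lb)   # -> array([1.0])

# Discrete case: act_continuous is False, so the integer action index
# goes straight into self.env.step(action) with no clipping or rescaling.
```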