From a41c245a1abeb36bc9c16352825446973c8e740a Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Fri, 16 Dec 2022 17:40:09 +0100
Subject: [PATCH 01/14] initial

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 .../ppo/tests/test_ppo_with_rl_module.py      | 130 ++++++++++++++++++
 .../ppo/torch/ppo_torch_rl_module.py          |  88 ++++++------
 2 files changed, 178 insertions(+), 40 deletions(-)

diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
index 8feb601b9f00..69cdd58a5342 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
@@ -1,12 +1,21 @@
 import numpy as np
+import tree
 import unittest
 
 import ray
 import ray.rllib.algorithms.ppo as ppo
 
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.utils.numpy import convert_to_numpy
 from ray.rllib.algorithms.callbacks import DefaultCallbacks
 from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
 from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY
+from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import (
+    get_expected_model_config,
+    PPOTorchRLModule,
+    get_ppo_loss,
+)
+from ray.rllib.utils.torch_utils import convert_to_torch_tensor
 from ray.rllib.utils.test_utils import (
     check,
     check_compute_single_action,
@@ -176,6 +185,127 @@ def test_ppo_exploration_setup(self):
             check(np.mean(actions), 1.5, atol=0.2)
             trainer.stop()
 
+    def test_torch_model_creation(self):
+        pass
+
+    def test_torch_model_creation_lstm(self):
+        pass
+
+    def test_rollouts(self):
+        for env_name in ["CartPole-v1", "Pendulum-v1"]:  # , "BreakoutNoFrameskip-v4"]:
+            for fwd_fn in ["forward_exploration", "forward_inference"]:
+                for shared_encoder in [False, True]:
+                    for lstm in [False]:  # , True]"
+                        print(
+                            f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
+                            f"{shared_encoder}] | LSTM={lstm}"
+                        )
+                        import gym
+
+                        env = gym.make(env_name)
+
+                        config = get_expected_model_config(env, lstm, shared_encoder)
+                        module = PPOTorchRLModule(config)
+
+                        obs = env.reset()
+                        if lstm:
+                            states = [
+                                s.get_initial_state()
+                                for s in (
+                                    module.shared_encoder,
+                                    module.encoder_vf,
+                                    module.encoder_pi,
+                                )
+                            ]
+                            batch = {
+                                SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
+                                **{f"state_in_{i}": s for i, s in enumerate(states)},
+                            }
+                        else:
+                            batch = {
+                                SampleBatch.OBS: convert_to_torch_tensor(obs)[None]
+                            }
+
+                        if fwd_fn == "forward_exploration":
+                            module.forward_exploration(batch)
+                        elif fwd_fn == "forward_inference":
+                            module.forward_inference(batch)
+
+    def test_forward_train(self):
+        for env_name in ["CartPole-v1", "Pendulum-v1"]:  # , "BreakoutNoFrameskip-v4"]:
+            for fwd_fn in ["forward_exploration", "forward_inference"]:
+                for shared_encoder in [False, True]:
+                    for lstm in [False]:  # , True]"
+                        print(
+                            f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
+                            f"{shared_encoder}] | LSTM={lstm}"
+                        )
+                        import gym
+
+                        env = gym.make(env_name)
+
+                        config = get_expected_model_config(env, lstm, shared_encoder)
+                        module = PPOTorchRLModule(config)
+
+                        # collect a batch of data
+                        batch = []
+                        obs = env.reset()
+                        tstep = 0
+                        if lstm:
+                            states = {}
+                            for i, model in enumerate(
+                                [
+                                    module.shared_encoder,
+                                    module.encoder_pi,
+                                    module.encoder_vf,
+                                ]
+                            ):
+                                states[i] = model.get_inital_state()
+                        while tstep < 10:
+                            fwd_out = module.forward_exploration(
+                                {"obs": convert_to_torch_tensor(obs)[None]}
+                            )
+                            action = convert_to_numpy(
+                                fwd_out["action_dist"].sample().squeeze(0)
+                            )
+                            new_obs, reward, done, _ = env.step(action)
+                            step = {
+                                SampleBatch.OBS: obs,
+                                SampleBatch.NEXT_OBS: new_obs,
+                                SampleBatch.ACTIONS: action,
+                                SampleBatch.REWARDS: np.array(reward),
+                                SampleBatch.DONES: np.array(done),
+                            }
+                            if lstm:
+                                assert "state_out" in fwd_out
+                                for k, v in states.items():
+                                    step[f"state_in_{k}"] = v
+                                    states[k] = fwd_out["state_out"][k]
+                            batch.append(step)
+                            obs = new_obs
+                            tstep += 1
+
+                        # convert the list of dicts to dict of lists
+                        batch = tree.map_structure(lambda *x: list(x), *batch)
+                        # convert dict of lists to dict of tensors
+                        fwd_in = {
+                            k: convert_to_torch_tensor(np.array(v))
+                            for k, v in batch.items()
+                        }
+
+                        # forward train
+                        # before training make sure it's on the right device and it's on
+                        # trianing mode
+                        module.to("cpu")
+                        module.train()
+                        fwd_out = module.forward_train(fwd_in)
+                        loss = get_ppo_loss(fwd_in, fwd_out)
+                        loss.backward()
+
+                        # check that all neural net parameters have gradients
+                        for param in module.parameters():
+                            self.assertIsNotNone(param.grad)
+
 
 if __name__ == "__main__":
     import pytest
diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
index 83758c983b9f..43ea05fe1285 100644
--- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
+++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
@@ -42,37 +42,45 @@ def get_ppo_loss(fwd_in, fwd_out):
 
 # TODO: Most of the neural network, and model specs in this file will eventually be
 # retreived from the model catalog. That includes FCNet, Encoder, etc.
-def get_shared_encoder_config(env):
-    return PPOModuleConfig(
-        observation_space=env.observation_space,
-        action_space=env.action_space,
-        encoder_config=FCConfig(
-            hidden_layers=[32],
-            activation="ReLU",
-        ),
-        pi_config=FCConfig(
-            hidden_layers=[32],
-            activation="ReLU",
-        ),
-        vf_config=FCConfig(
-            hidden_layers=[32],
-            activation="ReLU",
-        ),
+def get_expected_model_config(env, lstm, shared_encoder):
+    assert not env.observation_space.shape[-1] == 3, "Implement VisionNet first"
+    if lstm:
+        encoder_config_class = LSTMConfig
+    else:
+        encoder_config_class = FCConfig
+
+    pi_config = encoder_config_class(
+        hidden_layers=[32],
+        activation="ReLU",
+    )
+    vf_config = encoder_config_class(
+        input_dim=32,
+        hidden_layers=[32],
+        activation="ReLU",
     )
 
+    if shared_encoder:
+        shared_encoder_config = (
+            encoder_config_class(
+                input_dim=env.observation_space.shape[0],
+                hidden_layers=[32],
+                activation="ReLU",
+            ),
+        )
+        pi_config.input_dim = 32
+        vf_config.input_dim = 32
+    else:
+        shared_encoder_config = None
+        pi_config.input_dim = env.observation_space.shape[0]
+        vf_config.input_dim = env.observation_space.shape[0]
 
-def get_separate_encoder_config(env):
     return PPOModuleConfig(
         observation_space=env.observation_space,
         action_space=env.action_space,
-        pi_config=FCConfig(
-            hidden_layers=[32],
-            activation="ReLU",
-        ),
-        vf_config=FCConfig(
-            hidden_layers=[32],
-            activation="ReLU",
-        ),
+        shared_encoder_config=shared_encoder_config,
+        pi_config=pi_config,
+        vf_config=vf_config,
+        shared_encoder=shared_encoder,
     )
 
 
@@ -83,7 +91,7 @@ class PPOModuleConfig(RLModuleConfig):
     Attributes:
         pi_config: The configuration for the policy network.
         vf_config: The configuration for the value network.
-        encoder_config: The configuration for the encoder network.
+        shared_encoder_config: The configuration for the encoder network.
         free_log_std: For DiagGaussian action distributions, make the second half of
             the model outputs floating bias variables instead of state-dependent. This
             only has an effect is using the default fully connected net.
@@ -92,7 +100,7 @@ class PPOModuleConfig(RLModuleConfig):
 
     pi_config: FCConfig = None
     vf_config: FCConfig = None
-    encoder_config: FCConfig = None
+    shared_encoder_config: FCConfig = None
     free_log_std: bool = False
     shared_encoder: bool = True
 
@@ -109,13 +117,13 @@ def setup(self) -> None:
         assert self.config.vf_config, "vf_config must be provided."
 
         if self.config.shared_encoder:
-            self.shared_encoder = self.config.encoder_config.build()
-            self.encoder_pi = IdentityEncoder(self.config.encoder_config)
-            self.encoder_vf = IdentityEncoder(self.config.encoder_config)
+            self.shared_encoder = self.config.shared_encoder_config.build()
+            self.encoder_pi = IdentityEncoder(self.config.pi_config)
+            self.encoder_vf = IdentityEncoder(self.config.vf_config)
         else:
-            self.shared_encoder = IdentityEncoder(self.config.encoder_config)
-            self.encoder_pi = self.config.encoder_config.build()
-            self.encoder_vf = self.config.encoder_config.build()
+            self.shared_encoder = IdentityEncoder(self.config.shared_encoder_config)
+            self.encoder_pi = self.config.pi_config.build()
+            self.encoder_vf = self.config.vf_config.build()
 
         self.pi = FCNet(
             input_dim=self.config.pi_config.input_dim,
@@ -162,14 +170,14 @@ def from_model_config(
 
         if use_lstm:
             assert vf_share_layers, "LSTM not supported with vf_share_layers=False"
-            encoder_config = LSTMConfig(
+            shared_encoder_config = LSTMConfig(
                 hidden_dim=model_config["lstm_cell_size"],
                 batch_first=not model_config["_time_major"],
                 output_dim=model_config["lstm_cell_size"],
                 num_layers=1,
             )
         else:
-            encoder_config = FCConfig(
+            shared_encoder_config = FCConfig(
                 hidden_layers=fcnet_hiddens,
                 activation=activation,
                 output_dim=model_config["fcnet_hiddens"][-1],
@@ -191,8 +199,8 @@ def from_model_config(
         )
 
         # build pi network
-        encoder_config.input_dim = observation_space.shape[0]
-        pi_config.input_dim = encoder_config.output_dim
+        shared_encoder_config.input_dim = observation_space.shape[0]
+        pi_config.input_dim = shared_encoder_config.output_dim
 
         if isinstance(action_space, gym.spaces.Discrete):
             pi_config.output_dim = action_space.n
@@ -200,14 +208,14 @@ def from_model_config(
             pi_config.output_dim = action_space.shape[0] * 2
 
         # build vf network
-        vf_config.input_dim = encoder_config.output_dim
+        vf_config.input_dim = shared_encoder_config.output_dim
         vf_config.output_dim = 1
 
         config_ = PPOModuleConfig(
             observation_space=observation_space,
             action_space=action_space,
             max_seq_len=model_config["max_seq_len"],
-            encoder_config=encoder_config,
+            shared_encoder_config=shared_encoder_config,
             pi_config=pi_config,
             vf_config=vf_config,
             free_log_std=free_log_std,
@@ -218,7 +226,7 @@ def from_model_config(
         return module
 
     def get_initial_state(self) -> NestedDict:
-        if isinstance(self.config.encoder_config, LSTMConfig):
+        if isinstance(self.config.shared_encoder_config, LSTMConfig):
             # TODO (Kourosh): How does this work in RLlib today?
             if isinstance(self.shared_encoder, LSTMEncoder):
                 return self.shared_encoder.get_inital_state()

From a15a82d46b4ed087f8c7675dabbada289ff9bc9a Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Fri, 16 Dec 2022 17:58:47 +0100
Subject: [PATCH 02/14] tests complete

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 .../ppo/torch/ppo_torch_rl_module.py          | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
index 43ea05fe1285..d5bfe93bdfb2 100644
--- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
+++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
@@ -43,29 +43,27 @@ def get_ppo_loss(fwd_in, fwd_out):
 # TODO: Most of the neural network, and model specs in this file will eventually be
 # retreived from the model catalog. That includes FCNet, Encoder, etc.
 def get_expected_model_config(env, lstm, shared_encoder):
-    assert not env.observation_space.shape[-1] == 3, "Implement VisionNet first"
     if lstm:
         encoder_config_class = LSTMConfig
     else:
         encoder_config_class = FCConfig
 
     pi_config = encoder_config_class(
+        output_dim=32,
         hidden_layers=[32],
         activation="ReLU",
     )
     vf_config = encoder_config_class(
-        input_dim=32,
+        output_dim=32,
         hidden_layers=[32],
         activation="ReLU",
     )
 
     if shared_encoder:
-        shared_encoder_config = (
-            encoder_config_class(
-                input_dim=env.observation_space.shape[0],
-                hidden_layers=[32],
-                activation="ReLU",
-            ),
+        shared_encoder_config = encoder_config_class(
+            input_dim=env.observation_space.shape[0],
+            hidden_layers=[32],
+            activation="ReLU",
         )
         pi_config.input_dim = 32
         vf_config.input_dim = 32
@@ -126,15 +124,15 @@ def setup(self) -> None:
             self.encoder_vf = self.config.vf_config.build()
 
         self.pi = FCNet(
-            input_dim=self.config.pi_config.input_dim,
-            output_dim=self.config.pi_config.output_dim,
+            input_dim=self.config.pi_config.output_dim,
+            output_dim=2,
             hidden_layers=self.config.pi_config.hidden_layers,
             activation=self.config.pi_config.activation,
         )
 
         self.vf = FCNet(
-            input_dim=self.config.vf_config.input_dim,
-            output_dim=self.config.vf_config.output_dim,
+            input_dim=self.config.vf_config.output_dim,
+            output_dim=1,
             hidden_layers=self.config.vf_config.hidden_layers,
             activation=self.config.vf_config.activation,
         )

From 37d57089543947a371e0c1116ccd85a6dfb18d6a Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Fri, 16 Dec 2022 18:07:23 +0100
Subject: [PATCH 03/14] wip

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 .../ppo/tests/test_ppo_with_rl_module.py      | 69 +++++++++++++++----
 .../ppo/torch/ppo_torch_rl_module.py          | 42 -----------
 2 files changed, 55 insertions(+), 56 deletions(-)

diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
index 69cdd58a5342..80f2e0c50b33 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
@@ -1,27 +1,74 @@
+import unittest
+
 import numpy as np
 import tree
-import unittest
 
 import ray
 import ray.rllib.algorithms.ppo as ppo
-
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.utils.numpy import convert_to_numpy
 from ray.rllib.algorithms.callbacks import DefaultCallbacks
-from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
-from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY
 from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import (
-    get_expected_model_config,
+    PPOModuleConfig,
     PPOTorchRLModule,
     get_ppo_loss,
 )
-from ray.rllib.utils.torch_utils import convert_to_torch_tensor
+from ray.rllib.core.rl_module.encoder import (
+    FCConfig,
+    LSTMConfig,
+)
+from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY
+from ray.rllib.utils.numpy import convert_to_numpy
 from ray.rllib.utils.test_utils import (
     check,
     check_compute_single_action,
     check_train_results,
     framework_iterator,
 )
+from ray.rllib.utils.torch_utils import convert_to_torch_tensor
+
+
+# TODO: Most of the neural network, and model specs in this file will eventually be
+# retreived from the model catalog. That includes FCNet, Encoder, etc.
+def get_expected_model_config(env, lstm, shared_encoder):
+    assert not len(env.observation_space.shape) == 3, "Implement VisionNet first!"
+    if lstm:
+        encoder_config_class = LSTMConfig
+    else:
+        encoder_config_class = FCConfig
+
+    pi_config = encoder_config_class(
+        output_dim=32,
+        hidden_layers=[32],
+        activation="ReLU",
+    )
+    vf_config = encoder_config_class(
+        output_dim=32,
+        hidden_layers=[32],
+        activation="ReLU",
+    )
+
+    if shared_encoder:
+        shared_encoder_config = encoder_config_class(
+            input_dim=env.observation_space.shape[0],
+            hidden_layers=[32],
+            activation="ReLU",
+        )
+        pi_config.input_dim = 32
+        vf_config.input_dim = 32
+    else:
+        shared_encoder_config = None
+        pi_config.input_dim = env.observation_space.shape[0]
+        vf_config.input_dim = env.observation_space.shape[0]
+
+    return PPOModuleConfig(
+        observation_space=env.observation_space,
+        action_space=env.action_space,
+        shared_encoder_config=shared_encoder_config,
+        pi_config=pi_config,
+        vf_config=vf_config,
+        shared_encoder=shared_encoder,
+    )
 
 
 class MyCallbacks(DefaultCallbacks):
@@ -185,12 +232,6 @@ def test_ppo_exploration_setup(self):
             check(np.mean(actions), 1.5, atol=0.2)
             trainer.stop()
 
-    def test_torch_model_creation(self):
-        pass
-
-    def test_torch_model_creation_lstm(self):
-        pass
-
     def test_rollouts(self):
         for env_name in ["CartPole-v1", "Pendulum-v1"]:  # , "BreakoutNoFrameskip-v4"]:
             for fwd_fn in ["forward_exploration", "forward_inference"]:
diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
index d5bfe93bdfb2..54228e66ce60 100644
--- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
+++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
@@ -40,48 +40,6 @@ def get_ppo_loss(fwd_in, fwd_out):
     return loss
 
 
-# TODO: Most of the neural network, and model specs in this file will eventually be
-# retreived from the model catalog. That includes FCNet, Encoder, etc.
-def get_expected_model_config(env, lstm, shared_encoder):
-    if lstm:
-        encoder_config_class = LSTMConfig
-    else:
-        encoder_config_class = FCConfig
-
-    pi_config = encoder_config_class(
-        output_dim=32,
-        hidden_layers=[32],
-        activation="ReLU",
-    )
-    vf_config = encoder_config_class(
-        output_dim=32,
-        hidden_layers=[32],
-        activation="ReLU",
-    )
-
-    if shared_encoder:
-        shared_encoder_config = encoder_config_class(
-            input_dim=env.observation_space.shape[0],
-            hidden_layers=[32],
-            activation="ReLU",
-        )
-        pi_config.input_dim = 32
-        vf_config.input_dim = 32
-    else:
-        shared_encoder_config = None
-        pi_config.input_dim = env.observation_space.shape[0]
-        vf_config.input_dim = env.observation_space.shape[0]
-
-    return PPOModuleConfig(
-        observation_space=env.observation_space,
-        action_space=env.action_space,
-        shared_encoder_config=shared_encoder_config,
-        pi_config=pi_config,
-        vf_config=vf_config,
-        shared_encoder=shared_encoder,
-    )
-
-
 @dataclass
 class PPOModuleConfig(RLModuleConfig):
     """Configuration for the PPO module.

From 2d405693cc3c129561cd1588676c7081890c77ce Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Fri, 16 Dec 2022 19:06:05 +0100
Subject: [PATCH 04/14] wip

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 .../ppo/tests/test_ppo_with_rl_module.py      | 103 ++++++++++--------
 .../ppo/torch/ppo_torch_rl_module.py          |  34 +++---
 rllib/core/rl_module/encoder.py               |   2 +-
 3 files changed, 76 insertions(+), 63 deletions(-)

diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
index 80f2e0c50b33..acc80398fd68 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import tree
+import gym
 
 import ray
 import ray.rllib.algorithms.ppo as ppo
@@ -28,43 +29,55 @@
 from ray.rllib.utils.torch_utils import convert_to_torch_tensor
 
 
-# TODO: Most of the neural network, and model specs in this file will eventually be
-# retreived from the model catalog. That includes FCNet, Encoder, etc.
+# Get a model config that we expect from the catalog
 def get_expected_model_config(env, lstm, shared_encoder):
     assert not len(env.observation_space.shape) == 3, "Implement VisionNet first!"
-    if lstm:
-        encoder_config_class = LSTMConfig
-    else:
-        encoder_config_class = FCConfig
 
-    pi_config = encoder_config_class(
-        output_dim=32,
+    shared_encoder_config = FCConfig(
+        input_dim=env.observation_space.shape[0],
         hidden_layers=[32],
         activation="ReLU",
     )
-    vf_config = encoder_config_class(
+
+    if lstm:
+        pi_encoder_config = LSTMConfig(
+            hidden_dim=32,
+            batch_first=True,
+            output_dim=32,
+            num_layers=1,
+        )
+    else:
+        pi_encoder_config = FCConfig(
+            output_dim=32,
+            hidden_layers=[32],
+            activation="ReLU",
+        )
+
+    vf_encoder_config = FCConfig(
         output_dim=32,
         hidden_layers=[32],
         activation="ReLU",
     )
 
-    if shared_encoder:
-        shared_encoder_config = encoder_config_class(
-            input_dim=env.observation_space.shape[0],
-            hidden_layers=[32],
-            activation="ReLU",
-        )
-        pi_config.input_dim = 32
-        vf_config.input_dim = 32
-    else:
+    if not shared_encoder:
         shared_encoder_config = None
-        pi_config.input_dim = env.observation_space.shape[0]
-        vf_config.input_dim = env.observation_space.shape[0]
+        pi_encoder_config.input_dim = env.observation_space.shape[0]
+        vf_encoder_config.input_dim = env.observation_space.shape[0]
+
+    pi_config = FCConfig()
+    vf_config = FCConfig()
+
+    if isinstance(env.action_space, gym.spaces.Discrete):
+        pi_config.output_dim = env.action_space.n
+    else:
+        pi_config.output_dim = env.action_space.shape[0] * 2
 
     return PPOModuleConfig(
         observation_space=env.observation_space,
         action_space=env.action_space,
         shared_encoder_config=shared_encoder_config,
+        pi_encoder_config=pi_encoder_config,
+        vf_encoder_config=vf_encoder_config,
         pi_config=pi_config,
         vf_config=vf_config,
         shared_encoder=shared_encoder,
@@ -233,10 +246,11 @@ def test_ppo_exploration_setup(self):
             trainer.stop()
 
     def test_rollouts(self):
-        for env_name in ["CartPole-v1", "Pendulum-v1"]:  # , "BreakoutNoFrameskip-v4"]:
+        # TODO: Add BreakoutNoFrameskip-v4 to cover a 3D obs space
+        for env_name in ["CartPole-v1", "Pendulum-v1"]:
             for fwd_fn in ["forward_exploration", "forward_inference"]:
                 for shared_encoder in [False, True]:
-                    for lstm in [False]:  # , True]"
+                    for lstm in [False, True]:
                         print(
                             f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
                             f"{shared_encoder}] | LSTM={lstm}"
@@ -249,18 +263,11 @@ def test_rollouts(self):
                         module = PPOTorchRLModule(config)
 
                         obs = env.reset()
+
                         if lstm:
-                            states = [
-                                s.get_initial_state()
-                                for s in (
-                                    module.shared_encoder,
-                                    module.encoder_vf,
-                                    module.encoder_pi,
-                                )
-                            ]
                             batch = {
                                 SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
-                                **{f"state_in_{i}": s for i, s in enumerate(states)},
+                                "state_in": module.pi_encoder.get_inital_state()
                             }
                         else:
                             batch = {
@@ -273,10 +280,11 @@ def test_rollouts(self):
                             module.forward_inference(batch)
 
     def test_forward_train(self):
-        for env_name in ["CartPole-v1", "Pendulum-v1"]:  # , "BreakoutNoFrameskip-v4"]:
+        # TODO: Add BreakoutNoFrameskip-v4 to cover a 3D obs space
+        for env_name in ["CartPole-v1", "Pendulum-v1"]:
             for fwd_fn in ["forward_exploration", "forward_inference"]:
                 for shared_encoder in [False, True]:
-                    for lstm in [False]:  # , True]"
+                    for lstm in [False,  True]:
                         print(
                             f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
                             f"{shared_encoder}] | LSTM={lstm}"
@@ -293,19 +301,19 @@ def test_forward_train(self):
                         obs = env.reset()
                         tstep = 0
                         if lstm:
-                            states = {}
-                            for i, model in enumerate(
-                                [
-                                    module.shared_encoder,
-                                    module.encoder_pi,
-                                    module.encoder_vf,
-                                ]
-                            ):
-                                states[i] = model.get_inital_state()
+                            # TODO (Artur): Multiple states
+                            state_in = module.pi_encoder.get_inital_state()
                         while tstep < 10:
-                            fwd_out = module.forward_exploration(
-                                {"obs": convert_to_torch_tensor(obs)[None]}
-                            )
+                            if lstm:
+                                batch = {
+                                    SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
+                                    "state_in": state_in,
+                                }
+                            else:
+                                batch = {
+                                    SampleBatch.OBS: convert_to_torch_tensor(obs)[None]
+                                }
+                            fwd_out = module.forward_exploration(batch)
                             action = convert_to_numpy(
                                 fwd_out["action_dist"].sample().squeeze(0)
                             )
@@ -319,9 +327,8 @@ def test_forward_train(self):
                             }
                             if lstm:
                                 assert "state_out" in fwd_out
-                                for k, v in states.items():
-                                    step[f"state_in_{k}"] = v
-                                    states[k] = fwd_out["state_out"][k]
+                                step[f"state_in"] = state_in
+                                state_in = fwd_out["state_out"]
                             batch.append(step)
                             obs = new_obs
                             tstep += 1
diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
index 54228e66ce60..bb1bf8701fa1 100644
--- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
+++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
@@ -56,6 +56,8 @@ class PPOModuleConfig(RLModuleConfig):
 
     pi_config: FCConfig = None
     vf_config: FCConfig = None
+    pi_encoder_config: FCConfig = None
+    vf_encoder_config: FCConfig = None
     shared_encoder_config: FCConfig = None
     free_log_std: bool = False
     shared_encoder: bool = True
@@ -74,22 +76,22 @@ def setup(self) -> None:
 
         if self.config.shared_encoder:
             self.shared_encoder = self.config.shared_encoder_config.build()
-            self.encoder_pi = IdentityEncoder(self.config.pi_config)
-            self.encoder_vf = IdentityEncoder(self.config.vf_config)
+            self.pi_encoder = IdentityEncoder(self.config.pi_encoder_config)
+            self.vf_encoder = IdentityEncoder(self.config.vf_encoder_config)
         else:
             self.shared_encoder = IdentityEncoder(self.config.shared_encoder_config)
-            self.encoder_pi = self.config.pi_config.build()
-            self.encoder_vf = self.config.vf_config.build()
+            self.pi_encoder = self.config.pi_encoder_config.build()
+            self.vf_encoder = self.config.vf_encoder_config.build()
 
         self.pi = FCNet(
-            input_dim=self.config.pi_config.output_dim,
-            output_dim=2,
+            input_dim=self.config.pi_encoder_config.output_dim,
+            output_dim=self.config.pi_config.output_dim,
             hidden_layers=self.config.pi_config.hidden_layers,
             activation=self.config.pi_config.activation,
         )
 
         self.vf = FCNet(
-            input_dim=self.config.vf_config.output_dim,
+            input_dim=self.config.vf_encoder_config.output_dim,
             output_dim=1,
             hidden_layers=self.config.vf_config.hidden_layers,
             activation=self.config.vf_config.activation,
@@ -139,6 +141,8 @@ def from_model_config(
                 output_dim=model_config["fcnet_hiddens"][-1],
             )
 
+        pi_encoder_config = FCConfig()
+        vf_encoder_config = FCConfig()
         pi_config = FCConfig()
         vf_config = FCConfig()
 
@@ -156,7 +160,7 @@ def from_model_config(
 
         # build pi network
         shared_encoder_config.input_dim = observation_space.shape[0]
-        pi_config.input_dim = shared_encoder_config.output_dim
+        pi_encoder_config.input_dim = shared_encoder_config.output_dim
 
         if isinstance(action_space, gym.spaces.Discrete):
             pi_config.output_dim = action_space.n
@@ -174,6 +178,8 @@ def from_model_config(
             shared_encoder_config=shared_encoder_config,
             pi_config=pi_config,
             vf_config=vf_config,
+            pi_encoder_config=pi_encoder_config,
+            vf_encoder_config=vf_encoder_config,
             free_log_std=free_log_std,
             shared_encoder=vf_share_layers,
         )
@@ -187,7 +193,7 @@ def get_initial_state(self) -> NestedDict:
             if isinstance(self.shared_encoder, LSTMEncoder):
                 return self.shared_encoder.get_inital_state()
             else:
-                return self.encoder_pi.get_inital_state()
+                return self.pi_encoder.get_inital_state()
         return {}
 
     @override(RLModule)
@@ -201,7 +207,7 @@ def output_specs_inference(self) -> ModelSpec:
     @override(RLModule)
     def _forward_inference(self, batch: NestedDict) -> Mapping[str, Any]:
         encoder_out = self.shared_encoder(batch)
-        encoder_out_pi = self.encoder_pi(encoder_out)
+        encoder_out_pi = self.pi_encoder(encoder_out)
         action_logits = self.pi(encoder_out_pi["embedding"])
 
         if self._is_discrete:
@@ -242,8 +248,8 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]:
         policy and the new policy during training.
         """
         encoder_out = self.shared_encoder(batch)
-        encoder_out_pi = self.encoder_pi(encoder_out)
-        encoder_out_vf = self.encoder_vf(encoder_out)
+        encoder_out_pi = self.pi_encoder(encoder_out)
+        encoder_out_vf = self.vf_encoder(encoder_out)
         action_logits = self.pi(encoder_out_pi["embedding"])
 
         output = {}
@@ -292,8 +298,8 @@ def output_specs_train(self) -> ModelSpec:
     @override(RLModule)
     def _forward_train(self, batch: NestedDict) -> Mapping[str, Any]:
         encoder_out = self.shared_encoder(batch)
-        encoder_out_pi = self.encoder_pi(encoder_out)
-        encoder_out_vf = self.encoder_vf(encoder_out)
+        encoder_out_pi = self.pi_encoder(encoder_out)
+        encoder_out_vf = self.vf_encoder(encoder_out)
 
         action_logits = self.pi(encoder_out_pi["embedding"])
         vf = self.vf(encoder_out_vf["embedding"])
diff --git a/rllib/core/rl_module/encoder.py b/rllib/core/rl_module/encoder.py
index 2b5e02bed9ae..d99a1a9e7c76 100644
--- a/rllib/core/rl_module/encoder.py
+++ b/rllib/core/rl_module/encoder.py
@@ -63,7 +63,7 @@ def __init__(self, config: EncoderConfig) -> None:
         self._output_spec = self.output_spec()
 
     def get_inital_state(self):
-        raise []
+        return []
 
     def input_spec(self):
         return ModelSpec()

From 74d213b5a791b44989f8a85581a19bd9d277a696 Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Sun, 18 Dec 2022 22:34:21 +0100
Subject: [PATCH 05/14] mutually exclusive encoders, tests passing

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 .../ppo/tests/test_ppo_with_rl_module.py      | 82 ++++++++++---------
 .../ppo/torch/ppo_torch_rl_module.py          | 62 ++++++++------
 rllib/core/rl_module/encoder.py               | 29 ++++---
 3 files changed, 98 insertions(+), 75 deletions(-)

diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
index acc80398fd68..c19b34dde06b 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
@@ -13,6 +13,7 @@
     get_ppo_loss,
 )
 from ray.rllib.core.rl_module.encoder import (
+    IdentityConfig,
     FCConfig,
     LSTMConfig,
 )
@@ -32,38 +33,42 @@
 # Get a model config that we expect from the catalog
 def get_expected_model_config(env, lstm, shared_encoder):
     assert not len(env.observation_space.shape) == 3, "Implement VisionNet first!"
+    obs_dim = env.observation_space.shape[0]
 
-    shared_encoder_config = FCConfig(
-        input_dim=env.observation_space.shape[0],
-        hidden_layers=[32],
-        activation="ReLU",
-    )
-
-    if lstm:
-        pi_encoder_config = LSTMConfig(
-            hidden_dim=32,
-            batch_first=True,
+    if shared_encoder:
+        assert not lstm, "LSTM can only be used in PI"
+        shared_encoder_config = FCConfig(
+            input_dim=obs_dim,
+            hidden_layers=[32],
+            activation="ReLU",
             output_dim=32,
-            num_layers=1,
         )
+        pi_encoder_config = IdentityConfig(output_dim=32)
+        vf_encoder_config = IdentityConfig(output_dim=32)
     else:
-        pi_encoder_config = FCConfig(
+        shared_encoder_config = IdentityConfig(output_dim=obs_dim)
+        if lstm:
+            pi_encoder_config = LSTMConfig(
+                input_dim=obs_dim,
+                hidden_dim=32,
+                batch_first=True,
+                output_dim=32,
+                num_layers=1,
+            )
+        else:
+            pi_encoder_config = FCConfig(
+                input_dim=obs_dim,
+                output_dim=32,
+                hidden_layers=[32],
+                activation="ReLU",
+            )
+        vf_encoder_config = FCConfig(
+            input_dim=obs_dim,
             output_dim=32,
             hidden_layers=[32],
             activation="ReLU",
         )
 
-    vf_encoder_config = FCConfig(
-        output_dim=32,
-        hidden_layers=[32],
-        activation="ReLU",
-    )
-
-    if not shared_encoder:
-        shared_encoder_config = None
-        pi_encoder_config.input_dim = env.observation_space.shape[0]
-        vf_encoder_config.input_dim = env.observation_space.shape[0]
-
     pi_config = FCConfig()
     vf_config = FCConfig()
 
@@ -120,7 +125,7 @@ def on_train_result(self, *, algorithm, result: dict, **kwargs):
 class TestPPO(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        ray.init()
+        ray.init(local_mode=True)
 
     @classmethod
     def tearDownClass(cls):
@@ -250,13 +255,12 @@ def test_rollouts(self):
         for env_name in ["CartPole-v1", "Pendulum-v1"]:
             for fwd_fn in ["forward_exploration", "forward_inference"]:
                 for shared_encoder in [False, True]:
-                    for lstm in [False, True]:
+                    # TODO: LSTM = True
+                    for lstm in [False]:
                         print(
                             f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
                             f"{shared_encoder}] | LSTM={lstm}"
                         )
-                        import gym
-
                         env = gym.make(env_name)
 
                         config = get_expected_model_config(env, lstm, shared_encoder)
@@ -267,7 +271,7 @@ def test_rollouts(self):
                         if lstm:
                             batch = {
                                 SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
-                                "state_in": module.pi_encoder.get_inital_state()
+                                "state_in": module.pi_encoder.get_inital_state(),
                             }
                         else:
                             batch = {
@@ -284,20 +288,19 @@ def test_forward_train(self):
         for env_name in ["CartPole-v1", "Pendulum-v1"]:
             for fwd_fn in ["forward_exploration", "forward_inference"]:
                 for shared_encoder in [False, True]:
-                    for lstm in [False,  True]:
+                    # TODO: LSTM = True
+                    for lstm in [False]:
                         print(
                             f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
                             f"{shared_encoder}] | LSTM={lstm}"
                         )
-                        import gym
-
                         env = gym.make(env_name)
 
                         config = get_expected_model_config(env, lstm, shared_encoder)
                         module = PPOTorchRLModule(config)
 
                         # collect a batch of data
-                        batch = []
+                        batches = []
                         obs = env.reset()
                         tstep = 0
                         if lstm:
@@ -305,20 +308,21 @@ def test_forward_train(self):
                             state_in = module.pi_encoder.get_inital_state()
                         while tstep < 10:
                             if lstm:
-                                batch = {
+                                input_batch = {
                                     SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
                                     "state_in": state_in,
+                                    SampleBatch.SEQ_LENS: np.array([1]),
                                 }
                             else:
-                                batch = {
+                                input_batch = {
                                     SampleBatch.OBS: convert_to_torch_tensor(obs)[None]
                                 }
-                            fwd_out = module.forward_exploration(batch)
+                            fwd_out = module.forward_exploration(input_batch)
                             action = convert_to_numpy(
                                 fwd_out["action_dist"].sample().squeeze(0)
                             )
                             new_obs, reward, done, _ = env.step(action)
-                            step = {
+                            output_batch = {
                                 SampleBatch.OBS: obs,
                                 SampleBatch.NEXT_OBS: new_obs,
                                 SampleBatch.ACTIONS: action,
@@ -327,14 +331,14 @@ def test_forward_train(self):
                             }
                             if lstm:
                                 assert "state_out" in fwd_out
-                                step[f"state_in"] = state_in
+                                output_batch["state_in"] = state_in
                                 state_in = fwd_out["state_out"]
-                            batch.append(step)
+                            batches.append(output_batch)
                             obs = new_obs
                             tstep += 1
 
                         # convert the list of dicts to dict of lists
-                        batch = tree.map_structure(lambda *x: list(x), *batch)
+                        batch = tree.map_structure(lambda *x: list(x), *batches)
                         # convert dict of lists to dict of tensors
                         fwd_in = {
                             k: convert_to_torch_tensor(np.array(v))
diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
index bb1bf8701fa1..b6e4e27517b8 100644
--- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
+++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
@@ -19,8 +19,9 @@
     FCNet,
     FCConfig,
     LSTMConfig,
-    IdentityEncoder,
+    IdentityConfig,
     LSTMEncoder,
+    ENCODER_OUT,
 )
 
 
@@ -74,14 +75,9 @@ def setup(self) -> None:
         assert self.config.pi_config, "pi_config must be provided."
         assert self.config.vf_config, "vf_config must be provided."
 
-        if self.config.shared_encoder:
-            self.shared_encoder = self.config.shared_encoder_config.build()
-            self.pi_encoder = IdentityEncoder(self.config.pi_encoder_config)
-            self.vf_encoder = IdentityEncoder(self.config.vf_encoder_config)
-        else:
-            self.shared_encoder = IdentityEncoder(self.config.shared_encoder_config)
-            self.pi_encoder = self.config.pi_encoder_config.build()
-            self.vf_encoder = self.config.vf_encoder_config.build()
+        self.shared_encoder = self.config.shared_encoder_config.build()
+        self.pi_encoder = self.config.pi_encoder_config.build()
+        self.vf_encoder = self.config.vf_encoder_config.build()
 
         self.pi = FCNet(
             input_dim=self.config.pi_encoder_config.output_dim,
@@ -121,28 +117,44 @@ def from_model_config(
         else:
             raise ValueError(f"Unsupported activation: {activation}")
 
+        obs_dim = observation_space.shape[0]
         fcnet_hiddens = model_config["fcnet_hiddens"]
         vf_share_layers = model_config["vf_share_layers"]
         free_log_std = model_config["free_log_std"]
         use_lstm = model_config["use_lstm"]
 
+        if vf_share_layers:
+            shared_encoder_config = FCConfig(
+                input_dim=obs_dim,
+                hidden_layers=fcnet_hiddens,
+                activation=activation,
+                output_dim=model_config["fcnet_hiddens"][-1],
+            )
+        else:
+            shared_encoder_config = IdentityConfig(output_dim=obs_dim)
+
         if use_lstm:
-            assert vf_share_layers, "LSTM not supported with vf_share_layers=False"
-            shared_encoder_config = LSTMConfig(
+            pi_encoder_config = LSTMConfig(
+                input_dim=shared_encoder_config.output_dim,
                 hidden_dim=model_config["lstm_cell_size"],
                 batch_first=not model_config["_time_major"],
                 output_dim=model_config["lstm_cell_size"],
                 num_layers=1,
             )
         else:
-            shared_encoder_config = FCConfig(
+            pi_encoder_config = FCConfig(
+                input_dim=shared_encoder_config.output_dim,
                 hidden_layers=fcnet_hiddens,
                 activation=activation,
                 output_dim=model_config["fcnet_hiddens"][-1],
             )
 
-        pi_encoder_config = FCConfig()
-        vf_encoder_config = FCConfig()
+        vf_encoder_config = FCConfig(
+            input_dim=shared_encoder_config.output_dim,
+            hidden_layers=fcnet_hiddens,
+            activation=activation,
+            output_dim=model_config["fcnet_hiddens"][-1],
+        )
         pi_config = FCConfig()
         vf_config = FCConfig()
 
@@ -161,14 +173,15 @@ def from_model_config(
         # build pi network
         shared_encoder_config.input_dim = observation_space.shape[0]
         pi_encoder_config.input_dim = shared_encoder_config.output_dim
-
+        pi_config.input_dim = pi_encoder_config.output_dim
         if isinstance(action_space, gym.spaces.Discrete):
             pi_config.output_dim = action_space.n
         else:
             pi_config.output_dim = action_space.shape[0] * 2
 
         # build vf network
-        vf_config.input_dim = shared_encoder_config.output_dim
+        vf_encoder_config.input_dim = shared_encoder_config.output_dim
+        vf_config.input_dim = vf_encoder_config.output_dim
         vf_config.output_dim = 1
 
         config_ = PPOModuleConfig(
@@ -206,9 +219,10 @@ def output_specs_inference(self) -> ModelSpec:
 
     @override(RLModule)
     def _forward_inference(self, batch: NestedDict) -> Mapping[str, Any]:
-        encoder_out = self.shared_encoder(batch)
-        encoder_out_pi = self.pi_encoder(encoder_out)
-        action_logits = self.pi(encoder_out_pi["embedding"])
+        shared_enc_out = self.shared_encoder(batch)
+        pi_enc_out = self.pi_encoder(shared_enc_out)
+
+        action_logits = self.pi(pi_enc_out[ENCODER_OUT])
 
         if self._is_discrete:
             action = torch.argmax(action_logits, dim=-1)
@@ -217,7 +231,7 @@ def _forward_inference(self, batch: NestedDict) -> Mapping[str, Any]:
 
         action_dist = TorchDeterministic(action)
         output = {SampleBatch.ACTION_DIST: action_dist}
-        output["state_out"] = encoder_out_pi.get("state_out", {})
+        output["state_out"] = pi_enc_out.get("state_out", {})
         return output
 
     @override(RLModule)
@@ -250,7 +264,7 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]:
         encoder_out = self.shared_encoder(batch)
         encoder_out_pi = self.pi_encoder(encoder_out)
         encoder_out_vf = self.vf_encoder(encoder_out)
-        action_logits = self.pi(encoder_out_pi["embedding"])
+        action_logits = self.pi(encoder_out_pi[ENCODER_OUT])
 
         output = {}
         if self._is_discrete:
@@ -264,7 +278,7 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]:
         output[SampleBatch.ACTION_DIST] = action_dist
 
         # compute the value function
-        output[SampleBatch.VF_PREDS] = self.vf(encoder_out_vf["embedding"]).squeeze(-1)
+        output[SampleBatch.VF_PREDS] = self.vf(encoder_out_vf[ENCODER_OUT]).squeeze(-1)
         output["state_out"] = encoder_out_pi.get("state_out", {})
         return output
 
@@ -301,8 +315,8 @@ def _forward_train(self, batch: NestedDict) -> Mapping[str, Any]:
         encoder_out_pi = self.pi_encoder(encoder_out)
         encoder_out_vf = self.vf_encoder(encoder_out)
 
-        action_logits = self.pi(encoder_out_pi["embedding"])
-        vf = self.vf(encoder_out_vf["embedding"])
+        action_logits = self.pi(encoder_out_pi[ENCODER_OUT])
+        vf = self.vf(encoder_out_vf[ENCODER_OUT])
 
         if self._is_discrete:
             action_dist = TorchCategorical(logits=action_logits)
diff --git a/rllib/core/rl_module/encoder.py b/rllib/core/rl_module/encoder.py
index d99a1a9e7c76..1c4d6232d55c 100644
--- a/rllib/core/rl_module/encoder.py
+++ b/rllib/core/rl_module/encoder.py
@@ -13,6 +13,9 @@
 
 # TODO (Kourosh): Find a better / more straight fwd approach for sub-components
 
+ENCODER_OUT = "encoder_out"
+STATE_IN = "state_in"
+
 
 @dataclass
 class EncoderConfig:
@@ -26,6 +29,14 @@ class EncoderConfig:
     output_dim: int = None
 
 
+@dataclass
+class IdentityConfig(EncoderConfig):
+    """Configuration for an identity encoder."""
+
+    def build(self):
+        return IdentityEncoder(self)
+
+
 @dataclass
 class FCConfig(EncoderConfig):
     """Configuration for a fully connected network.
@@ -97,11 +108,11 @@ def input_spec(self):
 
     def output_spec(self):
         return ModelSpec(
-            {"embedding": TorchTensorSpec("b, h", h=self.config.output_dim)}
+            {ENCODER_OUT: TorchTensorSpec("b, h", h=self.config.output_dim)}
         )
 
     def _forward(self, input_dict):
-        return {"embedding": self.net(input_dict[SampleBatch.OBS])}
+        return {ENCODER_OUT: self.net(input_dict[SampleBatch.OBS])}
 
 
 class LSTMEncoder(Encoder):
@@ -129,7 +140,7 @@ def input_spec(self):
             {
                 # bxt is just a name for better readability to indicated padded batch
                 SampleBatch.OBS: TorchTensorSpec("bxt, h", h=config.input_dim),
-                "state_in": {
+                STATE_IN: {
                     "h": TorchTensorSpec(
                         "b, l, h", h=config.hidden_dim, l=config.num_layers
                     ),
@@ -144,7 +155,7 @@ def output_spec(self):
         config = self.config
         return ModelSpec(
             {
-                "embedding": TorchTensorSpec("bxt, h", h=config.output_dim),
+                ENCODER_OUT: TorchTensorSpec("bxt, h", h=config.output_dim),
                 "state_out": {
                     "h": TorchTensorSpec("b, h", h=config.hidden_dim),
                     "c": TorchTensorSpec("b, h", h=config.hidden_dim),
@@ -154,7 +165,7 @@ def output_spec(self):
 
     def forward(self, input_dict: SampleBatch):
         x = input_dict[SampleBatch.OBS]
-        states = input_dict["state_in"]
+        states = input_dict[STATE_IN]
         # states are batch-first when coming in
         states = tree.map_structure(lambda x: x.transpose(0, 1), states)
 
@@ -171,7 +182,7 @@ def forward(self, input_dict: SampleBatch):
         x = x.view(-1, x.shape[-1])
 
         return {
-            "embedding": x,
+            ENCODER_OUT: x,
             "state_out": tree.map_structure(lambda x: x.transpose(0, 1), states_o),
         }
 
@@ -180,11 +191,5 @@ class IdentityEncoder(Encoder):
     def __init__(self, config: EncoderConfig) -> None:
         super().__init__(config)
 
-    def input_spec(self):
-        return ModelSpec()
-
-    def output_spec(self):
-        return ModelSpec()
-
     def _forward(self, input_dict):
         return input_dict

From ddf8596d83e052d3569006327dd4840f385fdf6f Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Mon, 19 Dec 2022 15:41:09 +0100
Subject: [PATCH 06/14] add lstm code

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 .../ppo/tests/test_ppo_with_rl_module.py      | 51 +++++++++++++------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
index c19b34dde06b..d56701184e20 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
@@ -3,6 +3,7 @@
 import numpy as np
 import tree
 import gym
+import torch
 
 import ray
 import ray.rllib.algorithms.ppo as ppo
@@ -255,8 +256,11 @@ def test_rollouts(self):
         for env_name in ["CartPole-v1", "Pendulum-v1"]:
             for fwd_fn in ["forward_exploration", "forward_inference"]:
                 for shared_encoder in [False, True]:
-                    # TODO: LSTM = True
-                    for lstm in [False]:
+                    for lstm in [True, False]:
+                        if lstm and shared_encoder:
+                            # Not yet implemented
+                            # TODO (Artur): Implement
+                            continue
                         print(
                             f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
                             f"{shared_encoder}] | LSTM={lstm}"
@@ -268,15 +272,17 @@ def test_rollouts(self):
 
                         obs = env.reset()
 
+                        batch = {
+                            SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
+                        }
+
                         if lstm:
-                            batch = {
-                                SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
-                                "state_in": module.pi_encoder.get_inital_state(),
-                            }
-                        else:
-                            batch = {
-                                SampleBatch.OBS: convert_to_torch_tensor(obs)[None]
-                            }
+                            state_in = module.pi_encoder.get_inital_state()
+                            state_in = tree.map_structure(
+                                lambda x: x[None], convert_to_torch_tensor(state_in)
+                            )
+                            batch["state_in"] = state_in
+                            batch["seq_lens"] = torch.Tensor([1])
 
                         if fwd_fn == "forward_exploration":
                             module.forward_exploration(batch)
@@ -288,8 +294,11 @@ def test_forward_train(self):
         for env_name in ["CartPole-v1", "Pendulum-v1"]:
             for fwd_fn in ["forward_exploration", "forward_inference"]:
                 for shared_encoder in [False, True]:
-                    # TODO: LSTM = True
-                    for lstm in [False]:
+                    for lstm in [True, False]:
+                        if lstm and shared_encoder:
+                            # Not yet implemented
+                            # TODO (Artur): Implement
+                            continue
                         print(
                             f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
                             f"{shared_encoder}] | LSTM={lstm}"
@@ -306,6 +315,10 @@ def test_forward_train(self):
                         if lstm:
                             # TODO (Artur): Multiple states
                             state_in = module.pi_encoder.get_inital_state()
+                            state_in = tree.map_structure(
+                                lambda x: x[None], convert_to_torch_tensor(state_in)
+                            )
+                            output_states = state_in
                         while tstep < 10:
                             if lstm:
                                 input_batch = {
@@ -331,7 +344,12 @@ def test_forward_train(self):
                             }
                             if lstm:
                                 assert "state_out" in fwd_out
-                                output_batch["state_in"] = state_in
+                                if tstep > 0:  # First states are already added
+                                    output_states = tree.map_structure(
+                                        lambda *s: torch.cat((s[0], s[1])),
+                                        output_states,
+                                        state_in,
+                                    )
                                 state_in = fwd_out["state_out"]
                             batches.append(output_batch)
                             obs = new_obs
@@ -344,10 +362,13 @@ def test_forward_train(self):
                             k: convert_to_torch_tensor(np.array(v))
                             for k, v in batch.items()
                         }
+                        if lstm:
+                            fwd_in["state_in"] = output_states
+                            fwd_in[SampleBatch.SEQ_LENS] = torch.Tensor([1] * 10)
 
                         # forward train
-                        # before training make sure it's on the right device and it's on
-                        # trianing mode
+                        # before training make sure module is on the right device and in
+                        # training mode
                         module.to("cpu")
                         module.train()
                         fwd_out = module.forward_train(fwd_in)

From 269a5dd0626182f8ac65aca2f5f311aea6da9364 Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Mon, 19 Dec 2022 16:05:10 +0100
Subject: [PATCH 07/14] add underscore to forward method

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 rllib/core/rl_module/encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rllib/core/rl_module/encoder.py b/rllib/core/rl_module/encoder.py
index 1c4d6232d55c..7793b3d2c7d2 100644
--- a/rllib/core/rl_module/encoder.py
+++ b/rllib/core/rl_module/encoder.py
@@ -163,7 +163,7 @@ def output_spec(self):
             }
         )
 
-    def forward(self, input_dict: SampleBatch):
+    def _forward(self, input_dict: SampleBatch):
         x = input_dict[SampleBatch.OBS]
         states = input_dict[STATE_IN]
         # states are batch-first when coming in

From b38d7cbf3679f011e35f2c7139f92817d6d6f83b Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Tue, 20 Dec 2022 19:06:19 +0100
Subject: [PATCH 08/14] initial

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 rllib/BUILD                                   |  9 ++-
 .../ppo/tests/test_ppo_with_rl_module.py      | 42 ++++++-------
 .../ppo/torch/ppo_torch_rl_module.py          | 60 +++++++++++--------
 rllib/core/rl_module/encoder.py               | 24 ++------
 rllib/core/rl_module/rl_module.py             | 16 +++--
 rllib/examples/cartpole_lstm.py               | 24 +++-----
 rllib/models/base_model.py                    |  2 +-
 rllib/models/configs/base.py                  | 31 ++++++++++
 rllib/models/configs/encoder.py               | 47 +++------------
 rllib/models/configs/identity.py              | 13 ++++
 ...ctor_encoder.py => test_vector_encoder.py} |  0
 rllib/models/torch/encoders/vector.py         |  4 +-
 rllib/models/torch/identity.py                | 21 +++++++
 rllib/models/torch/model.py                   | 11 +++-
 rllib/models/torch/tests/test_identity.py     | 31 ++++++++++
 15 files changed, 201 insertions(+), 134 deletions(-)
 create mode 100644 rllib/models/configs/base.py
 create mode 100644 rllib/models/configs/identity.py
 rename rllib/models/torch/encoders/tests/{test_torch_vector_encoder.py => test_vector_encoder.py} (100%)
 create mode 100644 rllib/models/torch/identity.py
 create mode 100644 rllib/models/torch/tests/test_identity.py

diff --git a/rllib/BUILD b/rllib/BUILD
index 92b8884efea4..76bf7bec3c16 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -1927,9 +1927,16 @@ py_test(
     name = "test_torch_vector_encoder",
     tags = ["team:rllib", "models"],
     size = "small",
-    srcs = ["models/torch/encoders/tests/test_torch_vector_encoder.py"]
+    srcs = ["models/torch/encoders/tests/test_vector_encoder.py"]
 )
 
+# test TorchIdentity
+py_test(
+    name = "test_torch_identity",
+    tags = ["team:rllib", "models"],
+    size = "small",
+    srcs = ["models/torch/tests/test_identity.py"]
+)
 
 # --------------------------------------------------------------------
 # Offline
diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
index d56701184e20..93aeee1b9a4d 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
@@ -14,10 +14,10 @@
     get_ppo_loss,
 )
 from ray.rllib.core.rl_module.encoder import (
-    IdentityConfig,
-    FCConfig,
     LSTMConfig,
 )
+from rllib.models.configs.identity import IdentityConfig
+from ray.rllib.models.configs.encoder import VectorEncoderConfig
 from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
 from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY
@@ -38,16 +38,13 @@ def get_expected_model_config(env, lstm, shared_encoder):
 
     if shared_encoder:
         assert not lstm, "LSTM can only be used in PI"
-        shared_encoder_config = FCConfig(
-            input_dim=obs_dim,
-            hidden_layers=[32],
-            activation="ReLU",
-            output_dim=32,
+        shared_encoder_config = VectorEncoderConfig(
+            hidden_layer_sizes=[32],
         )
-        pi_encoder_config = IdentityConfig(output_dim=32)
-        vf_encoder_config = IdentityConfig(output_dim=32)
+        pi_encoder_config = IdentityConfig()
+        vf_encoder_config = IdentityConfig()
     else:
-        shared_encoder_config = IdentityConfig(output_dim=obs_dim)
+        shared_encoder_config = IdentityConfig()
         if lstm:
             pi_encoder_config = LSTMConfig(
                 input_dim=obs_dim,
@@ -57,21 +54,16 @@ def get_expected_model_config(env, lstm, shared_encoder):
                 num_layers=1,
             )
         else:
-            pi_encoder_config = FCConfig(
-                input_dim=obs_dim,
-                output_dim=32,
-                hidden_layers=[32],
-                activation="ReLU",
+            pi_encoder_config = VectorEncoderConfig(
+                hidden_layer_sizes=[32],
             )
-        vf_encoder_config = FCConfig(
-            input_dim=obs_dim,
-            output_dim=32,
-            hidden_layers=[32],
-            activation="ReLU",
+        vf_encoder_config = VectorEncoderConfig(
+            hidden_layer_sizes=[32],
         )
 
-    pi_config = FCConfig()
-    vf_config = FCConfig()
+    pi_config = VectorEncoderConfig(hidden_layer_sizes=[32])
+
+    vf_config = VectorEncoderConfig(hidden_layer_sizes=[32])
 
     if isinstance(env.action_space, gym.spaces.Discrete):
         pi_config.output_dim = env.action_space.n
@@ -256,7 +248,8 @@ def test_rollouts(self):
         for env_name in ["CartPole-v1", "Pendulum-v1"]:
             for fwd_fn in ["forward_exploration", "forward_inference"]:
                 for shared_encoder in [False, True]:
-                    for lstm in [True, False]:
+                    # TODO (Artur) Reinable LSTM
+                    for lstm in [False]:
                         if lstm and shared_encoder:
                             # Not yet implemented
                             # TODO (Artur): Implement
@@ -294,7 +287,8 @@ def test_forward_train(self):
         for env_name in ["CartPole-v1", "Pendulum-v1"]:
             for fwd_fn in ["forward_exploration", "forward_inference"]:
                 for shared_encoder in [False, True]:
-                    for lstm in [True, False]:
+                    for lstm in [False]:
+                        # TODO (Artur) Reinable LSTM
                         if lstm and shared_encoder:
                             # Not yet implemented
                             # TODO (Artur): Implement
diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
index b6e4e27517b8..59fcae6850a6 100644
--- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
+++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
@@ -1,3 +1,4 @@
+import copy
 from dataclasses import dataclass
 import gym
 from typing import Mapping, Any, Union
@@ -15,15 +16,14 @@
     TorchDeterministic,
     TorchDiagGaussian,
 )
+from ray.rllib.models.configs.base import ModelConfig
 from ray.rllib.core.rl_module.encoder import (
-    FCNet,
     FCConfig,
     LSTMConfig,
-    IdentityConfig,
     LSTMEncoder,
     ENCODER_OUT,
 )
-
+from rllib.models.configs.identity import IdentityConfig
 
 torch, nn = try_import_torch()
 
@@ -55,11 +55,11 @@ class PPOModuleConfig(RLModuleConfig):
         shared_encoder: Whether to share the encoder between the pi and value
     """
 
-    pi_config: FCConfig = None
-    vf_config: FCConfig = None
-    pi_encoder_config: FCConfig = None
-    vf_encoder_config: FCConfig = None
-    shared_encoder_config: FCConfig = None
+    pi_config: ModelConfig = None
+    vf_config: ModelConfig = None
+    pi_encoder_config: ModelConfig = None
+    vf_encoder_config: ModelConfig = None
+    shared_encoder_config: ModelConfig = None
     free_log_std: bool = False
     shared_encoder: bool = True
 
@@ -75,23 +75,27 @@ def setup(self) -> None:
         assert self.config.pi_config, "pi_config must be provided."
         assert self.config.vf_config, "vf_config must be provided."
 
-        self.shared_encoder = self.config.shared_encoder_config.build()
-        self.pi_encoder = self.config.pi_encoder_config.build()
-        self.vf_encoder = self.config.vf_encoder_config.build()
-
-        self.pi = FCNet(
-            input_dim=self.config.pi_encoder_config.output_dim,
-            output_dim=self.config.pi_config.output_dim,
-            hidden_layers=self.config.pi_config.hidden_layers,
-            activation=self.config.pi_config.activation,
+        obs_spec = ModelSpec(
+            {  # bxt is just a name for better readability to indicated padded batch
+                SampleBatch.OBS: TorchTensorSpec(
+                    "bxt, h", h=self.config.observation_space.shape[0]
+                )
+            }
         )
 
-        self.vf = FCNet(
-            input_dim=self.config.vf_encoder_config.output_dim,
-            output_dim=1,
-            hidden_layers=self.config.vf_config.hidden_layers,
-            activation=self.config.vf_config.activation,
+        self.shared_encoder = self.config.shared_encoder_config.build(
+            input_spec=obs_spec
+        )
+        self.pi_encoder = self.config.pi_encoder_config.build(
+            self.shared_encoder.output_spec
         )
+        self.vf_encoder = self.config.vf_encoder_config.build(
+            self.shared_encoder.output_spec
+        )
+
+        self.pi = self.config.pi_encoder_config.build(self.pi_encoder.output_spec)
+
+        self.vf = self.config.pi_encoder_config.build(self.vf_encoder.output_spec)
 
         self._is_discrete = isinstance(self.config.action_space, gym.spaces.Discrete)
 
@@ -209,10 +213,12 @@ def get_initial_state(self) -> NestedDict:
                 return self.pi_encoder.get_inital_state()
         return {}
 
+    @property
     @override(RLModule)
     def input_specs_inference(self) -> ModelSpec:
-        return self.input_specs_exploration()
+        return self.input_specs_exploration
 
+    @property
     @override(RLModule)
     def output_specs_inference(self) -> ModelSpec:
         return ModelSpec({SampleBatch.ACTION_DIST: TorchDeterministic})
@@ -234,10 +240,12 @@ def _forward_inference(self, batch: NestedDict) -> Mapping[str, Any]:
         output["state_out"] = pi_enc_out.get("state_out", {})
         return output
 
+    @property
     @override(RLModule)
     def input_specs_exploration(self):
-        return self.shared_encoder.input_spec()
+        return self.shared_encoder.input_spec
 
+    @property
     @override(RLModule)
     def output_specs_exploration(self) -> ModelSpec:
         specs = {SampleBatch.ACTION_DIST: self.__get_action_dist_type()}
@@ -282,6 +290,7 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]:
         output["state_out"] = encoder_out_pi.get("state_out", {})
         return output
 
+    @property
     @override(RLModule)
     def input_specs_train(self) -> ModelSpec:
         if self._is_discrete:
@@ -290,13 +299,14 @@ def input_specs_train(self) -> ModelSpec:
             action_dim = self.config.action_space.shape[0]
             action_spec = TorchTensorSpec("b, h", h=action_dim)
 
-        spec_dict = self.shared_encoder.input_spec()
+        spec_dict = copy.deepcopy(self.shared_encoder.input_spec)
         spec_dict.update({SampleBatch.ACTIONS: action_spec})
         if SampleBatch.OBS in spec_dict:
             spec_dict[SampleBatch.NEXT_OBS] = spec_dict[SampleBatch.OBS]
         spec = ModelSpec(spec_dict)
         return spec
 
+    @property
     @override(RLModule)
     def output_specs_train(self) -> ModelSpec:
         spec = ModelSpec(
diff --git a/rllib/core/rl_module/encoder.py b/rllib/core/rl_module/encoder.py
index 7793b3d2c7d2..820b305a2ed9 100644
--- a/rllib/core/rl_module/encoder.py
+++ b/rllib/core/rl_module/encoder.py
@@ -29,14 +29,6 @@ class EncoderConfig:
     output_dim: int = None
 
 
-@dataclass
-class IdentityConfig(EncoderConfig):
-    """Configuration for an identity encoder."""
-
-    def build(self):
-        return IdentityEncoder(self)
-
-
 @dataclass
 class FCConfig(EncoderConfig):
     """Configuration for a fully connected network.
@@ -70,15 +62,17 @@ class Encoder(nn.Module):
     def __init__(self, config: EncoderConfig) -> None:
         super().__init__()
         self.config = config
-        self._input_spec = self.input_spec()
-        self._output_spec = self.output_spec()
+        self._input_spec = self.input_spec
+        self._output_spec = self.output_spec
 
     def get_inital_state(self):
         return []
 
+    @property
     def input_spec(self):
         return ModelSpec()
 
+    @property
     def output_spec(self):
         return ModelSpec()
 
@@ -101,11 +95,13 @@ def __init__(self, config: FCConfig) -> None:
             activation=config.activation,
         )
 
+    @property
     def input_spec(self):
         return ModelSpec(
             {SampleBatch.OBS: TorchTensorSpec("b, h", h=self.config.input_dim)}
         )
 
+    @property
     def output_spec(self):
         return ModelSpec(
             {ENCODER_OUT: TorchTensorSpec("b, h", h=self.config.output_dim)}
@@ -185,11 +181,3 @@ def _forward(self, input_dict: SampleBatch):
             ENCODER_OUT: x,
             "state_out": tree.map_structure(lambda x: x.transpose(0, 1), states_o),
         }
-
-
-class IdentityEncoder(Encoder):
-    def __init__(self, config: EncoderConfig) -> None:
-        super().__init__(config)
-
-    def _forward(self, input_dict):
-        return input_dict
diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py
index 3e2ca5547955..56ee7cec867c 100644
--- a/rllib/core/rl_module/rl_module.py
+++ b/rllib/core/rl_module/rl_module.py
@@ -123,12 +123,12 @@ def __post_init__(self):
         This is a good place to do any initialization that requires access to the
         subclass's attributes.
         """
-        self._input_specs_train = self.input_specs_train()
-        self._output_specs_train = self.output_specs_train()
-        self._input_specs_exploration = self.input_specs_exploration()
-        self._output_specs_exploration = self.output_specs_exploration()
-        self._input_specs_inference = self.input_specs_inference()
-        self._output_specs_inference = self.output_specs_inference()
+        self._input_specs_train = self.input_specs_train
+        self._output_specs_train = self.output_specs_train
+        self._input_specs_exploration = self.input_specs_exploration
+        self._output_specs_exploration = self.output_specs_exploration
+        self._input_specs_inference = self.input_specs_inference
+        self._output_specs_inference = self.output_specs_inference
 
     @classmethod
     def from_model_config(
@@ -209,18 +209,22 @@ def output_specs_exploration(self) -> ModelSpec:
         """
         return ModelSpec({"action_dist": Distribution})
 
+    @property
     def output_specs_train(self) -> ModelSpec:
         """Returns the output specs of the forward_train method."""
         return ModelSpec()
 
+    @property
     def input_specs_inference(self) -> ModelSpec:
         """Returns the input specs of the forward_inference method."""
         return ModelSpec()
 
+    @property
     def input_specs_exploration(self) -> ModelSpec:
         """Returns the input specs of the forward_exploration method."""
         return ModelSpec()
 
+    @property
     def input_specs_train(self) -> ModelSpec:
         """Returns the input specs of the forward_train method."""
         return ModelSpec()
diff --git a/rllib/examples/cartpole_lstm.py b/rllib/examples/cartpole_lstm.py
index c14ed8d9ec36..e7dc14beb2b4 100644
--- a/rllib/examples/cartpole_lstm.py
+++ b/rllib/examples/cartpole_lstm.py
@@ -1,5 +1,4 @@
 import argparse
-import os
 
 from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
 from ray.rllib.utils.test_utils import check_learning_achieved
@@ -12,7 +11,7 @@
 parser.add_argument(
     "--framework",
     choices=["tf", "tf2", "torch"],
-    default="tf",
+    default="torch",
     help="The DL framework specifier.",
 )
 parser.add_argument("--eager-tracing", action="store_true")
@@ -50,11 +49,6 @@
             },
             "vf_loss_coeff": 0.0001,
         },
-        "IMPALA": {
-            "num_workers": 2,
-            "num_gpus": 0,
-            "vf_loss_coeff": 0.01,
-        },
     }
 
     config = dict(
@@ -62,13 +56,6 @@
         **{
             "env": StatelessCartPole,
             # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
-            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
-            "model": {
-                "use_lstm": True,
-                "lstm_cell_size": 256,
-                "lstm_use_prev_action": args.use_prev_action,
-                "lstm_use_prev_reward": args.use_prev_reward,
-            },
             "framework": args.framework,
             # Run with tracing enabled for tf2?
             "eager_tracing": args.eager_tracing,
@@ -115,7 +102,14 @@
     # >>         prev_r = reward
 
     tuner = tune.Tuner(
-        args.run, param_space=config, run_config=air.RunConfig(stop=stop, verbose=2)
+        args.run,
+        param_space=config,
+        run_config=air.RunConfig(
+            stop={"num_env_steps_trained": 1},
+            checkpoint_config=air.CheckpointConfig(
+                checkpoint_at_end=True,
+            ),
+        ),
     )
     results = tuner.fit()
 
diff --git a/rllib/models/base_model.py b/rllib/models/base_model.py
index c006af27f6f1..d6784ed64f3e 100644
--- a/rllib/models/base_model.py
+++ b/rllib/models/base_model.py
@@ -205,7 +205,7 @@ class Model(RecurrentModel):
     """A RecurrentModel made non-recurrent by ignoring
     the input/output states.
 
-    As a convienience, users may override _forward instead of _unroll,
+    As a convenience, users may override _forward instead of _unroll,
     which hides model states.
 
     Args:
diff --git a/rllib/models/configs/base.py b/rllib/models/configs/base.py
new file mode 100644
index 000000000000..7b59ac59aff5
--- /dev/null
+++ b/rllib/models/configs/base.py
@@ -0,0 +1,31 @@
+import abc
+from dataclasses import dataclass
+
+from ray.rllib.models.specs.specs_dict import ModelSpec
+
+
+@dataclass
+class ModelConfig:
+    """The base config for models.
+
+    Each config should define a `build` method that builds a model from the config.
+
+    All user-configurable parameters known before runtime
+    (e.g. framework, activation, num layers, etc.) should be defined as attributes.
+
+    Parameters unknown before runtime (e.g. the output size of the module providing
+    input for this module) should be passed as arguments to `build`. This should be
+    as few params as possible.
+
+    `build` should return an instance of the model associated with the config.
+
+    Attributes:
+        framework_str: The tensor framework to construct a model for.
+            This can be 'torch', 'tf2', or 'jax'.
+    """
+
+    framework_str: str = "torch"
+
+    @abc.abstractmethod
+    def build(self, input_spec: ModelSpec, **kwargs) -> "Encoder":
+        """Builds the ModelConfig into an Encoder instance"""
diff --git a/rllib/models/configs/encoder.py b/rllib/models/configs/encoder.py
index 07d87e6f7df2..903eac4d0fde 100644
--- a/rllib/models/configs/encoder.py
+++ b/rllib/models/configs/encoder.py
@@ -1,43 +1,16 @@
-import abc
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Tuple
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, List
 
+from ray.rllib.models.configs.base import ModelConfig
 from ray.rllib.models.specs.specs_dict import ModelSpec
 from ray.rllib.models.torch.encoders.vector import TorchVectorEncoder
 
 if TYPE_CHECKING:
-    from ray.rllib.models.torch.encoders.vector import Encoder
+    pass
 
 
 @dataclass
-class EncoderConfig:
-    """The base config for encoder models.
-
-    Each config should define a `build` method that builds a model from the config.
-
-    All user-configurable parameters known before runtime
-    (e.g. framework, activation, num layers, etc.) should be defined as attributes.
-
-    Parameters unknown before runtime (e.g. the output size of the module providing
-    input for this module) should be passed as arguments to `build`. This should be
-    as few params as possible.
-
-    `build` should return an instance of the encoder associated with the config.
-
-    Attributes:
-        framework_str: The tensor framework to construct a model for.
-            This can be 'torch', 'tf2', or 'jax'.
-    """
-
-    framework_str: str = "torch"
-
-    @abc.abstractmethod
-    def build(self, input_spec: ModelSpec, **kwargs) -> "Encoder":
-        """Builds the EncoderConfig into an Encoder instance"""
-
-
-@dataclass
-class VectorEncoderConfig(EncoderConfig):
+class VectorEncoderConfig(ModelConfig):
     """An MLP encoder mappings tensors with shape [..., feature] to [..., output].
 
     Attributes:
@@ -45,7 +18,7 @@ class VectorEncoderConfig(EncoderConfig):
             Options are 'relu', 'swish', 'tanh', or 'linear'
         final_activation: The activation function to use after the final linear layer.
             Options are the same as for activation.
-        hidden_layer_sizes: A list, where each element represents the number of neurons
+        hidden_layer_sizes: A tuple, where each element represents the number of neurons
             in that layer. For example, [128, 64] would produce a two-layer MLP with
             128 hidden neurons and 64 hidden neurons.
         output_key: Write the output of the encoder to this key in the NestedDict.
@@ -53,8 +26,8 @@ class VectorEncoderConfig(EncoderConfig):
 
     activation: str = "relu"
     final_activation: str = "linear"
-    hidden_layer_sizes: Tuple[int, ...] = (128, 128)
-    output_key: str = "encoding"
+    hidden_layer_sizes: List[int] = field(default_factory=lambda: list((128, 128)))
+    output_key: str = "embedding"
 
     def build(self, input_spec: ModelSpec) -> TorchVectorEncoder:
         """Build the config into a VectorEncoder model instance.
@@ -66,9 +39,7 @@ def build(self, input_spec: ModelSpec) -> TorchVectorEncoder:
         Returns:
             A VectorEncoder of the specified framework.
         """
-        assert (
-            len(self.hidden_layer_sizes) > 1
-        ), "Must have at least a single hidden layer"
+        assert len(self.hidden_layer_sizes) > 0, "Must have at least a single layer"
         for k in input_spec.shallow_keys():
             assert isinstance(
                 input_spec[k].shape[-1], int
diff --git a/rllib/models/configs/identity.py b/rllib/models/configs/identity.py
new file mode 100644
index 000000000000..0fa788fd20df
--- /dev/null
+++ b/rllib/models/configs/identity.py
@@ -0,0 +1,13 @@
+from dataclasses import dataclass
+
+from ray.rllib.core.rl_module.encoder import EncoderConfig
+from rllib.models.torch.identity import Identity
+from ray.rllib.models.specs.specs_dict import ModelSpec
+
+
+@dataclass
+class IdentityConfig(EncoderConfig):
+    """Configuration for an identity encoder."""
+
+    def build(self, input_spec: ModelSpec) -> Identity:
+        return Identity(input_spec=input_spec, config=self)
diff --git a/rllib/models/torch/encoders/tests/test_torch_vector_encoder.py b/rllib/models/torch/encoders/tests/test_vector_encoder.py
similarity index 100%
rename from rllib/models/torch/encoders/tests/test_torch_vector_encoder.py
rename to rllib/models/torch/encoders/tests/test_vector_encoder.py
diff --git a/rllib/models/torch/encoders/vector.py b/rllib/models/torch/encoders/vector.py
index 2feed58dd11d..304d7b1b672f 100644
--- a/rllib/models/torch/encoders/vector.py
+++ b/rllib/models/torch/encoders/vector.py
@@ -60,9 +60,7 @@ def __init__(
             prev_size = size
 
         # Final layer
-        layers += [
-            nn.Linear(config.hidden_layer_sizes[-2], config.hidden_layer_sizes[-1])
-        ]
+        layers += [nn.Linear(prev_size, config.hidden_layer_sizes[-1])]
         if config.final_activation != "linear":
             layers += [
                 get_activation_fn(
diff --git a/rllib/models/torch/identity.py b/rllib/models/torch/identity.py
new file mode 100644
index 000000000000..cf0c29514ab4
--- /dev/null
+++ b/rllib/models/torch/identity.py
@@ -0,0 +1,21 @@
+from ray.rllib.core.rl_module.encoder import Encoder, EncoderConfig
+from ray.rllib.models.specs.specs_dict import ModelSpec
+
+
+class Identity(Encoder):
+    """Implements the identity function."""
+
+    def __init__(self, input_spec: ModelSpec, config: EncoderConfig) -> None:
+        self._input_spec = input_spec
+        super().__init__(config)
+
+    @property
+    def input_spec(self) -> ModelSpec:
+        return self._input_spec
+
+    @property
+    def output_spec(self) -> ModelSpec:
+        return self._input_spec
+
+    def _forward(self, input_dict) -> dict:
+        return input_dict
diff --git a/rllib/models/torch/model.py b/rllib/models/torch/model.py
index 30f341d63238..ad494e67b670 100644
--- a/rllib/models/torch/model.py
+++ b/rllib/models/torch/model.py
@@ -1,13 +1,14 @@
 import torch
-from torch import nn
 import tree
+from torch import nn
 
+from ray.rllib.models.base_model import RecurrentModel, Model, ModelIO
+from ray.rllib.models.specs.specs_dict import check_specs
+from ray.rllib.models.temp_spec_classes import TensorDict, ModelConfig
 from ray.rllib.utils.annotations import (
     DeveloperAPI,
     override,
 )
-from ray.rllib.models.temp_spec_classes import TensorDict, ModelConfig
-from ray.rllib.models.base_model import RecurrentModel, Model, ModelIO
 
 
 class TorchModelIO(ModelIO):
@@ -218,3 +219,7 @@ def __init__(self, config: ModelConfig) -> None:
         Model.__init__(self)
         nn.Module.__init__(self)
         TorchModelIO.__init__(self, config)
+
+    @check_specs(input_spec="_input_spec", output_spec="_output_spec")
+    def forward(self, input_dict):
+        return self._forward(input_dict)
diff --git a/rllib/models/torch/tests/test_identity.py b/rllib/models/torch/tests/test_identity.py
new file mode 100644
index 000000000000..51c10f31e689
--- /dev/null
+++ b/rllib/models/torch/tests/test_identity.py
@@ -0,0 +1,31 @@
+import unittest
+
+import torch
+
+from ray.rllib.models.configs.identity import IdentityConfig
+from ray.rllib.models.specs.specs_dict import ModelSpec
+from ray.rllib.models.specs.specs_torch import TorchTensorSpec
+from ray.rllib.utils.nested_dict import NestedDict
+
+
+class TestConfig(unittest.TestCase):
+    def test_build(self):
+        """Test building with the default config"""
+        input_spec = ModelSpec({"bork": TorchTensorSpec("a, b, c", c=3)})
+        c = IdentityConfig()
+        c.build(input_spec)
+
+    def test_identity(self):
+        """Test the default config/model _forward implementation"""
+        input_spec = ModelSpec({"bork": TorchTensorSpec("a, b, c", c=3)})
+        c = IdentityConfig()
+        m = c.build(input_spec)
+        inputs = NestedDict({"bork": torch.rand((2, 4, 3))})
+        self.assertEqual(m(inputs), inputs)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+
+    sys.exit(pytest.main(["-v", __file__]))

From 89ebc6a67d4cdb2d1ce07f405a793db9110e516c Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Tue, 20 Dec 2022 21:46:44 +0100
Subject: [PATCH 09/14] wip

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 rllib/algorithms/ppo/torch/ppo_torch_rl_module.py | 7 +++++--
 rllib/core/rl_module/encoder.py                   | 2 +-
 rllib/models/torch/model.py                       | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
index 59fcae6850a6..557c50a54bd9 100644
--- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
+++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
@@ -272,7 +272,8 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]:
         encoder_out = self.shared_encoder(batch)
         encoder_out_pi = self.pi_encoder(encoder_out)
         encoder_out_vf = self.vf_encoder(encoder_out)
-        action_logits = self.pi(encoder_out_pi[ENCODER_OUT])
+        pi_out = self.pi(encoder_out_pi)
+        action_logits = pi_out[ENCODER_OUT]
 
         output = {}
         if self._is_discrete:
@@ -286,7 +287,9 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]:
         output[SampleBatch.ACTION_DIST] = action_dist
 
         # compute the value function
-        output[SampleBatch.VF_PREDS] = self.vf(encoder_out_vf[ENCODER_OUT]).squeeze(-1)
+        vf_out = self.vf(encoder_out_vf)
+        output[SampleBatch.VF_PREDS] = vf_out[ENCODER_OUT].squeeze(-1)
+
         output["state_out"] = encoder_out_pi.get("state_out", {})
         return output
 
diff --git a/rllib/core/rl_module/encoder.py b/rllib/core/rl_module/encoder.py
index 820b305a2ed9..2f79d836eedd 100644
--- a/rllib/core/rl_module/encoder.py
+++ b/rllib/core/rl_module/encoder.py
@@ -13,7 +13,7 @@
 
 # TODO (Kourosh): Find a better / more straight fwd approach for sub-components
 
-ENCODER_OUT = "encoder_out"
+ENCODER_OUT = "embedding"
 STATE_IN = "state_in"
 
 
diff --git a/rllib/models/torch/model.py b/rllib/models/torch/model.py
index ad494e67b670..6738c3cb2566 100644
--- a/rllib/models/torch/model.py
+++ b/rllib/models/torch/model.py
@@ -220,6 +220,6 @@ def __init__(self, config: ModelConfig) -> None:
         nn.Module.__init__(self)
         TorchModelIO.__init__(self, config)
 
-    @check_specs(input_spec="_input_spec", output_spec="_output_spec")
+    @check_specs(input_spec="_input_spec", output_spec="_output_spec", filter=True)
     def forward(self, input_dict):
         return self._forward(input_dict)

From b00b9eed137fd067e77c0eac160f5a68c0d25254 Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Wed, 21 Dec 2022 10:41:19 +0100
Subject: [PATCH 10/14] better docs for get expected model config

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 .../ppo/tests/test_ppo_with_rl_module.py          | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
index d56701184e20..20ee0e072405 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
@@ -31,8 +31,19 @@
 from ray.rllib.utils.torch_utils import convert_to_torch_tensor
 
 
-# Get a model config that we expect from the catalog
-def get_expected_model_config(env, lstm, shared_encoder):
+def get_expected_model_config(env, lstm, shared_encoder) -> PPOModuleConfig:
+    """Get a PPOModuleConfig that we would expect from the catalog otherwise.
+
+    Args:
+        env: Environment for which we build the model later
+        lstm: If True, build recurrent pi encoder
+        shared_encoder: If True, build a shared encoder for pi and vf, where pi
+            encoder and vf encoder will be identity. If False, the shared encoder
+            will be identity.
+
+    Returns:
+         A PPOModuleConfig containing the relevant configs to build PPORLModule
+    """
     assert not len(env.observation_space.shape) == 3, "Implement VisionNet first!"
     obs_dim = env.observation_space.shape[0]
 

From 8157f63388acfe2f221d07170db60d1d910a5fa5 Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Wed, 21 Dec 2022 11:56:19 +0100
Subject: [PATCH 11/14] kourosh's comments

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 .../ppo/tests/test_ppo_rl_module.py           | 227 ++++++++++++++++++
 .../ppo/tests/test_ppo_with_rl_module.py      | 216 +----------------
 2 files changed, 228 insertions(+), 215 deletions(-)
 create mode 100644 rllib/algorithms/ppo/tests/test_ppo_rl_module.py

diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
new file mode 100644
index 000000000000..7adde54cfffd
--- /dev/null
+++ b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
@@ -0,0 +1,227 @@
+import ray
+import unittest
+import numpy as np
+import gym
+import torch
+import tree
+
+from ray.rllib import SampleBatch
+from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule, \
+    get_ppo_loss, PPOModuleConfig
+from ray.rllib.core.rl_module.encoder import FCConfig, IdentityConfig, LSTMConfig
+from ray.rllib.utils.numpy import convert_to_numpy
+from ray.rllib.utils.torch_utils import convert_to_torch_tensor
+
+
+def get_expected_model_config(env, lstm, shared_encoder) -> PPOModuleConfig:
+    """Get a PPOModuleConfig that we would expect from the catalog otherwise.
+
+    Args:
+        env: Environment for which we build the model later
+        lstm: If True, build recurrent pi encoder
+        shared_encoder: If True, build a shared encoder for pi and vf, where pi
+            encoder and vf encoder will be identity. If False, the shared encoder
+            will be identity.
+
+    Returns:
+         A PPOModuleConfig containing the relevant configs to build PPORLModule
+    """
+    assert len(env.observation_space.shape) == 1, "No multidimensional obs space " \
+                                                  "supported."
+    obs_dim = env.observation_space.shape[0]
+
+    if shared_encoder:
+        assert not lstm, "LSTM can only be used in PI"
+        shared_encoder_config = FCConfig(
+            input_dim=obs_dim,
+            hidden_layers=[32],
+            activation="ReLU",
+            output_dim=32,
+        )
+        pi_encoder_config = IdentityConfig(output_dim=32)
+        vf_encoder_config = IdentityConfig(output_dim=32)
+    else:
+        shared_encoder_config = IdentityConfig(output_dim=obs_dim)
+        if lstm:
+            pi_encoder_config = LSTMConfig(
+                input_dim=obs_dim,
+                hidden_dim=32,
+                batch_first=True,
+                output_dim=32,
+                num_layers=1,
+            )
+        else:
+            pi_encoder_config = FCConfig(
+                input_dim=obs_dim,
+                output_dim=32,
+                hidden_layers=[32],
+                activation="ReLU",
+            )
+        vf_encoder_config = FCConfig(
+            input_dim=obs_dim,
+            output_dim=32,
+            hidden_layers=[32],
+            activation="ReLU",
+        )
+
+    pi_config = FCConfig()
+    vf_config = FCConfig()
+
+    if isinstance(env.action_space, gym.spaces.Discrete):
+        pi_config.output_dim = env.action_space.n
+    else:
+        pi_config.output_dim = env.action_space.shape[0] * 2
+
+    return PPOModuleConfig(
+        observation_space=env.observation_space,
+        action_space=env.action_space,
+        shared_encoder_config=shared_encoder_config,
+        pi_encoder_config=pi_encoder_config,
+        vf_encoder_config=vf_encoder_config,
+        pi_config=pi_config,
+        vf_config=vf_config,
+        shared_encoder=shared_encoder,
+    )
+
+
+class TestPPO(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        ray.init()
+
+    @classmethod
+    def tearDownClass(cls):
+        ray.shutdown()
+
+    def test_rollouts(self):
+        # TODO: Add BreakoutNoFrameskip-v4 to cover a 3D obs space
+        for env_name in ["CartPole-v1", "Pendulum-v1"]:
+            for fwd_fn in ["forward_exploration", "forward_inference"]:
+                for shared_encoder in [False, True]:
+                    for lstm in [True, False]:
+                        if lstm and shared_encoder:
+                            # Not yet implemented
+                            # TODO (Artur): Implement
+                            continue
+                        print(
+                            f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
+                            f"{shared_encoder}] | LSTM={lstm}"
+                        )
+                        env = gym.make(env_name)
+
+                        config = get_expected_model_config(env, lstm, shared_encoder)
+                        module = PPOTorchRLModule(config)
+
+                        obs = env.reset()
+
+                        batch = {
+                            SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
+                        }
+
+                        if lstm:
+                            state_in = module.pi_encoder.get_inital_state()
+                            state_in = tree.map_structure(
+                                lambda x: x[None], convert_to_torch_tensor(state_in)
+                            )
+                            batch["state_in"] = state_in
+                            batch["seq_lens"] = torch.Tensor([1])
+
+                        if fwd_fn == "forward_exploration":
+                            module.forward_exploration(batch)
+                        elif fwd_fn == "forward_inference":
+                            module.forward_inference(batch)
+
+
+    def test_forward_train(self):
+        # TODO: Add BreakoutNoFrameskip-v4 to cover a 3D obs space
+        for env_name in ["CartPole-v1", "Pendulum-v1"]:
+            for fwd_fn in ["forward_exploration", "forward_inference"]:
+                for shared_encoder in [False, True]:
+                    for lstm in [True, False]:
+                        if lstm and shared_encoder:
+                            # Not yet implemented
+                            # TODO (Artur): Implement
+                            continue
+                        print(
+                            f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
+                            f"{shared_encoder}] | LSTM={lstm}"
+                        )
+                        env = gym.make(env_name)
+
+                        config = get_expected_model_config(env, lstm, shared_encoder)
+                        module = PPOTorchRLModule(config)
+
+                        # collect a batch of data
+                        batches = []
+                        obs = env.reset()
+                        tstep = 0
+                        if lstm:
+                            # TODO (Artur): Multiple states
+                            state_in = module.pi_encoder.get_inital_state()
+                            state_in = tree.map_structure(
+                                lambda x: x[None], convert_to_torch_tensor(state_in)
+                            )
+                            output_states = state_in
+                        while tstep < 10:
+                            if lstm:
+                                input_batch = {
+                                    SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
+                                    "state_in": state_in,
+                                    SampleBatch.SEQ_LENS: np.array([1]),
+                                }
+                            else:
+                                input_batch = {
+                                    SampleBatch.OBS: convert_to_torch_tensor(obs)[None]
+                                }
+                            fwd_out = module.forward_exploration(input_batch)
+                            action = convert_to_numpy(
+                                fwd_out["action_dist"].sample().squeeze(0)
+                            )
+                            new_obs, reward, done, _ = env.step(action)
+                            output_batch = {
+                                SampleBatch.OBS: obs,
+                                SampleBatch.NEXT_OBS: new_obs,
+                                SampleBatch.ACTIONS: action,
+                                SampleBatch.REWARDS: np.array(reward),
+                                SampleBatch.DONES: np.array(done),
+                            }
+                            if lstm:
+                                assert "state_out" in fwd_out
+                                if tstep > 0:  # First states are already added
+
+                                    # Extend nested batches of states
+                                    output_states = tree.map_structure(
+                                        lambda *s: torch.cat((s[0], s[1])),
+                                        output_states,
+                                        state_in,
+                                    )
+                                state_in = fwd_out["state_out"]
+                            batches.append(output_batch)
+                            obs = new_obs
+                            tstep += 1
+
+                        # convert the list of dicts to dict of lists
+                        batch = tree.map_structure(lambda *x: list(x), *batches)
+                        # convert dict of lists to dict of tensors
+                        fwd_in = {
+                            k: convert_to_torch_tensor(np.array(v))
+                            for k, v in batch.items()
+                        }
+                        if lstm:
+                            fwd_in["state_in"] = output_states
+                            fwd_in[SampleBatch.SEQ_LENS] = torch.Tensor([1] * 10)
+
+                        # forward train
+                        # before training make sure module is on the right device and in
+                        # training mode
+                        module.to("cpu")
+                        module.train()
+                        fwd_out = module.forward_train(fwd_in)
+                        loss = get_ppo_loss(fwd_in, fwd_out)
+                        loss.backward()
+
+                        # check that all neural net parameters have gradients
+                        for param in module.parameters():
+                            pass
+                            self.assertIsNotNone(param.grad)
+
diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
index 20ee0e072405..9f2f42c15238 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
@@ -1,104 +1,18 @@
 import unittest
 
 import numpy as np
-import tree
-import gym
-import torch
 
 import ray
 import ray.rllib.algorithms.ppo as ppo
 from ray.rllib.algorithms.callbacks import DefaultCallbacks
-from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import (
-    PPOModuleConfig,
-    PPOTorchRLModule,
-    get_ppo_loss,
-)
-from ray.rllib.core.rl_module.encoder import (
-    IdentityConfig,
-    FCConfig,
-    LSTMConfig,
-)
 from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
-from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY
-from ray.rllib.utils.numpy import convert_to_numpy
 from ray.rllib.utils.test_utils import (
     check,
     check_compute_single_action,
     check_train_results,
     framework_iterator,
 )
-from ray.rllib.utils.torch_utils import convert_to_torch_tensor
-
-
-def get_expected_model_config(env, lstm, shared_encoder) -> PPOModuleConfig:
-    """Get a PPOModuleConfig that we would expect from the catalog otherwise.
-
-    Args:
-        env: Environment for which we build the model later
-        lstm: If True, build recurrent pi encoder
-        shared_encoder: If True, build a shared encoder for pi and vf, where pi
-            encoder and vf encoder will be identity. If False, the shared encoder
-            will be identity.
-
-    Returns:
-         A PPOModuleConfig containing the relevant configs to build PPORLModule
-    """
-    assert not len(env.observation_space.shape) == 3, "Implement VisionNet first!"
-    obs_dim = env.observation_space.shape[0]
-
-    if shared_encoder:
-        assert not lstm, "LSTM can only be used in PI"
-        shared_encoder_config = FCConfig(
-            input_dim=obs_dim,
-            hidden_layers=[32],
-            activation="ReLU",
-            output_dim=32,
-        )
-        pi_encoder_config = IdentityConfig(output_dim=32)
-        vf_encoder_config = IdentityConfig(output_dim=32)
-    else:
-        shared_encoder_config = IdentityConfig(output_dim=obs_dim)
-        if lstm:
-            pi_encoder_config = LSTMConfig(
-                input_dim=obs_dim,
-                hidden_dim=32,
-                batch_first=True,
-                output_dim=32,
-                num_layers=1,
-            )
-        else:
-            pi_encoder_config = FCConfig(
-                input_dim=obs_dim,
-                output_dim=32,
-                hidden_layers=[32],
-                activation="ReLU",
-            )
-        vf_encoder_config = FCConfig(
-            input_dim=obs_dim,
-            output_dim=32,
-            hidden_layers=[32],
-            activation="ReLU",
-        )
-
-    pi_config = FCConfig()
-    vf_config = FCConfig()
-
-    if isinstance(env.action_space, gym.spaces.Discrete):
-        pi_config.output_dim = env.action_space.n
-    else:
-        pi_config.output_dim = env.action_space.shape[0] * 2
-
-    return PPOModuleConfig(
-        observation_space=env.observation_space,
-        action_space=env.action_space,
-        shared_encoder_config=shared_encoder_config,
-        pi_encoder_config=pi_encoder_config,
-        vf_encoder_config=vf_encoder_config,
-        pi_config=pi_config,
-        vf_config=vf_config,
-        shared_encoder=shared_encoder,
-    )
 
 
 class MyCallbacks(DefaultCallbacks):
@@ -137,7 +51,7 @@ def on_train_result(self, *, algorithm, result: dict, **kwargs):
 class TestPPO(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        ray.init(local_mode=True)
+        ray.init()
 
     @classmethod
     def tearDownClass(cls):
@@ -262,134 +176,6 @@ def test_ppo_exploration_setup(self):
             check(np.mean(actions), 1.5, atol=0.2)
             trainer.stop()
 
-    def test_rollouts(self):
-        # TODO: Add BreakoutNoFrameskip-v4 to cover a 3D obs space
-        for env_name in ["CartPole-v1", "Pendulum-v1"]:
-            for fwd_fn in ["forward_exploration", "forward_inference"]:
-                for shared_encoder in [False, True]:
-                    for lstm in [True, False]:
-                        if lstm and shared_encoder:
-                            # Not yet implemented
-                            # TODO (Artur): Implement
-                            continue
-                        print(
-                            f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
-                            f"{shared_encoder}] | LSTM={lstm}"
-                        )
-                        env = gym.make(env_name)
-
-                        config = get_expected_model_config(env, lstm, shared_encoder)
-                        module = PPOTorchRLModule(config)
-
-                        obs = env.reset()
-
-                        batch = {
-                            SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
-                        }
-
-                        if lstm:
-                            state_in = module.pi_encoder.get_inital_state()
-                            state_in = tree.map_structure(
-                                lambda x: x[None], convert_to_torch_tensor(state_in)
-                            )
-                            batch["state_in"] = state_in
-                            batch["seq_lens"] = torch.Tensor([1])
-
-                        if fwd_fn == "forward_exploration":
-                            module.forward_exploration(batch)
-                        elif fwd_fn == "forward_inference":
-                            module.forward_inference(batch)
-
-    def test_forward_train(self):
-        # TODO: Add BreakoutNoFrameskip-v4 to cover a 3D obs space
-        for env_name in ["CartPole-v1", "Pendulum-v1"]:
-            for fwd_fn in ["forward_exploration", "forward_inference"]:
-                for shared_encoder in [False, True]:
-                    for lstm in [True, False]:
-                        if lstm and shared_encoder:
-                            # Not yet implemented
-                            # TODO (Artur): Implement
-                            continue
-                        print(
-                            f"[ENV={env_name}] | [FWD={fwd_fn}] | [SHARED="
-                            f"{shared_encoder}] | LSTM={lstm}"
-                        )
-                        env = gym.make(env_name)
-
-                        config = get_expected_model_config(env, lstm, shared_encoder)
-                        module = PPOTorchRLModule(config)
-
-                        # collect a batch of data
-                        batches = []
-                        obs = env.reset()
-                        tstep = 0
-                        if lstm:
-                            # TODO (Artur): Multiple states
-                            state_in = module.pi_encoder.get_inital_state()
-                            state_in = tree.map_structure(
-                                lambda x: x[None], convert_to_torch_tensor(state_in)
-                            )
-                            output_states = state_in
-                        while tstep < 10:
-                            if lstm:
-                                input_batch = {
-                                    SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
-                                    "state_in": state_in,
-                                    SampleBatch.SEQ_LENS: np.array([1]),
-                                }
-                            else:
-                                input_batch = {
-                                    SampleBatch.OBS: convert_to_torch_tensor(obs)[None]
-                                }
-                            fwd_out = module.forward_exploration(input_batch)
-                            action = convert_to_numpy(
-                                fwd_out["action_dist"].sample().squeeze(0)
-                            )
-                            new_obs, reward, done, _ = env.step(action)
-                            output_batch = {
-                                SampleBatch.OBS: obs,
-                                SampleBatch.NEXT_OBS: new_obs,
-                                SampleBatch.ACTIONS: action,
-                                SampleBatch.REWARDS: np.array(reward),
-                                SampleBatch.DONES: np.array(done),
-                            }
-                            if lstm:
-                                assert "state_out" in fwd_out
-                                if tstep > 0:  # First states are already added
-                                    output_states = tree.map_structure(
-                                        lambda *s: torch.cat((s[0], s[1])),
-                                        output_states,
-                                        state_in,
-                                    )
-                                state_in = fwd_out["state_out"]
-                            batches.append(output_batch)
-                            obs = new_obs
-                            tstep += 1
-
-                        # convert the list of dicts to dict of lists
-                        batch = tree.map_structure(lambda *x: list(x), *batches)
-                        # convert dict of lists to dict of tensors
-                        fwd_in = {
-                            k: convert_to_torch_tensor(np.array(v))
-                            for k, v in batch.items()
-                        }
-                        if lstm:
-                            fwd_in["state_in"] = output_states
-                            fwd_in[SampleBatch.SEQ_LENS] = torch.Tensor([1] * 10)
-
-                        # forward train
-                        # before training make sure module is on the right device and in
-                        # training mode
-                        module.to("cpu")
-                        module.train()
-                        fwd_out = module.forward_train(fwd_in)
-                        loss = get_ppo_loss(fwd_in, fwd_out)
-                        loss.backward()
-
-                        # check that all neural net parameters have gradients
-                        for param in module.parameters():
-                            self.assertIsNotNone(param.grad)
-
 
 if __name__ == "__main__":
     import pytest

From a174623a324187626a37551bc1834b1585b0c861 Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Wed, 21 Dec 2022 16:51:27 +0100
Subject: [PATCH 12/14] lstm fixed, tests working

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 rllib/BUILD                                      |  7 +++++++
 rllib/algorithms/ppo/tests/test_ppo_rl_module.py | 14 ++++++++------
 rllib/core/rl_module/encoder.py                  |  8 ++++++--
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/rllib/BUILD b/rllib/BUILD
index 92b8884efea4..cbf59391c52c 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -1095,6 +1095,13 @@ py_test(
     srcs = ["algorithms/ppo/tests/test_ppo_with_rl_module.py"]
 )
 
+py_test(
+    name = "test_ppo_rl_module",
+    tags = ["team:rllib", "algorithms_dir"],
+    size = "large",
+    srcs = ["algorithms/ppo/tests/test_ppo_rl_module.py"]
+)
+
 # PPO Reproducibility
 py_test(
     name = "test_repro_ppo",
diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
index 7adde54cfffd..927c0c9c7665 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
@@ -6,8 +6,11 @@
 import tree
 
 from ray.rllib import SampleBatch
-from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule, \
-    get_ppo_loss, PPOModuleConfig
+from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import (
+    PPOTorchRLModule,
+    get_ppo_loss,
+    PPOModuleConfig,
+)
 from ray.rllib.core.rl_module.encoder import FCConfig, IdentityConfig, LSTMConfig
 from ray.rllib.utils.numpy import convert_to_numpy
 from ray.rllib.utils.torch_utils import convert_to_torch_tensor
@@ -26,8 +29,9 @@ def get_expected_model_config(env, lstm, shared_encoder) -> PPOModuleConfig:
     Returns:
          A PPOModuleConfig containing the relevant configs to build PPORLModule
     """
-    assert len(env.observation_space.shape) == 1, "No multidimensional obs space " \
-                                                  "supported."
+    assert len(env.observation_space.shape) == 1, (
+        "No multidimensional obs space " "supported."
+    )
     obs_dim = env.observation_space.shape[0]
 
     if shared_encoder:
@@ -131,7 +135,6 @@ def test_rollouts(self):
                         elif fwd_fn == "forward_inference":
                             module.forward_inference(batch)
 
-
     def test_forward_train(self):
         # TODO: Add BreakoutNoFrameskip-v4 to cover a 3D obs space
         for env_name in ["CartPole-v1", "Pendulum-v1"]:
@@ -224,4 +227,3 @@ def test_forward_train(self):
                         for param in module.parameters():
                             pass
                             self.assertIsNotNone(param.grad)
-
diff --git a/rllib/core/rl_module/encoder.py b/rllib/core/rl_module/encoder.py
index 7793b3d2c7d2..729a973fb76e 100644
--- a/rllib/core/rl_module/encoder.py
+++ b/rllib/core/rl_module/encoder.py
@@ -157,8 +157,12 @@ def output_spec(self):
             {
                 ENCODER_OUT: TorchTensorSpec("bxt, h", h=config.output_dim),
                 "state_out": {
-                    "h": TorchTensorSpec("b, h", h=config.hidden_dim),
-                    "c": TorchTensorSpec("b, h", h=config.hidden_dim),
+                    "h": TorchTensorSpec(
+                        "b, l, h", h=config.hidden_dim, l=config.num_layers
+                    ),
+                    "c": TorchTensorSpec(
+                        "b, l, h", h=config.hidden_dim, l=config.num_layers
+                    ),
                 },
             }
         )

From 3a4ea0122879738332eb8b174c5d1ab86c6ef13a Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Wed, 21 Dec 2022 16:54:48 +0100
Subject: [PATCH 13/14] add state out

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 .../algorithms/ppo/tests/test_ppo_rl_module.py | 18 ++++++++++++------
 rllib/core/rl_module/encoder.py                |  5 +++--
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
index 927c0c9c7665..77f2894fa434 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
@@ -11,7 +11,13 @@
     get_ppo_loss,
     PPOModuleConfig,
 )
-from ray.rllib.core.rl_module.encoder import FCConfig, IdentityConfig, LSTMConfig
+from ray.rllib.core.rl_module.encoder import (
+    FCConfig,
+    IdentityConfig,
+    LSTMConfig,
+    STATE_IN,
+    STATE_OUT,
+)
 from ray.rllib.utils.numpy import convert_to_numpy
 from ray.rllib.utils.torch_utils import convert_to_torch_tensor
 
@@ -127,7 +133,7 @@ def test_rollouts(self):
                             state_in = tree.map_structure(
                                 lambda x: x[None], convert_to_torch_tensor(state_in)
                             )
-                            batch["state_in"] = state_in
+                            batch[STATE_IN] = state_in
                             batch["seq_lens"] = torch.Tensor([1])
 
                         if fwd_fn == "forward_exploration":
@@ -169,7 +175,7 @@ def test_forward_train(self):
                             if lstm:
                                 input_batch = {
                                     SampleBatch.OBS: convert_to_torch_tensor(obs)[None],
-                                    "state_in": state_in,
+                                    STATE_IN: state_in,
                                     SampleBatch.SEQ_LENS: np.array([1]),
                                 }
                             else:
@@ -189,7 +195,7 @@ def test_forward_train(self):
                                 SampleBatch.DONES: np.array(done),
                             }
                             if lstm:
-                                assert "state_out" in fwd_out
+                                assert STATE_OUT in fwd_out
                                 if tstep > 0:  # First states are already added
 
                                     # Extend nested batches of states
@@ -198,7 +204,7 @@ def test_forward_train(self):
                                         output_states,
                                         state_in,
                                     )
-                                state_in = fwd_out["state_out"]
+                                state_in = fwd_out[STATE_OUT]
                             batches.append(output_batch)
                             obs = new_obs
                             tstep += 1
@@ -211,7 +217,7 @@ def test_forward_train(self):
                             for k, v in batch.items()
                         }
                         if lstm:
-                            fwd_in["state_in"] = output_states
+                            fwd_in[STATE_IN] = output_states
                             fwd_in[SampleBatch.SEQ_LENS] = torch.Tensor([1] * 10)
 
                         # forward train
diff --git a/rllib/core/rl_module/encoder.py b/rllib/core/rl_module/encoder.py
index 729a973fb76e..ceb9cd82f85a 100644
--- a/rllib/core/rl_module/encoder.py
+++ b/rllib/core/rl_module/encoder.py
@@ -15,6 +15,7 @@
 
 ENCODER_OUT = "encoder_out"
 STATE_IN = "state_in"
+STATE_OUT = "state_out"
 
 
 @dataclass
@@ -156,7 +157,7 @@ def output_spec(self):
         return ModelSpec(
             {
                 ENCODER_OUT: TorchTensorSpec("bxt, h", h=config.output_dim),
-                "state_out": {
+                STATE_OUT: {
                     "h": TorchTensorSpec(
                         "b, l, h", h=config.hidden_dim, l=config.num_layers
                     ),
@@ -187,7 +188,7 @@ def _forward(self, input_dict: SampleBatch):
 
         return {
             ENCODER_OUT: x,
-            "state_out": tree.map_structure(lambda x: x.transpose(0, 1), states_o),
+            STATE_OUT: tree.map_structure(lambda x: x.transpose(0, 1), states_o),
         }
 
 

From 5ee9a4d6042fafb3754a3b40fa3e69cd69f57786 Mon Sep 17 00:00:00 2001
From: Artur Niederfahrenhorst <artur@anyscale.com>
Date: Wed, 21 Dec 2022 17:30:09 +0100
Subject: [PATCH 14/14] add __main__ to test

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
---
 rllib/algorithms/ppo/tests/test_ppo_rl_module.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
index 77f2894fa434..f13790046828 100644
--- a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
+++ b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py
@@ -233,3 +233,10 @@ def test_forward_train(self):
                         for param in module.parameters():
                             pass
                             self.assertIsNotNone(param.grad)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+
+    sys.exit(pytest.main(["-v", __file__]))