From 57e79f9a759d5ad1a4894cf8acb58bf7088778f1 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 16 Nov 2023 19:25:10 +0100
Subject: [PATCH 01/15] wip

Signed-off-by: sven1977
---
 rllib/connectors/connector_context_v2.py | 66 +++++
 rllib/connectors/connector_pipeline_v2.py | 259 ++++++++++++++++++
 rllib/connectors/connector_v2.py | 93 +++++++
 rllib/connectors/env_to_module/__init__.py | 5 +
 .../env_to_module/default_env_to_module.py | 69 +++++
 rllib/connectors/input_output_types.py | 75 +++++
 rllib/connectors/learner/__init__.py | 0
 .../learner/default_learner_connector.py | 212 ++++++++++++++
 rllib/connectors/module_to_env/__init__.py | 0
 .../module_to_env/default_module_to_env.py | 95 +++++++
 .../tests/test_from_module_connectors.py | 106 +++++++
 11 files changed, 980 insertions(+)
 create mode 100644 rllib/connectors/connector_context_v2.py
 create mode 100644 rllib/connectors/connector_pipeline_v2.py
 create mode 100644 rllib/connectors/connector_v2.py
 create mode 100644 rllib/connectors/env_to_module/__init__.py
 create mode 100644 rllib/connectors/env_to_module/default_env_to_module.py
 create mode 100644 rllib/connectors/input_output_types.py
 create mode 100644 rllib/connectors/learner/__init__.py
 create mode 100644 rllib/connectors/learner/default_learner_connector.py
 create mode 100644 rllib/connectors/module_to_env/__init__.py
 create mode 100644 rllib/connectors/module_to_env/default_module_to_env.py
 create mode 100644 rllib/connectors/tests/test_from_module_connectors.py

diff --git a/rllib/connectors/connector_context_v2.py b/rllib/connectors/connector_context_v2.py
new file mode 100644
index 0000000000000..628691a9d28f9
--- /dev/null
+++ b/rllib/connectors/connector_context_v2.py
@@ -0,0 +1,66 @@
+from dataclasses import dataclass
+from typing import Any, Optional
+
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.utils.typing import AgentID, EnvType
+from ray.util.annotations import PublicAPI
+
+
+@PublicAPI(stability="alpha")
+@dataclass
+class ConnectorContextV2:
+    """Information that the pieces of a connector pipeline use to communicate.
+
+    ConnectorContextV2 will be passed to each connector (pipeline) call.
+    It may also contain references to the RLModule used, the Env, as well as whether
+    `explore` is True or False (whether forward_exploration or forward_inference was
+    used).
+
+    TODO: Describe use cases, e.g.
+    - state outs need to be fed back as state ins,
+    unless we would like to temporarily store the states in the episode.
+    - agent_to_policy_mappings need to be stored as they might be stochastic. Then the
+    to_env pipeline can properly map back from module (formerly known as policy) IDs
+    to agent IDs.
+
+    Attributes:
+        env: The Env object used to reset/step through in the current Env -> Module
+            setup.
+        rl_module: The RLModule used for forward passes in the current Env -> Module
+            setup.
+        explore: Whether `explore` is currently on. Per convention, if True, the
+            RLModule's `forward_exploration` method should be called, if False, the
+            EnvRunner should call `forward_inference` instead.
+        agent_id: The (optional) current agent ID that the connector should be
+            creating/extracting data for.
+        episode_index: The (optional) index within the list of SingleAgentEpisodes or
+            MultiAgentEpisodes, which each connector is given in a call, that belongs
+            to the given agent_id.
+        data: Optional additional context data that needs to be exchanged between
+            different Connector pieces and -pipelines.
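+
+    Example (an illustrative sketch only; assumes an already-built RLModule
+    `module` and a gym environment `env`, neither of which this snippet creates;
+    note that `data` must be passed explicitly as a dict for the helper methods
+    below to work):
+
+    .. code-block:: python
+
+        ctx = ConnectorContextV2(env=env, rl_module=module, explore=True, data={})
+        # Stash a value for a downstream connector piece (or pipeline) to read.
+        ctx.add_data("agent_to_module_map", {"agent_0": "module_0"})
+        assert ctx.get_data("agent_to_module_map") == {"agent_0": "module_0"}
+        ctx.del_data("agent_to_module_map")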
+ """ + + env: Optional[EnvType] = None + rl_module: Optional[RLModule] = None + explore: Optional[bool] = None + data: Optional[Any] = None + + # TODO (sven): Do these have to be here?? + agent_id: Optional[AgentID] = None + episode_index: Optional[int] = None + + def add_data(self, key, value): + assert key not in self.data + self.data[key] = value + + def get_data(self, key): + assert key in self.data + return self.data[key] + + def override_data(self, key, value): + assert key in self.data + self.data[key] = value + + def del_data(self, key): + assert key in self.data + del self.data[key] diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py new file mode 100644 index 0000000000000..f5c6c1c181b52 --- /dev/null +++ b/rllib/connectors/connector_pipeline_v2.py @@ -0,0 +1,259 @@ +from collections import defaultdict +import logging +from typing import Any, List, Optional, Union + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule +from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI +from ray.util.timer import _Timer + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="alpha") +class ConnectorPipelineV2(ConnectorV2): + """Utility class for quick manipulation of a connector pipeline.""" + + def __init__( + self, + *, + ctx: ConnectorContextV2, + connectors: Optional[List[ConnectorV2]] = None, + **kwargs, + ): + super().__init__(ctx=ctx, **kwargs) + + self.connectors = connectors or [] + self._fix_input_output_types() + + self.timers = defaultdict(_Timer) + + def remove(self, name: str): + """Remove a connector piece by . + + Args: + name: The name of the connector piece to be removed from the pipeline. + """ + idx = -1 + for i, c in enumerate(self.connectors): + if c.__class__.__name__ == name: + idx = i + break + if idx >= 0: + del self.connectors[idx] + self._fix_input_output_types() + logger.info(f"Removed connector {name} from {self.__class__.__name__}.") + else: + logger.warning(f"Trying to remove a non-existent connector {name}.") + + def insert_before(self, name: str, connector: ConnectorV2): + """Insert a new connector before connector + + Args: + name: name of the connector before which a new connector + will get inserted. + connector: a new connector to be inserted. + """ + idx = -1 + for idx, c in enumerate(self.connectors): + if c.__class__.__name__ == name: + break + if idx < 0: + raise ValueError(f"Can not find connector {name}") + self.connectors.insert(idx, connector) + self._fix_input_output_types() + + logger.info( + f"Inserted {connector.__class__.__name__} before {name} " + f"to {self.__class__.__name__}." + ) + + def insert_after(self, name: str, connector: ConnectorV2): + """Insert a new connector after connector + + Args: + name: name of the connector after which a new connector + will get inserted. + connector: a new connector to be inserted. 
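+
+        Example (sketch; `MyFilter` and `MyOtherPiece` are stand-in user
+        connector classes, not part of this PR):
+
+        .. code-block:: python
+
+            pipeline = ConnectorPipelineV2(ctx=ctx, connectors=[MyFilter(ctx=ctx)])
+            # Runs MyFilter first, then MyOtherPiece.
+            pipeline.insert_after("MyFilter", MyOtherPiece(ctx=ctx))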
+ """ + idx = -1 + for idx, c in enumerate(self.connectors): + if c.__class__.__name__ == name: + break + if idx < 0: + raise ValueError(f"Can not find connector {name}") + self.connectors.insert(idx + 1, connector) + self._fix_input_output_types() + + logger.info( + f"Inserted {connector.__class__.__name__} after {name} " + f"to {self.__class__.__name__}." + ) + + def prepend(self, connector: ConnectorV2): + """Append a new connector at the beginning of a connector pipeline. + + Args: + connector: a new connector to be appended. + """ + self.connectors.insert(0, connector) + self._fix_input_output_types() + + logger.info( + f"Added {connector.__class__.__name__} to the beginning of " + f"{self.__class__.__name__}." + ) + + def append(self, connector: ConnectorV2): + """Append a new connector at the end of a connector pipeline. + + Args: + connector: a new connector to be appended. + """ + self.connectors.append(connector) + self._fix_input_output_types() + + logger.info( + f"Added {connector.__class__.__name__} to the end of " + f"{self.__class__.__name__}." + ) + + def __call__( + self, + input_: Any, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + ) -> Any: + ret = input_ + for connector in self.connectors: + timer = self.timers[str(connector)] + with timer: + ret = connector(input_=ret, episodes=episodes, ctx=ctx) + return ret + + # @override(ConnectorV2) + # def serialize(self): + # children = [] + # for c in self.connectors: + # state = c.serialize() + # assert isinstance(state, tuple) and len(state) == 2, ( + # "Serialized connector state must be in the format of " + # f"Tuple[name: str, params: Any]. Instead we got {state}" + # f"for connector {c.__name__}." + # ) + # children.append(state) + # return ConnectorPipelineV2.__name__, children + # + # @override(ConnectorV2) + # @staticmethod + # def from_state(ctx: ConnectorContextV2, params: List[Any]): + # assert ( + # type(params) == list + # ), "AgentConnectorPipeline takes a list of connector params." + # connectors = [] + # for state in params: + # try: + # name, subparams = state + # connectors.append(get_connector(name, ctx, subparams)) + # except Exception as e: + # logger.error(f"Failed to de-serialize connector state: {state}") + # raise e + # return ConnectorPipelineV2(ctx, connectors) + + def __str__(self, indentation: int = 0): + return "\n".join( + [" " * indentation + self.__class__.__name__] + + [c.__str__(indentation + 4) for c in self.connectors] + ) + + def __getitem__(self, key: Union[str, int, type]): + """Returns a list of connectors that fit 'key'. + + If key is a number n, we return a list with the nth element of this pipeline. + If key is a Connector class or a string matching the class name of a + Connector class, we return a list of all connectors in this pipeline matching + the specified class. + + Args: + key: The key to index by + + Returns: The Connector at index `key`. + """ + # In case key is a class + if not isinstance(key, str): + if isinstance(key, slice): + raise NotImplementedError( + "Slicing of ConnectorPipeline is currently not supported." 
+ ) + elif isinstance(key, int): + return [self.connectors[key]] + elif isinstance(key, type): + results = [] + for c in self.connectors: + if issubclass(c.__class__, key): + results.append(c) + return results + else: + raise NotImplementedError( + "Indexing by {} is currently not supported.".format(type(key)) + ) + + results = [] + for c in self.connectors: + if c.__class__.__name__ == key: + results.append(c) + + return results + + def _fix_input_output_types(self): + if len(self.connectors) > 0: + self.input_type = self.connectors[0].input_type + self.output_type = self.connectors[-1].output_type + else: + self.input_type = None + self.output_type = None + + +class EnvToModulePipeline(ConnectorPipelineV2): + def __init__( + self, *, ctx, connectors: Optional[List[ConnectorV2]] = None, **kwargs + ): + super().__init__(ctx=ctx, connectors=connectors, **kwargs) + # Add the default final connector piece for env-to-module pipelines: + # Extracting last obs from episodes and add them to input, iff this has not + # happened in any connector piece in this pipeline before. + if ( + len(self.connectors) == 0 + or type(self.connectors[-1]) is not DefaultEnvToModule + ): + self.append(DefaultEnvToModule(ctx=ctx)) + + def __call__(self, *, input_: Optional[Any] = None, episodes, ctx, **kwargs): + # Make sure user does not necessarily send initial input into this pipeline. + # Might just be empty and to be populated from `episodes`. + return super().__call__( + input_=input_ or {}, + episodes=episodes, + ctx=ctx, + **kwargs, + ) + + +class ModuleToEnvPipeline(ConnectorPipelineV2): + def __init__( + self, *, ctx, connectors: Optional[List[ConnectorV2]] = None, **kwargs + ): + super().__init__(ctx=ctx, connectors=connectors, **kwargs) + + # Add the default final connector piece for env-to-module pipelines: + # Sampling actions from action_dist_inputs and add them to input, iff this has + # not happened in any connector piece in this pipeline before. + if ( + len(self.connectors) == 0 + or type(self.connectors[-1]) is not DefaultModuleToEnv + ): + self.append(DefaultModuleToEnv(ctx=ctx)) diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py new file mode 100644 index 0000000000000..ba18a422b36e4 --- /dev/null +++ b/rllib/connectors/connector_v2.py @@ -0,0 +1,93 @@ +import abc +from typing import Any, List, Tuple + +from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.connectors.input_output_types import INPUT_OUTPUT_TYPES +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class ConnectorV2(abc.ABC): + """Connector base class. + + A connector performs a transformation step, either on envrionment data before it + gets to the RLModule, or on RLModule output before it is sent back to the + environment. + + Connectors may be training-aware, for example, behave slightly differently + during training and inference. + + All connectors are required to be serializable and implement the `serialize()` method. + """ + + # Set these in ALL subclasses. + input_type = INPUT_OUTPUT_TYPES.DATA + output_type = INPUT_OUTPUT_TYPES.DATA + + def __init__(self, *, ctx: ConnectorContextV2, **kwargs): + """Initializes a ConnectorV2 instance. + + Args: + ctx: The current ConnectorContextV2. + **kwargs: Forward API-compatibility kwargs. 
+ """ + self.ctx = ctx + + @abc.abstractmethod + def __call__( + self, + *, + input_: Any, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + **kwargs, + ) -> Any: + """Method for transforming input data into output data. + + Args: + input_: The input data abiding to `self.input_type` to be transformed by + this connector. Transformations might either be done in-place or a new + structure may be returned that matches `self.output_type`. + episodes: The list of SingleAgentEpisode or MultiAgentEpisode objects, + each corresponding to one slot in the vector env. Note that episodes + should always be considered read-only and not be altered. + ctx: The ConnectorContext that might be used to pass along other important + information in between connector pieces (even across pipelines). + kwargs: Forward API-compatibility kwargs. + + Returns: + The transformed connector output abiding to `self.output_type`. + """ + + def __str__(self, indentation: int = 0): + return " " * indentation + self.__class__.__name__ + + # @abc.abstractmethod + # def serialize(self) -> Tuple[str, Any]: + # """Serialize a connector into a JSON serializable Tuple. + + # `serialize()` is required, so that all Connectors are serializable. + + # Returns: + # A tuple of connector's name and its serialized states. + # String should match the name used to register the connector, + # while state can be any single data structure that contains the + # serialized state of the connector. If a connector is stateless, + # state can simply be None. + # """ + + # @staticmethod + # @abc.abstractmethod + # def from_state(ctx: ConnectorContextV2, params: Any) -> "ConnectorV2": + # """De-serialize a JSON params back into a Connector. + + # `from_state()` is required, so that all Connectors are serializable. + + # Args: + # ctx: ConnectorContextV2 for constructing this connector. + # params: Serialized states of the connector to be recovered. + + # Returns: + # De-serialized connector. + # """ diff --git a/rllib/connectors/env_to_module/__init__.py b/rllib/connectors/env_to_module/__init__.py new file mode 100644 index 0000000000000..b86c2f9cb002f --- /dev/null +++ b/rllib/connectors/env_to_module/__init__.py @@ -0,0 +1,5 @@ +from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule + +__all__ = [ + "DefaultEnvToModule", +] diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py new file mode 100644 index 0000000000000..0b9eb2d8669a5 --- /dev/null +++ b/rllib/connectors/env_to_module/default_env_to_module.py @@ -0,0 +1,69 @@ +from typing import Any, List + +import numpy as np + +import tree +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.core.models.base import STATE_IN, STATE_OUT +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.spaces.space_utils import batch +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class DefaultEnvToModule(ConnectorV2): + """Default env-to-module-connector always in the pipeline at the very end. + + Makes sure that there is at least an observation (the most recent one) for each + agent as well as a state - in case the RLModule is recurrent. Doesn't do anything + in case other pieces in the pipeline already take care of populating these fields. 
+ + TODO: Generalize to MultiAgentEpisodes. + """ + + @override(ConnectorV2) + def __call__( + self, + input_: Any, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + **kwargs, + ): + # If obs are not already part of the input, add the most recent ones (from all + # single-agent episodes). + if SampleBatch.OBS not in input_: + observations = [] + for episode in episodes: + # Make sure, we have at least one observation in the episode. + assert len(episode.observations) > 0 + observations.append(episode.observations[-1]) + input_[SampleBatch.OBS] = batch(observations) + + # If our module is recurrent: + # - Add the most recent states to the inputs. + # - Make all inputs have T=1. + if ctx.rl_module.is_stateful(): + states = [] + for episode in episodes: + # Make sure, we have at least one observation in the episode. + assert episode.observations + + # TODO: Generalize to MultiAgentEpisodes. + # Episode just started, get initial state from our RLModule. + if len(episode) == 0: + state = ctx.rl_module.get_initial_state() + else: + state = episode.extra_model_outputs[STATE_OUT][-1] + states.append(state) + + # Make all other inputs have an additional T=1 axis. + input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) + + # Batch states (from list of individual vector sub-env states). + # Note that state ins should NOT have the extra time dimension. + input_[STATE_IN] = batch(states) + + return input_ diff --git a/rllib/connectors/input_output_types.py b/rllib/connectors/input_output_types.py new file mode 100644 index 0000000000000..da9343c040678 --- /dev/null +++ b/rllib/connectors/input_output_types.py @@ -0,0 +1,75 @@ +from enum import Enum + + +class INPUT_OUTPUT_TYPES(Enum): + """Definitions of possible datatypes being processed by individual connectors. + + TODO: Make sure this is valid: + Each connector will always receive a list of Episodes (MultiAgentEpisodes or + SingleAgentEpisodes, depending on the setup and EnvRunner used). In addition, the + output of the previous connector (or an empty dict at the beginnnig) will be + received. + An IntoModule connector pipeline should eventually output a dict mapping module IDs + to SampleBatches + + Typical env-module-env pipeline: + env.step(List[Data]) -> List[MultiAgentEpisode] + + connector: auto-agent-extraction: List[MultiAgentEpisode] -> dict[AgentID, Data] + connector: auto-broadcast: Data -> Data (legacy postprocessing and filtering) + under the hood: dict[AgentID, Data] -> dict[AgentID, Data] + connector: auto-policy-mapping: dict[AgentID, Data] -> dict[ModuleID, Data] + + module.forward_exploration() -> dict[ModuleID, Data] + + connector: auto-action-sampling: dict[ModuleID, Data] -> dict[ModuleID, Data] + connector: action-clipping: Data -> Data + under the hood: dict[ModuleID, Data] -> dict[ModuleID, Data] + connector: auto-policy-unmapping: dict[ModuleID, Data] -> dict[AgentID, Data] + (using information stored in connector ctx) + connector: auto-action-sorting (using information stored in connector ctx): + dict[AgentID, Data] -> List[Data] + + env.step(List[Data]) ... 
repeats
+
+    Typical training pipeline:
+
+
+    Default env-module-env pipeline, picked by RLlib if no connector is defined by
+    the user AND the module is an RNN:
+    env.step(List[Data]) -> List[MultiAgentEpisode]
+
+    connector: auto-agent-extraction: List[MultiAgentEpisode] -> dict[AgentID, Data]
+    connector: auto-policy-mapping: dict[AgentID, Data] -> dict[ModuleID, Data]
+    connector: auto-state-handling: dict[ModuleID, Data] ->
+        dict[ModuleID, Data + state] (using information stored in connector ctx)
+
+    module.forward_exploration() -> dict[ModuleID, Data + state]
+
+    connector: auto-state-handling: dict[ModuleID, Data + state] ->
+        dict[ModuleID, Data] (state was stored in ctx)
+    connector: auto-policy-unmapping: dict[ModuleID, Data] ->
+        dict[AgentID, Data] (using information stored in connector ctx)
+    connector: auto-action-sorting (using information stored in connector ctx):
+        dict[AgentID, Data] -> List[Data]
+
+    env.step(List[Data]) ... repeats
+    """
+
+    # Normally, after `env.step()`, we have a list (vector env) of MultiAgentEpisodes
+    # as a starting point.
+    LIST_OF_MULTI_AGENT_EPISODES = 0
+    # In the simplified case, there might be a list of SingleAgentEpisodes, instead.
+    LIST_OF_SINGLE_AGENT_EPISODES = 1
+
+    # From each MultiAgentEpisode, we might extract a dict, mapping agent IDs to data.
+    LIST_OF_DICTS_MAPPING_AGENT_IDS_TO_DATA = 10
+    # Eventually boiling down to simply one dict mapping agent IDs to data.
+    #
+    DICT_MAPPING_AGENT_IDS_TO_DATA = 11
+
+    # Right after the module's forward pass, we usually have a single dict mapping
+    # Module IDs to data (model outputs).
+    DICT_MAPPING_MODULE_IDS_TO_DATA = 12
+
+    DATA = 13  # Note: must be a unique value; a repeated one would create an enum alias.
diff --git a/rllib/connectors/learner/__init__.py b/rllib/connectors/learner/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/rllib/connectors/learner/default_learner_connector.py b/rllib/connectors/learner/default_learner_connector.py
new file mode 100644
index 0000000000000..592faa711a4a6
--- /dev/null
+++ b/rllib/connectors/learner/default_learner_connector.py
@@ -0,0 +1,212 @@
+from functools import partial
+from typing import Any
+
+import numpy as np
+import tree
+
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
+from ray.rllib.core.models.base import STATE_IN, STATE_OUT
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.utils.numpy import convert_to_numpy
+
+
+class DefaultLearnerConnector(ConnectorV2):
+    """Connector added by default by RLlib to the end of the learner connector pipeline.
+
+    If provided with `episodes` data, this connector piece makes sure that the final
+    train batch going into the RLModule for updating (`forward_train()` call) contains
+    at the minimum:
+    - Observations: From all episodes under the SampleBatch.OBS key.
+    - Actions, rewards, terminal/truncation flags: From all episodes under the
+      respective keys.
+    - All data inside the episodes' `extra_model_outs` property, e.g. action logp and
+      action probs.
+    - States: If the RLModule is stateful, the episodes' STATE_OUTS will be extracted
+    and restructured under a new STATE_IN key in such a way that the resulting STATE_IN
+    batch has the shape (B', ...). Here, B' is the sum of splits we have to do over
+    the given episodes, such that each chunk is at most `max_seq_len` long (T-axis).
+    Also, all other data will be properly reshaped into (B, T=max_seq_len, ...) and
+    will be zero-padded, if necessary.
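+
+    Padding sketch (illustrative numbers only): with max_seq_len=4 and two
+    episodes of lengths 5 and 2, the data is split into row-chunks of lengths
+    [4, 1] and [2], then zero-padded to T=4, giving B'=3:
+
+    .. code-block:: python
+
+        # loss_mask (B'=3, T=4); True = valid timestep, False = zero-padding:
+        # [[ True,  True,  True,  True],
+        #  [ True, False, False, False],
+        #  [ True,  True, False, False]]
+        # seq_lens: [4, 1, 2]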
+ + If the user wants to customize their own data under the given keys (e.g. obs, + actions, ...), they can extract from the episodes or recompute from `input_` + their own data and store it under those keys (in `input_`). In such a case, this + connector will not touch the data under these keys. + """ + + def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs): + # If episodes are provided, extract the essential data from them, but only if + # this data is not present yet in `input_`. + if not episodes: + return input_ + + # Get data dicts for all episodes. + data_dicts = [episode.get_data_dict() for episode in episodes] + + state_in = None + T = ctx.rl_module.config.model_config_dict.get("max_seq_len") + + # Special handling of STATE_OUT/STATE_IN keys: + if ctx.rl_module.is_stateful() and STATE_IN not in input_: + if T is None: + raise ValueError( + "You are using a stateful RLModule and are not providing custom " + f"'{STATE_IN}' data through your connector(s)! Therefore, you need " + "to provide the 'max_seq_len' key inside your model config dict. " + "You can set this dict and/or override keys in it via " + "`config.training(model={'max_seq_len': x})`." + ) + # Get model init state. + init_state = convert_to_numpy(ctx.rl_module.get_initial_state()) + # Get STATE_OUTs for all episodes and only keep those (as STATE_INs) that + # are located at the `max_seq_len` edges (state inputs to RNNs only have a + # B-axis, no T-axis). + state_ins = [] + for episode, data_dict in zip(episodes, data_dicts): + # Remove state outs (should not be part of the T-axis rearrangements). + state_outs = data_dict.pop(STATE_OUT) + state_ins.append( + tree.map_structure( + # [::T] = only keep every Tth (max_seq_len) state in. + # [:-1] = shift state outs by one (ignore very last state out, but + # therefore add the init state at the beginning). + lambda i, o: np.concatenate([[i], o[:-1]])[::T], + ( + # Episode has a (reset) beginning -> Prepend initial state. + init_state + if episode.t_started == 0 + # Episode starts somewhere in the middle (is a cut continuation + # chunk) -> Use previous chunk's last STATE_OUT as initial state. + else episode.get_extra_model_outputs( + key=STATE_OUT, indices=-len(episode) - 1 + ) + ), + state_outs, + ) + ) + # Concatenate the individual episodes' state ins. + state_in = tree.map_structure(lambda *s: np.concatenate(s), *state_ins) + + # Before adding anything else to the `input_`, add the time axis to existing + # data. + input_ = tree.map_structure( + lambda s: split_and_pad_single_record(s, episodes, T=T), + input_, + ) + + # Set the reduce function for all the data we might still have to extract + # from our list of episodes. This function takes a list of data (e.g. obs) + # with each item in the list representing one episode and properly + # splits along the time axis and zero-pads if necessary (based on + # max_seq_len). + reduce_fn = partial(split_and_pad, T=T) + + # No stateful module, normal batch (w/o T-axis or zero-padding). + else: + # Set the reduce function for all the data we might still have to extract + # from our list of episodes. Simply concatenate the data from the different + # episodes along the batch axis (axis=0). + reduce_fn = np.concatenate + + # Extract all data from the episodes, if not already in `input_`. 
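+        # (Illustrative, with assumed sizes: for T=4 and two episodes of
+        # lengths 5 and 2, the stateful path's `split_and_pad` yields arrays of
+        # shape (3, 4, ...), while the stateless `np.concatenate` path yields
+        # shape (7, ...).)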
+ for key in [ + SampleBatch.OBS, + SampleBatch.ACTIONS, + SampleBatch.REWARDS, + SampleBatch.TERMINATEDS, + SampleBatch.TRUNCATEDS, + SampleBatch.T, # TODO: remove (normally not needed in train batch) + *episodes[0].extra_model_outputs.keys(), + ]: + if key not in input_ and key != STATE_OUT: + # Concatenate everything together (along B-axis=0). + input_[key] = tree.map_structure( + lambda *s: reduce_fn(s), + *[d[key] for d in data_dicts], + ) + + # Infos (always as lists). + # TODO:uncomment if SampleBatch.INFOS not in input_: + # input_[SampleBatch.INFOS] = sum( + # [d[SampleBatch.INFOS] for d in data_dicts], + # [], + # ) + + if ctx.rl_module.is_stateful(): + # Now that all "normal" fields are time-dim'd and zero-padded, add + # the STATE_IN column to `input_`. + input_[STATE_IN] = state_in + # Create the zero-padding loss mask. + ( + input_["loss_mask"], + input_[SampleBatch.SEQ_LENS], + ) = create_mask_and_seq_lens( + episode_lens=[len(episode) for episode in episodes], + T=T, + ) + + return input_ + + +def split_and_pad(episodes_data, T): + all_chunks = [] + + for data in episodes_data: + num_chunks = int(np.ceil(data.shape[0] / T)) + + for i in range(num_chunks): + start_index = i * T + end_index = start_index + T + + # Extract the chunk + chunk = data[start_index:end_index] + + # Pad the chunk if it's shorter than T + if chunk.shape[0] < T: + padding_shape = [(0, T - chunk.shape[0])] + [ + (0, 0) for _ in range(chunk.ndim - 1) + ] + chunk = np.pad(chunk, pad_width=padding_shape, mode="constant") + + all_chunks.append(chunk) + + # Combine all chunks into a single array + result = np.concatenate(all_chunks, axis=0) + + # Reshape the array to include the time dimension T + # The new shape should be (-1, T) + original dimensions (excluding the batch dimension) + result = result.reshape((-1, T) + result.shape[1:]) + + return result + + +def split_and_pad_single_record(data, episodes, T): + episodes_data = [] + idx = 0 + for episode in episodes: + len_ = len(episode) + episodes_data.append(data[idx : idx + len_]) + idx += len_ + return split_and_pad(episodes_data, T) + + +def create_mask_and_seq_lens(episode_lens, T): + mask = [] + seq_lens = [] + for episode_len in episode_lens: + len_ = min(episode_len, T) + seq_lens.append(len_) + row = [1] * len_ + [0] * (T - len_) + mask.append(row) + + # Handle sequence lengths greater than T. 
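+        # (E.g., episode_len=10 and T=4 -> three rows with seq_lens [4, 4, 2].)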
+ overflow = episode_len - T + while overflow > 0: + len_ = min(overflow, T) + seq_lens.append(len_) + extra_row = [1] * len_ + [0] * (T - len_) + mask.append(extra_row) + overflow -= T + + return np.array(mask, dtype=np.bool_), np.array(seq_lens, dtype=np.int32) diff --git a/rllib/connectors/module_to_env/__init__.py b/rllib/connectors/module_to_env/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py new file mode 100644 index 0000000000000..5bf0a2af0c8a4 --- /dev/null +++ b/rllib/connectors/module_to_env/default_module_to_env.py @@ -0,0 +1,95 @@ +from typing import Any + +import numpy as np +import tree # pip install dm_tree + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.core.models.base import STATE_OUT +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class DefaultModuleToEnv(ConnectorV2): + """A connector that samples actions given action dist. inputs and a dist. class. + + The connector will only sample from the distribution, if the ACTIONS key + cannot be found in the connector's input. Otherwise, it'll behave simply as pass + through (noop). If ACTIONS is not present, but ACTION_DIST_INPUTS are, will create + a distribution from the RLModule and sample from it (deterministically, if + we are not exploring, stochastically, if we are). + + input_type: INPUT_OUTPUT_TYPES.DICT_OF_MODULE_IDS_TO_DATA + Operates per RLModule as it will have to pull the action distribution from each + in order to sample actions if necessary. Searches for the ACTIONS and + ACTION_DIST_INPUTS keys in a module's outputs and - should ACTIONS not be found - + sample actions from the module's action distribution. + output_type: INPUT_OUTPUT_TYPES.DICT_OF_MODULE_IDS_TO_DATA (same as input: data in, + data out, however, data + out might contain an additional ACTIONS key if it was not previously present + in the input). + """ + + @override(ConnectorV2) + def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2) -> Any: + + # Loop through all modules that created some output. + # for mid in input_.keys(): + # sa_module = ctx.rl_module.get_module(module_id=mid) + + # If our RLModule is stateful, remove the T=1 axis from all model outputs + # (except the state outs, which never have this extra time axis). + if ctx.rl_module.is_stateful(): + state = input_.pop(STATE_OUT, None) + input_ = tree.map_structure(lambda s: np.squeeze(s, axis=1), input_) + if state: + input_[STATE_OUT] = state + + # ACTION_DIST_INPUTS field returned by `forward_exploration()` -> + # Create a distribution object. + action_dist = None + # The RLModule has already computed actions. + if ( + SampleBatch.ACTION_DIST_INPUTS in input_ + and SampleBatch.ACTION_LOGP not in input_ + ): + dist_inputs = input_[SampleBatch.ACTION_DIST_INPUTS] + if ctx.explore: + action_dist_class = ctx.rl_module.get_exploration_action_dist_cls() + else: + action_dist_class = ctx.rl_module.get_inference_action_dist_cls() + action_dist = action_dist_class.from_logits(dist_inputs) + if not ctx.explore: + action_dist = action_dist.to_deterministic() + + # If `forward_...()` returned actions, use them here as-is. 
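+        # (E.g., a module whose `forward_exploration()` already returns an
+        # "actions" key passes through unchanged here; one that only returns
+        # "action_dist_inputs" gets its actions sampled below.)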
+ if SampleBatch.ACTIONS in input_: + actions = input_[SampleBatch.ACTIONS] + # Otherwise, sample actions from the distribution. + else: + if action_dist is None: + raise KeyError( + "Your RLModule's `forward_[explore|inference]()` methods must " + f"return a dict with either the {SampleBatch.ACTIONS} key or " + f"the {SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + ) + actions = action_dist.sample() + input_[SampleBatch.ACTIONS] = actions + + # Compute action-logp and action-prob from distribution and add to + # output, if possible. + if action_dist is not None and SampleBatch.ACTION_LOGP not in input_: + input_[SampleBatch.ACTION_LOGP] = action_dist.logp(actions) + + return input_ + + # @override(Connector) + # def serialize(self): + # return ClipActions.__name__, None + + # @staticmethod + # TODO + # def from_state(ctx: ConnectorContext, params: Any): + # return ClipActions(ctx) diff --git a/rllib/connectors/tests/test_from_module_connectors.py b/rllib/connectors/tests/test_from_module_connectors.py new file mode 100644 index 0000000000000..ac0844ff46f0f --- /dev/null +++ b/rllib/connectors/tests/test_from_module_connectors.py @@ -0,0 +1,106 @@ +import unittest + +import gymnasium as gym +import numpy as np + +from ray.rllib.connectors.into_env.clip_actions import ClipActions +from ray.rllib.connectors.into_env.unsquash_actions import UnsquashActions +from ray.rllib.connectors.connector import ConnectorContextV2 +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.test_utils import check + + +class TestFromModuleConnectors(unittest.TestCase): + def test_connector_pipeline(self): + ctx = ConnectorContext() + connectors = [ConvertToNumpyConnector(ctx)] + pipeline = ActionConnectorPipeline(ctx, connectors) + name, params = pipeline.serialize() + restored = get_connector(name, ctx, params) + self.assertTrue(isinstance(restored, ActionConnectorPipeline)) + self.assertTrue(isinstance(restored.connectors[0], ConvertToNumpyConnector)) + # There should not be any timer yet + self.assertFalse(bool(pipeline.timers.values())) + pipeline(ActionConnectorDataType(0, 0, {}, ([1], [], None))) + # After a first input, there should be one timer + self.assertEquals(len(pipeline.timers.values()), 1) + + def test_clip_actions_connector(self): + ctx = ConnectorContextV2() + + connector = ClipActions( + action_space=gym.spaces.Box(low=0.0, high=6.0, shape=(1,)) + ) + + # name, params = connector.serialize() + # self.assertEqual(name, "ClipActions") + + # restored = get_connector(name, ctx, params) + # self.assertTrue(isinstance(restored, ClipActionsConnector)) + + for action in [8.8, 6.0, -0.2, 0.0, 5.9999, 3.2, 6.1]: + output = connector( + {SampleBatch.ACTIONS: np.array([action])}, + ctx, + ) + check(output[SampleBatch.ACTIONS], np.clip(action, 0.0, 6.0)) + + connector = ClipActions( + action_space=gym.spaces.Dict( + { + "a": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,)), + "b": gym.spaces.Discrete(3), + } + ) + ) + for action in [ + {"a": np.array([8.8, 8.9]), "b": 1}, + {"a": np.array([9.0, -1.0]), "b": 0}, + {"a": np.array([100.0, 200.0]), "b": 2}, + {"a": np.array([-1000, 0.0001]), "b": 2}, + {"a": np.array([0.4, 1.2]), "b": 0}, + {"a": np.array([1.0, -1.0]), "b": 1}, + ]: + output = connector({SampleBatch.ACTIONS: action}, ctx) + check( + output[SampleBatch.ACTIONS], + {"a": np.clip(action["a"], -1.0, 1.0), "b": action["b"]}, + ) + + def test_unsquash_actions_connector(self): + ctx = ConnectorContextV2() + + connector = UnsquashActions( + 
action_space=gym.spaces.Box(low=-2.0, high=6.0, shape=(2,))
+        )
+
+        # name, params = connector.serialize()
+        # self.assertEqual(name, "UnsquashActions")
+
+        # restored = get_connector(name, ctx, params)
+        # self.assertTrue(isinstance(restored, NormalizeActionsConnector))
+
+        for action in [
+            [1.8, 1.8],
+            [1.0, -1.0],
+            [-1.0, 1.1],
+            [0.0, 0.0],
+            [10.0, 0.5],
+            [0.5, -0.5],
+        ]:
+            action = np.array(action)
+            output = connector(
+                {SampleBatch.ACTIONS: action},
+                ctx,
+            )
+            check(
+                output[SampleBatch.ACTIONS],
+                np.clip((action + 1.0) * 4.0 - 2.0, -2.0, 6.0),
+            )
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+
+    sys.exit(pytest.main(["-v", __file__]))

From 99d9019b735258376235c85832514fe916081b0a Mon Sep 17 00:00:00 2001
From: sven1977
Date: Fri, 17 Nov 2023 11:47:52 +0100
Subject: [PATCH 02/15] wip

Signed-off-by: sven1977
---
 rllib/connectors/connector_context_v2.py | 16 +-
 rllib/connectors/connector_pipeline_v2.py | 82 +++------
 rllib/connectors/connector_v2.py | 102 +++++++-----
 .../env_to_module/default_env_to_module.py | 46 +++---
 .../env_to_module/env_to_module_pipeline.py | 47 ++++++
 .../learner/default_learner_connector.py | 148 ++++++-----------
 .../module_to_env/default_module_to_env.py | 68 ++++----
 .../module_to_env/module_to_env_pipeline.py | 27 ++++
 rllib/connectors/utils/__init__.py | 0
 rllib/connectors/utils/zero_padding.py | 135 ++++++++++++++++
 10 files changed, 413 insertions(+), 258 deletions(-)
 create mode 100644 rllib/connectors/env_to_module/env_to_module_pipeline.py
 create mode 100644 rllib/connectors/module_to_env/module_to_env_pipeline.py
 create mode 100644 rllib/connectors/utils/__init__.py
 create mode 100644 rllib/connectors/utils/zero_padding.py

diff --git a/rllib/connectors/connector_context_v2.py b/rllib/connectors/connector_context_v2.py
index 628691a9d28f9..fff114618dd15 100644
--- a/rllib/connectors/connector_context_v2.py
+++ b/rllib/connectors/connector_context_v2.py
@@ -24,13 +24,15 @@ class ConnectorContextV2:
     to agent IDs.
 
     Attributes:
-        env: The Env object used to reset/step through in the current Env -> Module
-            setup.
-        rl_module: The RLModule used for forward passes in the current Env -> Module
-            setup.
+        env: The Env object used to reset/step through in the current Env->Module
+            setup. This will be None in contexts used in a Learner connector pipeline.
+        rl_module: The RLModule used for either action computing forward passes
+            (`forward_exploration|inference()`) in the current Env->Module setup
+            or `forward_train()` calls in a Learner connector pipeline.
         explore: Whether `explore` is currently on. Per convention, if True, the
-            RLModule's `forward_exploration` method should be called, if False, the
-            EnvRunner should call `forward_inference` instead.
+            RLModule's `forward_exploration()` method should be called, if False, the
+            EnvRunner should call `forward_inference()` instead. Should be None inside
+            Learner connector pipelines.
         agent_id: The (optional) current agent ID that the connector should be
             creating/extracting data for.
         episode_index: The (optional) index within the list of SingleAgentEpisodes or
@@ -38,6 +40,8 @@ class ConnectorContextV2:
             to the given agent_id.
         data: Optional additional context data that needs to be exchanged between
             different Connector pieces and -pipelines.
+
+    TODO (sven): Maybe we should have the AlgorithmConfig here as well.
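+
+    Example (sketch): a Learner-side context would typically only carry the
+    RLModule, e.g. `ConnectorContextV2(rl_module=module, data={})`, with `env`
+    and `explore` both left as None.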
""" env: Optional[EnvType] = None diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index f5c6c1c181b52..f3d9c36508682 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -1,6 +1,6 @@ from collections import defaultdict import logging -from typing import Any, List, Optional, Union +from typing import Any, List, Optional, Type, Union from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 @@ -32,8 +32,24 @@ def __init__( self.timers = defaultdict(_Timer) - def remove(self, name: str): - """Remove a connector piece by . + @override(ConnectorV2) + def __call__( + self, + input_: Any, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + **kwargs, + ) -> Any: + """""" + ret = input_ + for connector in self.connectors: + timer = self.timers[str(connector)] + with timer: + ret = connector(input_=ret, episodes=episodes, ctx=ctx) + return ret + + def remove(self, name_or_class: Union[str, Type]): + """Remove a single connector piece in this pipeline by its name or class. Args: name: The name of the connector piece to be removed from the pipeline. @@ -50,7 +66,7 @@ def remove(self, name: str): else: logger.warning(f"Trying to remove a non-existent connector {name}.") - def insert_before(self, name: str, connector: ConnectorV2): + def insert_before(self, name_or_class: Union[str, Type], connector: ConnectorV2): """Insert a new connector before connector Args: @@ -72,7 +88,7 @@ def insert_before(self, name: str, connector: ConnectorV2): f"to {self.__class__.__name__}." ) - def insert_after(self, name: str, connector: ConnectorV2): + def insert_after(self, name_or_class: Union[str, Type], connector: ConnectorV2): """Insert a new connector after connector Args: @@ -122,19 +138,6 @@ def append(self, connector: ConnectorV2): f"{self.__class__.__name__}." ) - def __call__( - self, - input_: Any, - episodes: List[EpisodeType], - ctx: ConnectorContextV2, - ) -> Any: - ret = input_ - for connector in self.connectors: - timer = self.timers[str(connector)] - with timer: - ret = connector(input_=ret, episodes=episodes, ctx=ctx) - return ret - # @override(ConnectorV2) # def serialize(self): # children = [] @@ -179,7 +182,7 @@ def __getitem__(self, key: Union[str, int, type]): the specified class. Args: - key: The key to index by + key: The key to index by. Returns: The Connector at index `key`. """ @@ -216,44 +219,3 @@ def _fix_input_output_types(self): else: self.input_type = None self.output_type = None - - -class EnvToModulePipeline(ConnectorPipelineV2): - def __init__( - self, *, ctx, connectors: Optional[List[ConnectorV2]] = None, **kwargs - ): - super().__init__(ctx=ctx, connectors=connectors, **kwargs) - # Add the default final connector piece for env-to-module pipelines: - # Extracting last obs from episodes and add them to input, iff this has not - # happened in any connector piece in this pipeline before. - if ( - len(self.connectors) == 0 - or type(self.connectors[-1]) is not DefaultEnvToModule - ): - self.append(DefaultEnvToModule(ctx=ctx)) - - def __call__(self, *, input_: Optional[Any] = None, episodes, ctx, **kwargs): - # Make sure user does not necessarily send initial input into this pipeline. - # Might just be empty and to be populated from `episodes`. 
- return super().__call__( - input_=input_ or {}, - episodes=episodes, - ctx=ctx, - **kwargs, - ) - - -class ModuleToEnvPipeline(ConnectorPipelineV2): - def __init__( - self, *, ctx, connectors: Optional[List[ConnectorV2]] = None, **kwargs - ): - super().__init__(ctx=ctx, connectors=connectors, **kwargs) - - # Add the default final connector piece for env-to-module pipelines: - # Sampling actions from action_dist_inputs and add them to input, iff this has - # not happened in any connector piece in this pipeline before. - if ( - len(self.connectors) == 0 - or type(self.connectors[-1]) is not DefaultModuleToEnv - ): - self.append(DefaultModuleToEnv(ctx=ctx)) diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index ba18a422b36e4..0c80ab64d2228 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -1,5 +1,5 @@ import abc -from typing import Any, List, Tuple +from typing import Any, Dict, List, Tuple from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 from ray.rllib.connectors.input_output_types import INPUT_OUTPUT_TYPES @@ -9,19 +9,45 @@ @PublicAPI(stability="alpha") class ConnectorV2(abc.ABC): - """Connector base class. - - A connector performs a transformation step, either on envrionment data before it - gets to the RLModule, or on RLModule output before it is sent back to the - environment. - - Connectors may be training-aware, for example, behave slightly differently - during training and inference. - - All connectors are required to be serializable and implement the `serialize()` method. + """Base class defining the API for an individual "connector piece". + + A ConnectorV2 ("connector piece") is usually part of a series of pieces within + a "connector pipeline", which in itself also abides to this very API. + For example, you might have a connector pipeline consisting of two connector pieces, + A and B, both instances of subclasses of ConnectorV2 and each one performing a + particular transformation on their input data. The resulting connector pipeline + (A->B) itself also abides to this very ConnectorV2 API and could thus be part of yet + another, higher-level connector pipeline. + + Any ConnectorV2 instances (individual pieces or several connector pieces in a + pipeline) must be callable by overriding their `__call__()` method. When called, + they take the outputs of a previous connector piece (or an empty dict if there are + no previous pieces) as well as all the data collected thus far in the ongoing + episode(s) (only applies to connectors used in EnvRunners) or retrieved from a + replay buffer or from an environment sampling step (only applies to connectors used + in Learner pipelines). From this data (previous piece's output and possibly + episodes), a ConnectorV2 then performs a transformation step. + + There are 3 types of pipelines a ConnectorV2 can belong to: + 1) env-to-module: The connector transforms envrionment data before it gets to the + RLModule. + 2) module-to-env: The connector transforms RLModule outputs before they are sent + back to the environment (as actions). + 3) learner pipeline: The connector transforms data coming directly from an + environment sampling step or a replay buffer and will be sent into the RLModule's + `forward_train()` method afterwards to compute the loss inputs. + + Some connectors might be stateful, for example for keeping track of observation + filtering stats (mean and stddev values). 
States of all connectors and connector + pipelines are frequently being synchronized between the EnvRunners (owning the + env-to-module and module-to-env pipelines) and the Learners (owning the Learner + pipelines). """ # Set these in ALL subclasses. + # TODO (sven): Irrelevant for single-agent cases. Once multi-agent is supported + # by ConnectorV2, we need to elaborate more on the different input/output types. + # For single-agent, the types should always be just INPUT_OUTPUT_TYPES.DATA. input_type = INPUT_OUTPUT_TYPES.DATA output_type = INPUT_OUTPUT_TYPES.DATA @@ -29,7 +55,7 @@ def __init__(self, *, ctx: ConnectorContextV2, **kwargs): """Initializes a ConnectorV2 instance. Args: - ctx: The current ConnectorContextV2. + ctx: The initial ConnectorContextV2. **kwargs: Forward API-compatibility kwargs. """ self.ctx = ctx @@ -48,13 +74,14 @@ def __call__( Args: input_: The input data abiding to `self.input_type` to be transformed by this connector. Transformations might either be done in-place or a new - structure may be returned that matches `self.output_type`. + structure may be returned. The returned data must match + `self.output_type`. episodes: The list of SingleAgentEpisode or MultiAgentEpisode objects, - each corresponding to one slot in the vector env. Note that episodes - should always be considered read-only and not be altered. - ctx: The ConnectorContext that might be used to pass along other important - information in between connector pieces (even across pipelines). - kwargs: Forward API-compatibility kwargs. + each corresponding to one slot in a gym.vector.Env. + ctx: The ConnectorContextV2, containing the current Env, RLModule, and other + context-relevant information. It can also be used to pass along + information between connector pieces (even across different pipelines). + **kwargs: Forward API-compatibility kwargs. Returns: The transformed connector output abiding to `self.output_type`. @@ -63,31 +90,22 @@ def __call__( def __str__(self, indentation: int = 0): return " " * indentation + self.__class__.__name__ - # @abc.abstractmethod - # def serialize(self) -> Tuple[str, Any]: - # """Serialize a connector into a JSON serializable Tuple. - - # `serialize()` is required, so that all Connectors are serializable. + def get_state(self) -> Dict[str, Any]: + """Returns the current state of this ConnectorV2. - # Returns: - # A tuple of connector's name and its serialized states. - # String should match the name used to register the connector, - # while state can be any single data structure that contains the - # serialized state of the connector. If a connector is stateless, - # state can simply be None. - # """ + Used for checkpointing (connectors may be stateful) as well as synchronization + between connectors that are run on the (distributed) EnvRunners vs those that + run on the (distributed) Learners. - # @staticmethod - # @abc.abstractmethod - # def from_state(ctx: ConnectorContextV2, params: Any) -> "ConnectorV2": - # """De-serialize a JSON params back into a Connector. - - # `from_state()` is required, so that all Connectors are serializable. + Returns: + A dict mapping str keys to state information. + """ + return {} - # Args: - # ctx: ConnectorContextV2 for constructing this connector. - # params: Serialized states of the connector to be recovered. + def set_state(self, state: Dict[str, Any]) -> None: + """Sets the state of this connector to the provided one. - # Returns: - # De-serialized connector. - # """ + Args: + state: The new state to set this connector to. 
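+
+        Example (sketch of the sync this enables; `pipeline_a` and `pipeline_b`
+        are assumed connector (pipeline) instances, e.g. one on a Learner and
+        one on an EnvRunner):
+
+        .. code-block:: python
+
+            # Bring pipeline_b up to date with pipeline_a's current state.
+            pipeline_b.set_state(pipeline_a.get_state())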
+ """ + pass diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py index 0b9eb2d8669a5..9d7616011b8c7 100644 --- a/rllib/connectors/env_to_module/default_env_to_module.py +++ b/rllib/connectors/env_to_module/default_env_to_module.py @@ -15,13 +15,18 @@ @PublicAPI(stability="alpha") class DefaultEnvToModule(ConnectorV2): - """Default env-to-module-connector always in the pipeline at the very end. + """Default connector piece added by RLlib to the end of any env-to-module pipeline. - Makes sure that there is at least an observation (the most recent one) for each - agent as well as a state - in case the RLModule is recurrent. Doesn't do anything - in case other pieces in the pipeline already take care of populating these fields. + Makes sure that the output data will have at the minimum: + a) An observation (the most recent one returned by `env.step()`) under the + SampleBatch.OBS key for each agent and + b) In case the RLModule is stateful, a STATE_IN key populated with the most recently + computed STATE_OUT. - TODO: Generalize to MultiAgentEpisodes. + The connector will not add any new data in case other connector pieces in the + pipeline already take care of populating these fields (obs and state in). + + TODO (sven): Generalize to MultiAgentEpisodes. """ @override(ConnectorV2) @@ -31,37 +36,40 @@ def __call__( episodes: List[EpisodeType], ctx: ConnectorContextV2, **kwargs, - ): - # If obs are not already part of the input, add the most recent ones (from all - # single-agent episodes). + ) -> Any: + # If observations cannot be found in `input`, add the most recent ones (from all + # episodes). if SampleBatch.OBS not in input_: + # Collect all most-recent observations from given episodes. observations = [] for episode in episodes: - # Make sure, we have at least one observation in the episode. - assert len(episode.observations) > 0 - observations.append(episode.observations[-1]) + observations.append(episode.get_observation(indices=-1)) + # Batch all collected observations together. input_[SampleBatch.OBS] = batch(observations) - # If our module is recurrent: - # - Add the most recent states to the inputs. - # - Make all inputs have T=1. + # If our module is stateful: + # - Add the most recent STATE_OUTs to `input_`. + # - Make all data in `input_` have a time rank (T=1). if ctx.rl_module.is_stateful(): + # Make all other inputs have an additional T=1 axis. + input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) + + # Collect all most recently computed STATE_OUT (or use initial states from + # RLModule if at beginning of episode). states = [] for episode in episodes: # Make sure, we have at least one observation in the episode. assert episode.observations - # TODO: Generalize to MultiAgentEpisodes. - # Episode just started, get initial state from our RLModule. + # TODO (sven): Generalize to MultiAgentEpisodes. + # Episode just started -> Get initial state from our RLModule. if len(episode) == 0: state = ctx.rl_module.get_initial_state() + # Episode is already ongoing -> Use most recent STATE_OUT. else: state = episode.extra_model_outputs[STATE_OUT][-1] states.append(state) - # Make all other inputs have an additional T=1 axis. - input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) - # Batch states (from list of individual vector sub-env states). # Note that state ins should NOT have the extra time dimension. 
input_[STATE_IN] = batch(states) diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py new file mode 100644 index 0000000000000..63630229e57bc --- /dev/null +++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py @@ -0,0 +1,47 @@ +from typing import Any, List, Optional + +from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 +from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType + + +class EnvToModulePipeline(ConnectorPipelineV2): + def __init__( + self, + *, + ctx: ConnectorContextV2, + connectors: Optional[List[ConnectorV2]] = None, + **kwargs, + ): + super().__init__(ctx=ctx, connectors=connectors, **kwargs) + # Add the default final connector piece for env-to-module pipelines: + # Extracting last obs from episodes and add them to input, iff this has not + # happened in any connector piece in this pipeline before. + if ( + len(self.connectors) == 0 + or type(self.connectors[-1]) is not DefaultEnvToModule + ): + self.append(DefaultEnvToModule(ctx=ctx)) + + @override(ConnectorPipelineV2) + def __call__( + self, + *, + input_: Optional[Any] = None, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + **kwargs, + ) -> Any: + # Make sure user does not have to send initial input into this pipeline. + # Might just be empty and to be populated from `episodes`. + return super().__call__( + input_=input_ or {}, + episodes=episodes, + ctx=ctx, + **kwargs, + ) + + diff --git a/rllib/connectors/learner/default_learner_connector.py b/rllib/connectors/learner/default_learner_connector.py index 592faa711a4a6..9a636a0fc0d9c 100644 --- a/rllib/connectors/learner/default_learner_connector.py +++ b/rllib/connectors/learner/default_learner_connector.py @@ -1,18 +1,24 @@ from functools import partial -from typing import Any +from typing import Any, List import numpy as np import tree from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.connectors.utils.zero_padding import ( + create_mask_and_seq_lens, + split_and_pad, + split_and_pad_single_record, +) from ray.rllib.core.models.base import STATE_IN, STATE_OUT from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import EpisodeType class DefaultLearnerConnector(ConnectorV2): - """Connector added by default by RLlib to the end of the learner connector pipeline. + """Connector added by default by RLlib to the end of any learner connector pipeline. If provided with `episodes` data, this connector piece makes sure that the final train batch going into the RLModule for updating (`forward_train()` call) contains @@ -21,7 +27,7 @@ class DefaultLearnerConnector(ConnectorV2): - Actions, rewards, terminal/truncation flags: From all episodes under the respective keys. - All data inside the episodes' `extra_model_outs` property, e.g. action logp and - action probs. + action probs under the respective keys. - States: If the RLModule is stateful, the episodes' STATE_OUTS will be extracted and restructured under a new STATE_IN key in such a way that the resulting STATE_IN batch has the shape (B', ...). 
       Here, B' is the sum of splits we have to do over
@@ -31,23 +37,32 @@ class DefaultLearnerConnector(ConnectorV2):
 
     If the user wants to customize their own data under the given keys (e.g. obs,
     actions, ...), they can extract from the episodes or recompute from `input_`
-    their own data and store it under those keys (in `input_`). In such a case, this
-    connector will not touch the data under these keys.
+    their own data and store it in `input_` under those keys. In this case, the
+    default connector does not change the data under these keys and simply acts as a
+    pass-through.
     """
 
-    def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs):
+    def __call__(
+        self,
+        input_: Any,
+        episodes: List[EpisodeType],
+        ctx: ConnectorContextV2,
+        **kwargs,
+    ) -> Any:
         # If episodes are provided, extract the essential data from them, but only if
-        # this data is not present yet in `input_`.
+        # the respective keys are not yet present in `input_`.
         if not episodes:
            return input_
 
-        # Get data dicts for all episodes.
+        # Get the data dicts for all episodes.
         data_dicts = [episode.get_data_dict() for episode in episodes]
 
         state_in = None
         T = ctx.rl_module.config.model_config_dict.get("max_seq_len")
 
-        # Special handling of STATE_OUT/STATE_IN keys:
+        # RLModule is stateful and STATE_IN is not found in `input_` (user's custom
+        # connectors have not provided this information yet) -> Perform separate
+        # handling of STATE_OUT/STATE_IN keys:
         if ctx.rl_module.is_stateful() and STATE_IN not in input_:
             if T is None:
                 raise ValueError(
@@ -57,11 +72,17 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs):
                     "You can set this dict and/or override keys in it via "
                     "`config.training(model={'max_seq_len': x})`."
                 )
-            # Get model init state.
-            init_state = convert_to_numpy(ctx.rl_module.get_initial_state())
+
+            # Before adding anything to `input_`, add the time axis to existing data.
+            # Note that `split_and_pad_single_record` expects the episodes' lengths,
+            # not the episode objects themselves.
+            episode_lens = [len(episode) for episode in episodes]
+            input_ = tree.map_structure(
+                lambda s: split_and_pad_single_record(s, episode_lens, T=T),
+                input_,
+            )
+
             # Get STATE_OUTs for all episodes and only keep those (as STATE_INs) that
             # are located at the `max_seq_len` edges (state inputs to RNNs only have a
             # B-axis, no T-axis).
+            init_state = convert_to_numpy(ctx.rl_module.get_initial_state())
             state_ins = []
             for episode, data_dict in zip(episodes, data_dicts):
                 # Remove state outs (should not be part of the T-axis rearrangements).
@@ -69,15 +90,16 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs):
                 state_ins.append(
                     tree.map_structure(
                         # [::T] = only keep every Tth (max_seq_len) state in.
-                        # [:-1] = shift state outs by one (ignore very last state out, but
-                        # therefore add the init state at the beginning).
+                        # [:-1] = shift state outs by one (ignore very last state out,
+                        # but therefore add the init state at the beginning).
                         lambda i, o: np.concatenate([[i], o[:-1]])[::T],
                         (
                             # Episode has a (reset) beginning -> Prepend initial state.
                             init_state
                             if episode.t_started == 0
-                            # Episode starts somewhere in the middle (is a cut continuation
-                            # chunk) -> Use previous chunk's last STATE_OUT as initial state.
+                            # Episode starts somewhere in the middle (is a cut
+                            # continuation chunk) -> Use previous chunk's last
+                            # STATE_OUT as initial state.
                             else episode.get_extra_model_outputs(
                                 key=STATE_OUT, indices=-len(episode) - 1
                             )
@@ -85,21 +107,14 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs):
                         state_outs,
                    )
                )
-            # Concatenate the individual episodes' state ins.
+ # Concatenate the individual episodes' STATE_INs. state_in = tree.map_structure(lambda *s: np.concatenate(s), *state_ins) - # Before adding anything else to the `input_`, add the time axis to existing - # data. - input_ = tree.map_structure( - lambda s: split_and_pad_single_record(s, episodes, T=T), - input_, - ) - # Set the reduce function for all the data we might still have to extract # from our list of episodes. This function takes a list of data (e.g. obs) # with each item in the list representing one episode and properly # splits along the time axis and zero-pads if necessary (based on - # max_seq_len). + # T=max_seq_len). reduce_fn = partial(split_and_pad, T=T) # No stateful module, normal batch (w/o T-axis or zero-padding). @@ -109,7 +124,8 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs): # episodes along the batch axis (axis=0). reduce_fn = np.concatenate - # Extract all data from the episodes, if not already in `input_`. + # Extract all data from the episodes and add to `input_`, if not already in + # `input_`. for key in [ SampleBatch.OBS, SampleBatch.ACTIONS, @@ -126,18 +142,19 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs): *[d[key] for d in data_dicts], ) - # Infos (always as lists). - # TODO:uncomment if SampleBatch.INFOS not in input_: - # input_[SampleBatch.INFOS] = sum( - # [d[SampleBatch.INFOS] for d in data_dicts], - # [], - # ) + # Handle infos (always lists, not numpy arrays). + if SampleBatch.INFOS not in input_: + input_[SampleBatch.INFOS] = sum( + [d[SampleBatch.INFOS] for d in data_dicts], + [], + ) + # Now that all "normal" fields are time-dim'd and zero-padded, add + # the STATE_IN column to `input_`. if ctx.rl_module.is_stateful(): - # Now that all "normal" fields are time-dim'd and zero-padded, add - # the STATE_IN column to `input_`. input_[STATE_IN] = state_in - # Create the zero-padding loss mask. + # Also, create the loss mask (b/c of our now possibly zero-padded data) as + # well as the seq_lens array and add these to `input_` as well. 
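+            # (Illustration: episode lengths [5, 3] with max_seq_len=4 result in
+            # seq_lens=[4, 1, 3] and a loss mask of shape (3, 4).)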
( input_["loss_mask"], input_[SampleBatch.SEQ_LENS], @@ -147,66 +164,3 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs): ) return input_ - - -def split_and_pad(episodes_data, T): - all_chunks = [] - - for data in episodes_data: - num_chunks = int(np.ceil(data.shape[0] / T)) - - for i in range(num_chunks): - start_index = i * T - end_index = start_index + T - - # Extract the chunk - chunk = data[start_index:end_index] - - # Pad the chunk if it's shorter than T - if chunk.shape[0] < T: - padding_shape = [(0, T - chunk.shape[0])] + [ - (0, 0) for _ in range(chunk.ndim - 1) - ] - chunk = np.pad(chunk, pad_width=padding_shape, mode="constant") - - all_chunks.append(chunk) - - # Combine all chunks into a single array - result = np.concatenate(all_chunks, axis=0) - - # Reshape the array to include the time dimension T - # The new shape should be (-1, T) + original dimensions (excluding the batch dimension) - result = result.reshape((-1, T) + result.shape[1:]) - - return result - - -def split_and_pad_single_record(data, episodes, T): - episodes_data = [] - idx = 0 - for episode in episodes: - len_ = len(episode) - episodes_data.append(data[idx : idx + len_]) - idx += len_ - return split_and_pad(episodes_data, T) - - -def create_mask_and_seq_lens(episode_lens, T): - mask = [] - seq_lens = [] - for episode_len in episode_lens: - len_ = min(episode_len, T) - seq_lens.append(len_) - row = [1] * len_ + [0] * (T - len_) - mask.append(row) - - # Handle sequence lengths greater than T. - overflow = episode_len - T - while overflow > 0: - len_ = min(overflow, T) - seq_lens.append(len_) - extra_row = [1] * len_ + [0] * (T - len_) - mask.append(extra_row) - overflow -= T - - return np.array(mask, dtype=np.bool_), np.array(seq_lens, dtype=np.int32) diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py index 5bf0a2af0c8a4..b3b8f8e181b1a 100644 --- a/rllib/connectors/module_to_env/default_module_to_env.py +++ b/rllib/connectors/module_to_env/default_module_to_env.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, List import numpy as np import tree # pip install dm_tree @@ -8,24 +8,29 @@ from ray.rllib.core.models.base import STATE_OUT from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType from ray.util.annotations import PublicAPI @PublicAPI(stability="alpha") class DefaultModuleToEnv(ConnectorV2): - """A connector that samples actions given action dist. inputs and a dist. class. + """Default connector piece added by RLlib to the end of any module-to-env pipeline. - The connector will only sample from the distribution, if the ACTIONS key - cannot be found in the connector's input. Otherwise, it'll behave simply as pass - through (noop). If ACTIONS is not present, but ACTION_DIST_INPUTS are, will create - a distribution from the RLModule and sample from it (deterministically, if - we are not exploring, stochastically, if we are). + If necessary, this connector samples actions, given action dist. inputs and a + dist. class. + The connector will only sample from the action distribution, if the + SampleBatch.ACTIONS key cannot be found in `input_`. Otherwise, it'll behave + as pass through (noop). 
If SampleBatch.ACTIONS is not present, but + SampleBatch.ACTION_DIST_INPUTS are, the connector will create a new action + distribution using the RLModule in the connector context and sample from this + distribution (deterministically, if we are not exploring, stochastically, if we + are). input_type: INPUT_OUTPUT_TYPES.DICT_OF_MODULE_IDS_TO_DATA Operates per RLModule as it will have to pull the action distribution from each in order to sample actions if necessary. Searches for the ACTIONS and - ACTION_DIST_INPUTS keys in a module's outputs and - should ACTIONS not be found - - sample actions from the module's action distribution. + ACTION_DIST_INPUTS keys in a module's outputs and - should ACTIONS not be + found - sample actions from the module's action distribution. output_type: INPUT_OUTPUT_TYPES.DICT_OF_MODULE_IDS_TO_DATA (same as input: data in, data out, however, data out might contain an additional ACTIONS key if it was not previously present @@ -33,8 +38,12 @@ class DefaultModuleToEnv(ConnectorV2): """ @override(ConnectorV2) - def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2) -> Any: - + def __call__( + self, + input_: Any, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + ) -> Any: # Loop through all modules that created some output. # for mid in input_.keys(): # sa_module = ctx.rl_module.get_module(module_id=mid) @@ -47,20 +56,20 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2) -> Any: if state: input_[STATE_OUT] = state - # ACTION_DIST_INPUTS field returned by `forward_exploration()` -> - # Create a distribution object. + # ACTION_DIST_INPUTS field returned by `forward_exploration|inference()` -> + # Create a new action distribution object. action_dist = None - # The RLModule has already computed actions. - if ( - SampleBatch.ACTION_DIST_INPUTS in input_ - and SampleBatch.ACTION_LOGP not in input_ - ): - dist_inputs = input_[SampleBatch.ACTION_DIST_INPUTS] + if SampleBatch.ACTION_DIST_INPUTS in input_: if ctx.explore: action_dist_class = ctx.rl_module.get_exploration_action_dist_cls() else: action_dist_class = ctx.rl_module.get_inference_action_dist_cls() - action_dist = action_dist_class.from_logits(dist_inputs) + action_dist = action_dist_class.from_logits( + input_[SampleBatch.ACTION_DIST_INPUTS] + ) + + # TODO (sven): Should this not already be taken care of by RLModule's + # `get_...action_dist_cls()` methods? if not ctx.explore: action_dist = action_dist.to_deterministic() @@ -71,25 +80,16 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2) -> Any: else: if action_dist is None: raise KeyError( - "Your RLModule's `forward_[explore|inference]()` methods must " - f"return a dict with either the {SampleBatch.ACTIONS} key or " - f"the {SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + "Your RLModule's `forward_[exploration|inference]()` methods must " + f"return a dict with either the '{SampleBatch.ACTIONS}' key or " + f"the '{SampleBatch.ACTION_DIST_INPUTS}' key in it (or both)!" ) actions = action_dist.sample() input_[SampleBatch.ACTIONS] = actions - # Compute action-logp and action-prob from distribution and add to - # output, if possible. + # For convenience and if possible, compute action logp from distribution + # and add to output. 
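+        # (E.g., PPO-style losses later require the log-probs of the sampled
+        # actions for computing their importance ratios.)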
         if action_dist is not None and SampleBatch.ACTION_LOGP not in input_:
             input_[SampleBatch.ACTION_LOGP] = action_dist.logp(actions)
 
         return input_
-
-    # @override(Connector)
-    # def serialize(self):
-    #     return ClipActions.__name__, None
-
-    # @staticmethod
-    # TODO
-    # def from_state(ctx: ConnectorContext, params: Any):
-    #     return ClipActions(ctx)
diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py
new file mode 100644
index 0000000000000..9b4685db8cfb8
--- /dev/null
+++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py
@@ -0,0 +1,27 @@
+from typing import Any, List, Optional
+
+from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2
+from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv
+
+
+class ModuleToEnvPipeline(ConnectorPipelineV2):
+    """The superclass for any module-to-env pipelines."""
+    def __init__(
+        self,
+        *,
+        ctx: ConnectorContextV2,
+        connectors: Optional[List[ConnectorV2]] = None,
+        **kwargs,
+    ):
+        super().__init__(ctx=ctx, connectors=connectors, **kwargs)
+
+        # Add the default final connector piece for module-to-env pipelines:
+        # Sampling actions from ACTION_DIST_INPUTS and adding them to the input, iff
+        # this has not happened in any connector piece of this pipeline before.
+        if (
+            len(self.connectors) == 0
+            or type(self.connectors[-1]) is not DefaultModuleToEnv
+        ):
+            self.append(DefaultModuleToEnv(ctx=ctx))
diff --git a/rllib/connectors/utils/__init__.py b/rllib/connectors/utils/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/rllib/connectors/utils/zero_padding.py b/rllib/connectors/utils/zero_padding.py
new file mode 100644
index 0000000000000..e34c0eab85cc5
--- /dev/null
+++ b/rllib/connectors/utils/zero_padding.py
@@ -0,0 +1,135 @@
+from typing import List, Tuple
+
+import numpy as np
+from numpy.typing import NDArray
+
+
+def create_mask_and_seq_lens(
+    episode_lens: List[int],
+    T: int,
+) -> Tuple[NDArray, NDArray]:
+    """Creates a loss mask and a seq_lens array, given episode lengths and T.
+
+    Args:
+        episode_lens: A list of episode lengths to infer the loss mask and seq_lens
+            array from.
+        T: The maximum number of timesteps in each "row", also known as the maximum
+            sequence length (max_seq_len). Episodes are split into chunks that are at
+            most `T` long and remaining timesteps will be zero-padded (and masked out).
+
+    Returns:
+        Tuple consisting of a) the loss mask to use (masking out areas that lie past
+        the end of an episode (or rollout) but had to be zero-padded due to the extra
+        time rank of length T) and b) the array of sequence lengths resulting from
+        splitting the given episodes into chunks of at most `T` timesteps.
+    """
+    mask = []
+    seq_lens = []
+    for episode_len in episode_lens:
+        len_ = min(episode_len, T)
+        seq_lens.append(len_)
+        row = [1] * len_ + [0] * (T - len_)
+        mask.append(row)
+
+        # Handle sequence lengths greater than T.
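+        # (E.g., an episode of length 10 with T=4 is split into three rows with
+        # seq_lens 4, 4, and 2, the last row being zero-padded by two timesteps.)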
+        overflow = episode_len - T
+        while overflow > 0:
+            len_ = min(overflow, T)
+            seq_lens.append(len_)
+            extra_row = [1] * len_ + [0] * (T - len_)
+            mask.append(extra_row)
+            overflow -= T
+
+    return np.array(mask, dtype=np.bool_), np.array(seq_lens, dtype=np.int32)
+
+
+def split_and_pad(data_chunks: List[NDArray], T: int) -> NDArray:
+    """Splits and zero-pads episode data into a single ndarray with a fixed T-axis.
+
+    Processes each data chunk in `data_chunks` (each coming from one episode) by
+    splitting the chunk into smaller sub-chunks, each of a maximum size `T`. If a
+    sub-chunk is smaller than `T`, it is right-padded with zeros to match the desired
+    size T. All sub-chunks are then re-combined (concatenated) into a single ndarray,
+    which is reshaped to include the new time dimension `T` as axis 1 (axis 0 is the
+    batch axis). The resulting output array has dimensions (B=number of sub-chunks,
+    T, ...), where '...' represents the original dimensions of the input data
+    (excluding the batch dimension).
+
+    Args:
+        data_chunks: A list where each element is a NumPy array representing
+            an episode. Each array's shape should be (episode_length, ...)
+            where '...' represents any number of additional dimensions.
+        T: The desired time dimension size for each chunk.
+
+    Returns:
+        A np.ndarray containing the reshaped and padded chunks. The shape of the
+        array will be (B, T, ...) where B is automatically determined by the number
+        of chunks in `data_chunks` and `T`.
+        '...' represents the original dimensions of the input data, excluding the
+        batch dimension.
+    """
+    all_chunks = []
+
+    for data_chunk in data_chunks:
+        num_sub_chunks = int(np.ceil(data_chunk.shape[0] / T))
+
+        for i in range(num_sub_chunks):
+            start_index = i * T
+            end_index = start_index + T
+
+            # Extract the chunk.
+            sub_chunk = data_chunk[start_index:end_index]
+
+            # Pad the chunk if it's shorter than T.
+            if sub_chunk.shape[0] < T:
+                padding_shape = [(0, T - sub_chunk.shape[0])] + [
+                    (0, 0) for _ in range(sub_chunk.ndim - 1)
+                ]
+                sub_chunk = np.pad(sub_chunk, pad_width=padding_shape, mode="constant")
+
+            all_chunks.append(sub_chunk)
+
+    # Combine all chunks into a single array.
+    result = np.concatenate(all_chunks, axis=0)
+
+    # Reshape the array to include the time dimension T.
+    # The new shape should be (-1, T) + original dimensions (excluding the
+    # batch dimension).
+    result = result.reshape((-1, T) + result.shape[1:])
+
+    return result
+
+
+def split_and_pad_single_record(
+    data: NDArray, episode_lengths: List[int], T: int
+) -> NDArray:
+    """Same as `split_and_pad`, but for data already concatenated over episodes.
+
+    Given an np.ndarray of data that is the result of a concatenation of data chunks
+    coming from different episodes, the lengths of these episodes, as well as the
+    maximum time dimension, split and possibly right-zero-pad this input data, such
+    that the resulting shape of the returned np.ndarray is (B', T, ...), where B' is
+    the number of generated sub-chunks and ... is the original shape of the data
+    (excluding the batch dim). T is the size of the newly inserted time axis (on
+    which zero-padding is applied if necessary).
+
+    Args:
+        data: The single np.ndarray input data to be split, zero-padded, and reshaped.
+        episode_lengths: The list of episode lengths, from which `data` was originally
+            concat'd.
+        T: The maximum number of timesteps on the T-axis in the resulting np.ndarray.
+ + Returns: + A single np.ndarray, which contains the same data as `data`, but split into sub- + chunks of max. size T (zero-padded if necessary at the end of individual + episodes), then reshaped to (B', T, ...). + """ + # Chop up `data` into chunks of max len=T, based on the lengths of the episodes + # where this data came from. + episodes_data = [] + idx = 0 + for episode_len in episode_lengths: + episodes_data.append(data[idx : idx + episode_len]) + idx += episode_len + # Send everything through `split_and_pad` to perform the actual splitting into + # sub-chunks of max len=T and zero-padding. + return split_and_pad(episodes_data, T) From d3dca2f5ec7e018ba57ff4d98c7c246df6904a27 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 17 Nov 2023 12:29:30 +0100 Subject: [PATCH 03/15] wip Signed-off-by: sven1977 --- rllib/BUILD | 16 +- rllib/connectors/connector_pipeline_v2.py | 207 +++++++++++------- .../env_to_module/env_to_module_pipeline.py | 6 +- .../learner/learner_connector_pipeline.py | 32 +++ 4 files changed, 177 insertions(+), 84 deletions(-) create mode 100644 rllib/connectors/learner/learner_connector_pipeline.py diff --git a/rllib/BUILD b/rllib/BUILD index d66ee968470d1..a34b26ce1a7b9 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -696,7 +696,7 @@ py_test( # -------------------------------------------------------------------- -# Connector tests +# Connector(V1) tests # rllib/connector/ # # Tag: connector @@ -723,6 +723,20 @@ py_test( srcs = ["connectors/tests/test_agent.py"] ) +# -------------------------------------------------------------------- +# ConnectorV2 tests +# rllib/connector/ +# +# Tag: connector_v2 +# -------------------------------------------------------------------- + +py_test( + name = "connectors/tests/test_connector_v2", + tags = ["team:rllib", "connector_v2"], + size = "small", + srcs = ["connectors/tests/test_connector_v2.py"] +) + # -------------------------------------------------------------------- # Env tests # rllib/env/ diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index f3d9c36508682..7ae84d0b08c8f 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -40,7 +40,13 @@ def __call__( ctx: ConnectorContextV2, **kwargs, ) -> Any: - """""" + """In a pipeline, we simply call each of our connector pieces after each other. + + Each connector piece receives as input the output of the previous connector + piece in the pipeline. + """ + # Loop through connector pieces and call each one with the output of the + # previous one. Thereby, time each connector piece's call. ret = input_ for connector in self.connectors: timer = self.timers[str(connector)] @@ -66,55 +72,92 @@ def remove(self, name_or_class: Union[str, Type]): else: logger.warning(f"Trying to remove a non-existent connector {name}.") - def insert_before(self, name_or_class: Union[str, Type], connector: ConnectorV2): - """Insert a new connector before connector + def insert_before( + self, + name_or_class: Union[str, Type], + connector: ConnectorV2, + ) -> ConnectorV2: + """Insert a new connector piece before an existing piece (by name or class). Args: - name: name of the connector before which a new connector + name_or_class: Name or class of the connector piece before which `connector` will get inserted. - connector: a new connector to be inserted. + connector: The new connector piece to be inserted. + + Returns: + The ConnectorV2 before which `connector` has been inserted. 
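+
+        Example (illustrative only; `MyPiece` and `SomeExistingPiece` are made-up
+        connector classes, not part of RLlib):
+            pipeline.insert_before("SomeExistingPiece", MyPiece(ctx=ctx))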
""" idx = -1 for idx, c in enumerate(self.connectors): - if c.__class__.__name__ == name: + if ( + ( + isinstance(name_or_class, str) + and c.__class__.__name__ == name_or_class + ) + or (isinstance(name_or_class, type) and c.__class__ is name_or_class) + ): break if idx < 0: - raise ValueError(f"Can not find connector {name}") + raise ValueError( + f"Can not find connector with name or type '{name_or_class}'!" + ) + next_connector = self.connectors[idx] + self.connectors.insert(idx, connector) self._fix_input_output_types() logger.info( - f"Inserted {connector.__class__.__name__} before {name} " + f"Inserted {connector.__class__.__name__} before {name_or_class} " f"to {self.__class__.__name__}." ) + return next_connector - def insert_after(self, name_or_class: Union[str, Type], connector: ConnectorV2): - """Insert a new connector after connector + def insert_after( + self, + name_or_class: Union[str, Type], + connector: ConnectorV2, + ) -> ConnectorV2: + """Insert a new connector piece after an existing piece (by name or class). Args: - name: name of the connector after which a new connector + name_or_class: Name or class of the connector piece after which `connector` will get inserted. - connector: a new connector to be inserted. + connector: The new connector piece to be inserted. + + Returns: + The ConnectorV2 after which `connector` has been inserted. """ idx = -1 for idx, c in enumerate(self.connectors): - if c.__class__.__name__ == name: + if ( + ( + isinstance(name_or_class, str) + and c.__class__.__name__ == name_or_class + ) + or (isinstance(name_or_class, type) and c.__class__ is name_or_class) + ): break if idx < 0: - raise ValueError(f"Can not find connector {name}") + raise ValueError( + f"Can not find connector with name or type '{name_or_class}'!" + ) + prev_connector = self.connectors[idx] + self.connectors.insert(idx + 1, connector) self._fix_input_output_types() logger.info( - f"Inserted {connector.__class__.__name__} after {name} " + f"Inserted {connector.__class__.__name__} after {name_or_class} " f"to {self.__class__.__name__}." ) - def prepend(self, connector: ConnectorV2): - """Append a new connector at the beginning of a connector pipeline. + return prev_connector + + def prepend(self, connector: ConnectorV2) -> None: + """Prepend a new connector at the beginning of a connector pipeline. Args: - connector: a new connector to be appended. + connector: The new connector piece to be prepended to this pipeline. """ self.connectors.insert(0, connector) self._fix_input_output_types() @@ -124,11 +167,11 @@ def prepend(self, connector: ConnectorV2): f"{self.__class__.__name__}." ) - def append(self, connector: ConnectorV2): + def append(self, connector: ConnectorV2) -> None: """Append a new connector at the end of a connector pipeline. Args: - connector: a new connector to be appended. + connector: The new connector piece to be appended to this pipeline. """ self.connectors.append(connector) self._fix_input_output_types() @@ -138,34 +181,30 @@ def append(self, connector: ConnectorV2): f"{self.__class__.__name__}." ) - # @override(ConnectorV2) - # def serialize(self): - # children = [] - # for c in self.connectors: - # state = c.serialize() - # assert isinstance(state, tuple) and len(state) == 2, ( - # "Serialized connector state must be in the format of " - # f"Tuple[name: str, params: Any]. Instead we got {state}" - # f"for connector {c.__name__}." 
-    #         )
-    #         children.append(state)
-    #     return ConnectorPipelineV2.__name__, children
-    #
-    # @override(ConnectorV2)
-    # @staticmethod
-    # def from_state(ctx: ConnectorContextV2, params: List[Any]):
-    #     assert (
-    #         type(params) == list
-    #     ), "AgentConnectorPipeline takes a list of connector params."
-    #     connectors = []
-    #     for state in params:
-    #         try:
-    #             name, subparams = state
-    #             connectors.append(get_connector(name, ctx, subparams))
-    #         except Exception as e:
-    #             logger.error(f"Failed to de-serialize connector state: {state}")
-    #             raise e
-    #     return ConnectorPipelineV2(ctx, connectors)
+    @override(ConnectorV2)
+    def get_state(self):
+        # TODO (sven): `serialize()` is still the old ConnectorV1 API; port this
+        #  method over to the new get_state/set_state protocol.
+        children = []
+        for c in self.connectors:
+            state = c.serialize()
+            assert isinstance(state, tuple) and len(state) == 2, (
+                "Serialized connector state must be in the format of "
+                f"Tuple[name: str, params: Any]. Instead we got {state} "
+                f"for connector {c.__class__.__name__}."
+            )
+            children.append(state)
+        return ConnectorPipelineV2.__name__, children
+
+    @override(ConnectorV2)
+    def set_state(self, state: List[Any]):
+        # TODO (sven): `get_connector` and `ctx` are old ConnectorV1 concepts; port
+        #  this method over to the new get_state/set_state protocol.
+        connectors = []
+        for connector_state in state:
+            try:
+                name, subparams = connector_state
+                connectors.append(get_connector(name, ctx, subparams))
+            except Exception as e:
+                logger.error(
+                    f"Failed to de-serialize connector state: {connector_state}"
+                )
+                raise e
+        self.connectors = connectors
+        self._fix_input_output_types()
 
     def __str__(self, indentation: int = 0):
         return "\n".join(
@@ -173,44 +212,50 @@ def __str__(self, indentation: int = 0):
             + [c.__str__(indentation + 4) for c in self.connectors]
         )
 
-    def __getitem__(self, key: Union[str, int, type]):
-        """Returns a list of connectors that fit 'key'.
+    def __getitem__(
+        self,
+        key: Union[str, int, Type],
+    ) -> Union[ConnectorV2, List[ConnectorV2]]:
+        """Returns a single ConnectorV2 or list of ConnectorV2s that fit `key`.
 
-        If key is a number n, we return a list with the nth element of this pipeline.
-        If key is a Connector class or a string matching the class name of a
-        Connector class, we return a list of all connectors in this pipeline matching
-        the specified class.
+        If key is an int, we return the single ConnectorV2 at that index in this
+        pipeline.
+        If key is a ConnectorV2 type or a string matching the class name of a
+        ConnectorV2 in this pipeline, we return a list of all ConnectorV2s in this
+        pipeline matching the specified class.
 
         Args:
-            key: The key to index by.
+            key: The key to find or to index by.
 
-        Returns: The Connector at index `key`.
+        Returns:
+            A single ConnectorV2 or a list of ConnectorV2s matching `key`.
         """
-        # In case key is a class
-        if not isinstance(key, str):
-            if isinstance(key, slice):
-                raise NotImplementedError(
-                    "Slicing of ConnectorPipeline is currently not supported."
-                )
-            elif isinstance(key, int):
-                return [self.connectors[key]]
-            elif isinstance(key, type):
-                results = []
-                for c in self.connectors:
-                    if issubclass(c.__class__, key):
-                        results.append(c)
-                return results
-            else:
-                raise NotImplementedError(
-                    "Indexing by {} is currently not supported.".format(type(key))
-                )
-
-        results = []
-        for c in self.connectors:
-            if c.__class__.__name__ == key:
-                results.append(c)
-
-        return results
+        # Key is an int -> Index into pipeline and return.
+        if isinstance(key, int):
+            return self.connectors[key]
+        # Key is a class.
+        elif isinstance(key, type):
+            results = []
+            for c in self.connectors:
+                if issubclass(c.__class__, key):
+                    results.append(c)
+            return results
+        # Key is a string -> Find connector(s) by name.
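+        # (E.g., `pipeline["MyFilter"]` returns a list of all pieces in this
+        # pipeline whose class is named "MyFilter" - an illustrative name here.)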
+        elif isinstance(key, str):
+            results = []
+            for c in self.connectors:
+                if c.__class__.__name__ == key:
+                    results.append(c)
+            return results
+        # Slicing not supported (yet).
+        elif isinstance(key, slice):
+            raise NotImplementedError(
+                "Slicing of ConnectorPipelineV2 is currently not supported!"
+            )
+        else:
+            raise NotImplementedError(
+                f"Indexing ConnectorPipelineV2 by {type(key)} is currently not "
+                f"supported!"
+            )
 
     def _fix_input_output_types(self):
         if len(self.connectors) > 0:
diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py
index 63630229e57bc..6d03242a8a38f 100644
--- a/rllib/connectors/env_to_module/env_to_module_pipeline.py
+++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py
@@ -35,9 +35,11 @@ def __call__(
         ctx: ConnectorContextV2,
         **kwargs,
     ) -> Any:
-        # Make sure user does not have to send initial input into this pipeline.
-        # Might just be empty and to be populated from `episodes`.
+
         return super().__call__(
+            # Make sure the user does not have to send an initial `input_` into this
+            # env-to-module pipeline. This would be the expected behavior b/c after
+            # calling the env, we don't have any data dict yet, only a list of
+            # Episode objects.
            input_=input_ or {},
            episodes=episodes,
            ctx=ctx,
diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py
new file mode 100644
index 0000000000000..dce1180516d7a
--- /dev/null
+++ b/rllib/connectors/learner/learner_connector_pipeline.py
@@ -0,0 +1,32 @@
+from typing import Any, List, Optional
+
+from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2
+from ray.rllib.connectors.learner.default_learner_connector import (
+    DefaultLearnerConnector
+)
+
+
+class LearnerConnectorPipeline(ConnectorPipelineV2):
+    """The superclass for any learner connector pipelines."""
+    def __init__(
+        self,
+        *,
+        ctx: ConnectorContextV2,
+        connectors: Optional[List[ConnectorV2]] = None,
+        **kwargs,
+    ):
+        super().__init__(ctx=ctx, connectors=connectors, **kwargs)
+
+        # Add the default final connector piece for learner pipelines:
+        # Makes sure observations from episodes are in the train batch as well as
+        # the correct state inputs in case the RLModule is stateful. In the latter
+        # case, also takes care of the time rank and zero padding.
+        if (
+            len(self.connectors) == 0
+            or type(self.connectors[-1]) is not DefaultLearnerConnector
+        ):
+            # Append default learner connector piece at the end.
+            self.append(DefaultLearnerConnector(ctx=ctx))
+
From b0b3c377def4536db32b6bd2826210e4f79bd449 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 14 Dec 2023 12:28:00 +0100
Subject: [PATCH 04/15] LINT

Signed-off-by: sven1977
---
 rllib/connectors/connector_pipeline_v2.py     | 16 ++++------------
 .../env_to_module/env_to_module_pipeline.py   |  2 --
 .../learner/learner_connector_pipeline.py     |  4 ++--
 .../module_to_env/module_to_env_pipeline.py   |  1 +
 4 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py
index 7ae84d0b08c8f..d53708b190eac 100644
--- a/rllib/connectors/connector_pipeline_v2.py
+++ b/rllib/connectors/connector_pipeline_v2.py
@@ -90,12 +90,8 @@ def insert_before(
         idx = -1
         for idx, c in enumerate(self.connectors):
             if (
-                (
-                    isinstance(name_or_class, str)
-                    and c.__class__.__name__ == name_or_class
-                )
-                or (isinstance(name_or_class, type) and c.__class__ is name_or_class)
-            ):
+                isinstance(name_or_class, str) and c.__class__.__name__ == name_or_class
+            ) or (isinstance(name_or_class, type) and c.__class__ is name_or_class):
                 break
         if idx < 0:
             raise ValueError(
@@ -130,12 +126,8 @@ def insert_after(
         idx = -1
         for idx, c in enumerate(self.connectors):
             if (
-                (
-                    isinstance(name_or_class, str)
-                    and c.__class__.__name__ == name_or_class
-                )
-                or (isinstance(name_or_class, type) and c.__class__ is name_or_class)
-            ):
+                isinstance(name_or_class, str) and c.__class__.__name__ == name_or_class
+            ) or (isinstance(name_or_class, type) and c.__class__ is name_or_class):
                 break
         if idx < 0:
             raise ValueError(
diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py
index 6d03242a8a38f..3b985d3944886 100644
--- a/rllib/connectors/env_to_module/env_to_module_pipeline.py
+++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py
@@ -45,5 +45,3 @@ def __call__(
             ctx=ctx,
             **kwargs,
         )
-
-
diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py
index dce1180516d7a..5725f2a7a252e 100644
--- a/rllib/connectors/learner/learner_connector_pipeline.py
+++ b/rllib/connectors/learner/learner_connector_pipeline.py
@@ -4,12 +4,13 @@
 from ray.rllib.connectors.connector_v2 import ConnectorV2
 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2
 from ray.rllib.connectors.learner.default_learner_connector import (
-    DefaultLearnerConnector
+    DefaultLearnerConnector,
 )
 
 
 class LearnerConnectorPipeline(ConnectorPipelineV2):
     """The superclass for any learner connector pipelines."""
+
     def __init__(
         self,
         *,
@@ -29,4 +30,3 @@ def __init__(
         ):
             # Append default learner connector piece at the end.
self.append(DefaultLearnerConnector(ctx=ctx)) - diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py index 9b4685db8cfb8..b1b3be1d35b48 100644 --- a/rllib/connectors/module_to_env/module_to_env_pipeline.py +++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py @@ -8,6 +8,7 @@ class ModuleToEnvPipeline(ConnectorPipelineV2): """The superclass for any module-to-env pipelines.""" + def __init__( self, *, From 4df7dfef805e86f11a2da1b539ce7420f44f76fe Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 14 Dec 2023 13:14:25 +0100 Subject: [PATCH 05/15] wip Signed-off-by: sven1977 --- rllib/connectors/connector_context_v2.py | 70 --------- rllib/connectors/connector_pipeline_v2.py | 10 +- rllib/connectors/connector_v2.py | 119 ++++++++++++---- .../env_to_module/default_env_to_module.py | 21 +-- .../env_to_module/prev_action_prev_reward.py | 133 ++++++++++++++++++ .../learner/default_learner_connector.py | 105 +++++++++++--- .../module_to_env/default_module_to_env.py | 30 ++-- 7 files changed, 348 insertions(+), 140 deletions(-) delete mode 100644 rllib/connectors/connector_context_v2.py create mode 100644 rllib/connectors/env_to_module/prev_action_prev_reward.py diff --git a/rllib/connectors/connector_context_v2.py b/rllib/connectors/connector_context_v2.py deleted file mode 100644 index fff114618dd15..0000000000000 --- a/rllib/connectors/connector_context_v2.py +++ /dev/null @@ -1,70 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Optional - -from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.utils.typing import AgentID, EnvType -from ray.util.annotations import PublicAPI - - -@PublicAPI(stability="alpha") -@dataclass -class ConnectorContextV2: - """Information needed by pieces of connector pipeline to communicate with each other. - - ConnectorContextV2 will be passed to each connector (pipeline) call. - Also might contain references to the RLModule used, the Env, as well as whether - `explore` is True or False (whether forward_exploration or forward_inference was - used). - - TODO: Describe use cases, e.g. - - state out need to be fed back as state ins. - Unless we would like to temporarily store the states in the episode. - - agent_to_policy_mappings need to be stored as they might be stochastic. Then the - to_env pipeline can properly map back from module (formerly known as policy) IDs - to agent IDs. - - Attributes: - env: The Env object used to reset/step through in the current Env->Module - setup. This will be None in contexts used in a Learner connector pipeline. - rl_module: The RLModule used for either action computing forward passes - (`forward_exploration|inference()`) in the current Env->Module setup - or `forward_train()` calls in a Learner connector pipeline. - explore: Whether `explore` is currently on. Per convention, if True, the - RLModule's `forward_exploration()` method should be called, if False, the - EnvRunner should call `forward_inference()` instead. Should be None inside - Learner connector pipelines. - agent_id: The (optional) current agent ID that the connector should be - creating/extracting data for. - episode_index: The (optional) index within the list of SingleAgentEpisodes or - MultiAgentEpisodes, which each connector is given in a call, that belongs - to the given agent_id. - data: Optional additional context data that needs to be exchanged between - different Connector pieces and -pipelines. 
-
-    TODO (sven): Maybe we should have to AlgorithmConfig here as well.
-    """
-
-    env: Optional[EnvType] = None
-    rl_module: Optional[RLModule] = None
-    explore: Optional[bool] = None
-    data: Optional[Any] = None
-
-    # TODO (sven): Do these have to be here??
-    agent_id: Optional[AgentID] = None
-    episode_index: Optional[int] = None
-
-    def add_data(self, key, value):
-        assert key not in self.data
-        self.data[key] = value
-
-    def get_data(self, key):
-        assert key in self.data
-        return self.data[key]
-
-    def override_data(self, key, value):
-        assert key in self.data
-        self.data[key] = value
-
-    def del_data(self, key):
-        assert key in self.data
-        del self.data[key]
diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py
index d53708b190eac..6e45a5792c695 100644
--- a/rllib/connectors/connector_pipeline_v2.py
+++ b/rllib/connectors/connector_pipeline_v2.py
@@ -1,12 +1,16 @@
 from collections import defaultdict
 import logging
 from typing import Any, List, Optional, Type, Union
+
+import gymnasium as gym
 
 from ray.rllib.connectors.connector_v2 import ConnectorV2
-from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
 from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule
 from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv
+from ray.rllib.connectors.learner.default_learner_connector import (
+    DefaultLearnerConnector,
+)
+from ray.rllib.core.rl_module.rl_module import RLModule
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.typing import EpisodeType
 from ray.util.annotations import PublicAPI
 from ray.util.timer import _Timer
diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py
index 0c80ab64d2228..b201c804ca2d0 100644
--- a/rllib/connectors/connector_v2.py
+++ b/rllib/connectors/connector_v2.py
@@ -1,8 +1,10 @@
 import abc
-from typing import Any, Dict, List, Tuple
+from typing import Any, List, Optional
+
+import gymnasium as gym
 
-from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
 from ray.rllib.connectors.input_output_types import INPUT_OUTPUT_TYPES
+from ray.rllib.core.rl_module.rl_module import RLModule
 from ray.rllib.utils.typing import EpisodeType
 from ray.util.annotations import PublicAPI
 
@@ -11,37 +13,44 @@ class ConnectorV2(abc.ABC):
     """Base class defining the API for an individual "connector piece".
 
-    A ConnectorV2 ("connector piece") is usually part of a series of pieces within
-    a "connector pipeline", which in itself also abides to this very API.
+    A ConnectorV2 ("connector piece") is usually part of a whole series of connector
+    pieces within a so-called connector pipeline, which in itself also abides to this
+    very API.
     For example, you might have a connector pipeline consisting of two connector
     pieces, A and B, both instances of subclasses of ConnectorV2 and each one
     performing a particular transformation on their input data. The resulting
     connector pipeline (A->B) itself also abides to this very ConnectorV2 API and
     could thus be part of yet another, higher-level connector pipeline.
 
-    Any ConnectorV2 instances (individual pieces or several connector pieces in a
-    pipeline) must be callable by overriding their `__call__()` method.
When called,
-    they take the outputs of a previous connector piece (or an empty dict if there are
-    no previous pieces) as well as all the data collected thus far in the ongoing
-    episode(s) (only applies to connectors used in EnvRunners) or retrieved from a
-    replay buffer or from an environment sampling step (only applies to connectors used
-    in Learner pipelines). From this data (previous piece's output and possibly
-    episodes), a ConnectorV2 then performs a transformation step.
+    Any ConnectorV2 instance (an individual piece or several connector pieces in a
+    pipeline) is callable, and subclasses should override its `__call__()` method.
+    When called, they take the outputs of a previous connector piece (or an empty dict
+    if there are no previous pieces) as well as all the data collected thus far in the
+    ongoing episode(s) (only applies to connectors used in EnvRunners) or retrieved
+    from a replay buffer or from an environment sampling step (only applies to
+    connectors used in Learner pipelines). From this input data, a ConnectorV2 then
+    performs a transformation step.
 
     There are 3 types of pipelines a ConnectorV2 can belong to:
     1) env-to-module: The connector transforms environment data before it gets to the
-        RLModule.
+        RLModule. This type of pipeline is used by an EnvRunner for transforming
+        env output data to RLModule-readable data (for the next RLModule forward pass).
     2) module-to-env: The connector transforms RLModule outputs before they are sent
-        back to the environment (as actions).
+        back to the environment (as actions). This type of pipeline is used by an
+        EnvRunner to transform RLModule output data to env-readable actions (for the
+        next `env.step()` call).
     3) learner pipeline: The connector transforms data coming directly from an
     environment sampling step or a replay buffer and will be sent into the RLModule's
-    `forward_train()` method afterwards to compute the loss inputs.
+    `forward_train()` method afterwards to compute the loss inputs. This type of
+    pipeline is used by a Learner to transform raw training data (a batch or a list of
+    episodes) to RLModule-readable training data (for the next RLModule
+    `forward_train()` call).
 
     Some connectors might be stateful, for example for keeping track of observation
-    filtering stats (mean and stddev values). States of all connectors and connector
-    pipelines are frequently being synchronized between the EnvRunners (owning the
-    env-to-module and module-to-env pipelines) and the Learners (owning the Learner
-    pipelines).
+    filtering stats (mean and stddev values). Any Algorithm that uses connectors is
+    responsible for frequently synchronizing the states of all connectors and connector
+    pipelines between the EnvRunners (owning the env-to-module and module-to-env
+    pipelines) and the Learners (owning the Learner pipelines).
     """
 
     # Set these in ALL subclasses.
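+
+    # A minimal, illustrative sketch of what a custom env-to-module piece could look
+    # like under this API (the class and helper names below are made up, not part of
+    # RLlib):
+    #
+    #     class AddMostRecentObs(ConnectorV2):
+    #         def __call__(self, *, rl_module, input_, episodes, **kwargs):
+    #             # Write the latest observation of each ongoing episode into the
+    #             # batch under the OBS column.
+    #             input_[SampleBatch.OBS] = batch(
+    #                 [e.get_observation(indices=-1) for e in episodes]
+    #             )
+    #             return input_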
@@ -51,22 +60,66 @@ class ConnectorV2(abc.ABC):
     input_type = INPUT_OUTPUT_TYPES.DATA
     output_type = INPUT_OUTPUT_TYPES.DATA
 
-    def __init__(self, *, ctx: ConnectorContextV2, **kwargs):
+    @property
+    def observation_space(self):
+        return self.input_observation_space
+
+    @observation_space.setter
+    def observation_space(self, value):
+        self.input_observation_space = value
+
+    @property
+    def action_space(self):
+        return self.input_action_space
+
+    @action_space.setter
+    def action_space(self, value):
+        self.input_action_space = value
+
+    def __init__(
+        self,
+        *,
+        input_observation_space: Optional[gym.Space],
+        input_action_space: Optional[gym.Space],
+        env: Optional[gym.Env] = None,
+        # rl_module: Optional["RLModule"] = None,
+        **kwargs,
+    ):
         """Initializes a ConnectorV2 instance.
 
         Args:
-            ctx: The initial ConnectorContextV2.
+            input_observation_space: The (mandatory) input observation space. This
+                is the space coming from a previous connector piece in the
+                (env-to-module or learner) pipeline or it is directly defined within
+                the used gym.Env.
+            input_action_space: The (mandatory) input action space. This
+                is the space coming from a previous connector piece in the
+                (module-to-env) pipeline or it is directly defined within the used
+                gym.Env.
+            env: An optional env object that the connector might need to know about.
+                Note that normally, env-to-module and module-to-env connectors get this
+                information at construction time, but learner connectors won't (b/c
+                Learner objects don't carry an environment object).
+            # rl_module: An optional RLModule object that the connector might need to
+            #     know about. Note that normally, only module-to-env connectors get
+            #     this information at construction time, but env-to-module and learner
+            #     connectors won't (b/c they get constructed before the RLModule).
             **kwargs: Forward API-compatibility kwargs.
         """
-        self.ctx = ctx
+        self.input_observation_space = input_observation_space
+        self.input_action_space = input_action_space
+        self.env = env
+        # self.rl_module = rl_module
 
     @abc.abstractmethod
     def __call__(
         self,
         *,
+        rl_module: RLModule,
         input_: Any,
         episodes: List[EpisodeType],
-        ctx: ConnectorContextV2,
+        explore: Optional[bool] = None,
+        persistent_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         """Method for transforming input data into output data.
 
@@ -74,14 +127,20 @@ def __call__(
         Args:
             input_: The input data abiding to `self.input_type` to be transformed by
                 this connector. Transformations might either be done in-place or a new
-                structure may be returned. The returned data must match
-                `self.output_type`.
+                structure may be returned that matches `self.output_type`.
             episodes: The list of SingleAgentEpisode or MultiAgentEpisode objects,
-                each corresponding to one slot in a gym.vector.Env.
-            ctx: The ConnectorContextV2, containing the current Env, RLModule, and other
-                context-relevant information. It can also be used to pass along
-                information between connector pieces (even across different pipelines).
-            **kwargs: Forward API-compatibility kwargs.
+                each corresponding to one slot in the vector env. Note that episodes
+                should always be considered read-only and not be altered.
+            rl_module: The RLModule that this connector operates on, e.g. used to
+                retrieve initial states or the action distribution class for the
+                upcoming forward pass.
+            explore: Whether `explore` is currently on. 
Per convention, if True, the + RLModule's `forward_exploration` method should be called, if False, the + EnvRunner should call `forward_inference` instead. + persistent_data: Optional additional context data that needs to be exchanged + between different Connector pieces and -pipelines. + kwargs: Forward API-compatibility kwargs. Returns: The transformed connector output abiding to `self.output_type`. diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py index 9d7616011b8c7..8239b5f2c2ebd 100644 --- a/rllib/connectors/env_to_module/default_env_to_module.py +++ b/rllib/connectors/env_to_module/default_env_to_module.py @@ -1,11 +1,11 @@ -from typing import Any, List +from typing import Any, List, Optional import numpy as np import tree from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 from ray.rllib.core.models.base import STATE_IN, STATE_OUT +from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.spaces.space_utils import batch @@ -32,9 +32,12 @@ class DefaultEnvToModule(ConnectorV2): @override(ConnectorV2) def __call__( self, - input_: Any, + *, + rl_module: RLModule, + input_: Optional[Any] = None, episodes: List[EpisodeType], - ctx: ConnectorContextV2, + explore: Optional[bool] = None, + persistent_data: Optional[dict] = None, **kwargs, ) -> Any: # If observations cannot be found in `input`, add the most recent ones (from all @@ -50,10 +53,7 @@ def __call__( # If our module is stateful: # - Add the most recent STATE_OUTs to `input_`. # - Make all data in `input_` have a time rank (T=1). - if ctx.rl_module.is_stateful(): - # Make all other inputs have an additional T=1 axis. - input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) - + if rl_module.is_stateful(): # Collect all most recently computed STATE_OUT (or use initial states from # RLModule if at beginning of episode). states = [] @@ -64,12 +64,15 @@ def __call__( # TODO (sven): Generalize to MultiAgentEpisodes. # Episode just started -> Get initial state from our RLModule. if len(episode) == 0: - state = ctx.rl_module.get_initial_state() + state = rl_module.get_initial_state() # Episode is already ongoing -> Use most recent STATE_OUT. else: state = episode.extra_model_outputs[STATE_OUT][-1] states.append(state) + # Make all other inputs have an additional T=1 axis. + input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) + # Batch states (from list of individual vector sub-env states). # Note that state ins should NOT have the extra time dimension. 
             input_[STATE_IN] = batch(states)
diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py
new file mode 100644
index 0000000000000..cf11edba10298
--- /dev/null
+++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py
@@ -0,0 +1,133 @@
+from functools import partial
+from typing import Any, List, Optional
+
+import gymnasium as gym
+import numpy as np
+
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.spaces.space_utils import batch
+from ray.rllib.utils.typing import EpisodeType
+
+
+class _PrevRewardPrevActionConnector(ConnectorV2):
+    """A connector piece that adds previous rewards and actions to the input."""
+
+    def __init__(
+        self,
+        *,
+        # Base class constructor args.
+        input_observation_space: Optional[gym.Space],
+        input_action_space: Optional[gym.Space],
+        env: Optional[gym.Env] = None,
+        # Specific prev. r/a args.
+        n_prev_actions: int = 1,
+        n_prev_rewards: int = 1,
+        as_learner_connector: bool = False,
+        **kwargs,
+    ):
+        """Initializes a _PrevRewardPrevActionConnector instance.
+
+        Args:
+            n_prev_actions: The number of previous actions to include in the output
+                data. Discrete actions are one-hot'd. If > 1, will concatenate the
+                individual action tensors.
+            n_prev_rewards: The number of previous rewards to include in the output
+                data.
+            as_learner_connector: Whether this connector is part of a Learner connector
+                pipeline, as opposed to an env-to-module pipeline.
+        """
+        super().__init__(
+            input_observation_space=input_observation_space,
+            input_action_space=input_action_space,
+            env=env,
+            **kwargs,
+        )
+
+        self.n_prev_actions = n_prev_actions
+        self.n_prev_rewards = n_prev_rewards
+        self.as_learner_connector = as_learner_connector
+
+    @override(ConnectorV2)
+    def __call__(
+        self,
+        *,
+        rl_module: RLModule,
+        input_: Optional[Any],
+        episodes: List[EpisodeType],
+        explore: Optional[bool] = None,
+        persistent_data: Optional[dict] = None,
+        **kwargs,
+    ) -> Any:
+        # This is a data-in-data-out connector, so we expect `input_` to be a dict
+        # with: key=column name, e.g. "obs", and value=[data to be processed by the
+        # RLModule]. We will just extract the most recent rewards and/or most recent
+        # actions from all episodes and store them inside the `input_` data dict.
+
+        prev_a = []
+        prev_r = []
+        for episode in episodes:
+            # TODO (sven): Get rid of this distinction. With the new Episode APIs,
+            #  this should work the same, whether on finalized or non-finalized
+            #  episodes.
+            # Learner connector pipeline. Episodes have been finalized/numpy'ized.
+            if self.as_learner_connector:
+                assert episode.is_finalized
+                # Loop through each timestep in the episode and add the previous n
+                # actions and previous m rewards (based on that timestep) to the
+                # batch.
+                for ts in range(len(episode)):
+                    prev_a.append(
+                        episode.get_actions(
+                            # Extract n actions from `ts - n` to `ts` (excluding
+                            # `ts`).
+                            indices=slice(ts - self.n_prev_actions, ts),
+                            # Make sure negative indices are NOT interpreted as
+                            # "counting from the end", but as absolute indices meaning
+                            # they refer to timesteps before 0 (which is the lookback
+                            # buffer).
+                            neg_indices_left_of_zero=True,
+                            # In case we are at the very beginning of the episode, e.g.
+                            # ts==0, fill the left side with zero-actions.
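+                            # (E.g., with n_prev_actions=2 and ts=0, this yields
+                            # two all-zero one-hot action vectors.)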
+ fill=0.0, + # Return one-hot arrays for those action components that are + # discrete or multi-discrete. + one_hot_discrete=True, + ) + ) + # Do the same for rewards. + prev_r.append( + episode.get_rewards( + indices=slice(ts - self.n_prev_rewards, ts), + neg_indices_left_of_zero=True, + fill=0.0, + ) + ) + # Env-to-module pipeline. Episodes still operate on lists. + else: + assert not episode.is_finalized + prev_a.append( + batch( + episode.get_actions( + indices=slice(-self.n_prev_actions, None), + fill=0.0, + one_hot_discrete=True, + ) + ) + ) + prev_r.append( + np.array( + episode.get_rewards( + indices=slice(-self.n_prev_rewards, None), + fill=0.0, + ) + ) + ) + + input_[SampleBatch.PREV_ACTIONS] = batch(prev_a) + input_[SampleBatch.PREV_REWARDS] = np.array(prev_r) + return input_ + + +PrevRewardPrevActionEnvToModule = partial( + _PrevRewardPrevActionConnector, as_learner_connector=False +) diff --git a/rllib/connectors/learner/default_learner_connector.py b/rllib/connectors/learner/default_learner_connector.py index 9a636a0fc0d9c..4216f4790b5f3 100644 --- a/rllib/connectors/learner/default_learner_connector.py +++ b/rllib/connectors/learner/default_learner_connector.py @@ -1,18 +1,14 @@ from functools import partial -from typing import Any, List +from typing import Any, List, Optional import numpy as np import tree from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 -from ray.rllib.connectors.utils.zero_padding import ( - create_mask_and_seq_lens, - split_and_pad, - split_and_pad_single_record, -) from ray.rllib.core.models.base import STATE_IN, STATE_OUT +from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import EpisodeType @@ -42,11 +38,15 @@ class DefaultLearnerConnector(ConnectorV2): pass-through. """ + @override(ConnectorV2) def __call__( self, + *, + rl_module: RLModule, input_: Any, episodes: List[EpisodeType], - ctx: ConnectorContextV2, + explore: Optional[bool] = None, + persistent_data: Optional[dict] = None, **kwargs, ) -> Any: # If episodes are provided, extract the essential data from them, but only if @@ -58,12 +58,12 @@ def __call__( data_dicts = [episode.get_data_dict() for episode in episodes] state_in = None - T = ctx.rl_module.config.model_config_dict.get("max_seq_len") + T = rl_module.config.model_config_dict.get("max_seq_len") # RLModule is stateful and STATE_IN is not found in `input_` (user's custom # connectors have not provided this information yet) -> Perform separate # handling of STATE_OUT/STATE_IN keys: - if ctx.rl_module.is_stateful() and STATE_IN not in input_: + if rl_module.is_stateful() and STATE_IN not in input_: if T is None: raise ValueError( "You are using a stateful RLModule and are not providing custom " @@ -72,17 +72,11 @@ def __call__( "You can set this dict and/or override keys in it via " "`config.training(model={'max_seq_len': x})`." ) - - # Before adding anything to `input_`, add the time axis to existing data. - input_ = tree.map_structure( - lambda s: split_and_pad_single_record(s, episodes, T=T), - input_, - ) - + # Get model init state. + init_state = convert_to_numpy(rl_module.get_initial_state()) # Get STATE_OUTs for all episodes and only keep those (as STATE_INs) that # are located at the `max_seq_len` edges (state inputs to RNNs only have a # B-axis, no T-axis). 
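+            # (E.g., with max_seq_len=4 and an episode of length 10, keep the states
+            # entering timesteps 0, 4, and 8 - one STATE_IN per generated row.)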
- init_state = convert_to_numpy(ctx.rl_module.get_initial_state()) state_ins = [] for episode, data_dict in zip(episodes, data_dicts): # Remove state outs (should not be part of the T-axis rearrangements). @@ -101,7 +95,7 @@ def __call__( # continuation chunk) -> Use previous chunk's last STATE_OUT # as initial state. else episode.get_extra_model_outputs( - key=STATE_OUT, indices=-len(episode) - 1 + key=STATE_OUT, indices=-1, neg_indices_left_of_zero=True ) ), state_outs, @@ -110,6 +104,13 @@ def __call__( # Concatenate the individual episodes' STATE_INs. state_in = tree.map_structure(lambda *s: np.concatenate(s), *state_ins) + # Before adding anything else to the `input_`, add the time axis to existing + # data. + input_ = tree.map_structure( + lambda s: split_and_pad_single_record(s, episodes, T=T), + input_, + ) + # Set the reduce function for all the data we might still have to extract # from our list of episodes. This function takes a list of data (e.g. obs) # with each item in the list representing one episode and properly @@ -151,7 +152,7 @@ def __call__( # Now that all "normal" fields are time-dim'd and zero-padded, add # the STATE_IN column to `input_`. - if ctx.rl_module.is_stateful(): + if rl_module.is_stateful(): input_[STATE_IN] = state_in # Also, create the loss mask (b/c of our now possibly zero-padded data) as # well as the seq_lens array and add these to `input_` as well. @@ -164,3 +165,67 @@ def __call__( ) return input_ + + +def split_and_pad(episodes_data, T): + all_chunks = [] + + for data in episodes_data: + num_chunks = int(np.ceil(data.shape[0] / T)) + + for i in range(num_chunks): + start_index = i * T + end_index = start_index + T + + # Extract the chunk + chunk = data[start_index:end_index] + + # Pad the chunk if it's shorter than T + if chunk.shape[0] < T: + padding_shape = [(0, T - chunk.shape[0])] + [ + (0, 0) for _ in range(chunk.ndim - 1) + ] + chunk = np.pad(chunk, pad_width=padding_shape, mode="constant") + + all_chunks.append(chunk) + + # Combine all chunks into a single array + result = np.concatenate(all_chunks, axis=0) + + # Reshape the array to include the time dimension T. + # The new shape should be (-1, T) + original dimensions (excluding the batch + # dimension) + result = result.reshape((-1, T) + result.shape[1:]) + + return result + + +def split_and_pad_single_record(data, episodes, T): + episodes_data = [] + idx = 0 + for episode in episodes: + len_ = len(episode) + episodes_data.append(data[idx : idx + len_]) + idx += len_ + return split_and_pad(episodes_data, T) + + +def create_mask_and_seq_lens(episode_lens, T): + mask = [] + seq_lens = [] + for episode_len in episode_lens: + len_ = min(episode_len, T) + seq_lens.append(len_) + row = [1] * len_ + [0] * (T - len_) + mask.append(row) + + # Handle sequence lengths greater than T. 
+ overflow = episode_len - T + while overflow > 0: + len_ = min(overflow, T) + seq_lens.append(len_) + extra_row = [1] * len_ + [0] * (T - len_) + mask.append(extra_row) + overflow -= T + + return np.array(mask, dtype=np.bool_), np.array(seq_lens, dtype=np.int32) diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py index b3b8f8e181b1a..395225f5d6a64 100644 --- a/rllib/connectors/module_to_env/default_module_to_env.py +++ b/rllib/connectors/module_to_env/default_module_to_env.py @@ -1,11 +1,11 @@ -from typing import Any, List +from typing import Any, List, Optional import numpy as np import tree # pip install dm_tree from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 from ray.rllib.core.models.base import STATE_OUT +from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import EpisodeType @@ -40,17 +40,22 @@ class DefaultModuleToEnv(ConnectorV2): @override(ConnectorV2) def __call__( self, + *, + rl_module: RLModule, input_: Any, episodes: List[EpisodeType], - ctx: ConnectorContextV2, + explore: Optional[bool] = None, + persistent_data: Optional[dict] = None, + **kwargs, ) -> Any: + # Loop through all modules that created some output. # for mid in input_.keys(): # sa_module = ctx.rl_module.get_module(module_id=mid) # If our RLModule is stateful, remove the T=1 axis from all model outputs # (except the state outs, which never have this extra time axis). - if ctx.rl_module.is_stateful(): + if rl_module.is_stateful(): state = input_.pop(STATE_OUT, None) input_ = tree.map_structure(lambda s: np.squeeze(s, axis=1), input_) if state: @@ -60,17 +65,17 @@ def __call__( # Create a new action distribution object. action_dist = None if SampleBatch.ACTION_DIST_INPUTS in input_: - if ctx.explore: - action_dist_class = ctx.rl_module.get_exploration_action_dist_cls() + if explore: + action_dist_class = rl_module.get_exploration_action_dist_cls() else: - action_dist_class = ctx.rl_module.get_inference_action_dist_cls() + action_dist_class = rl_module.get_inference_action_dist_cls() action_dist = action_dist_class.from_logits( input_[SampleBatch.ACTION_DIST_INPUTS] ) # TODO (sven): Should this not already be taken care of by RLModule's # `get_...action_dist_cls()` methods? - if not ctx.explore: + if not explore: action_dist = action_dist.to_deterministic() # If `forward_...()` returned actions, use them here as-is. 
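For readers skimming the diff: the default module-to-env piece above turns the RLModule's ACTION_DIST_INPUTS into env-ready actions by building a distribution, switching it to deterministic mode when explore=False, sampling, and computing log-probs. Below is a minimal, framework-free sketch of that control flow; SimpleCategorical is a made-up stand-in for RLlib's action-distribution classes, not an actual RLlib API.

import numpy as np


class SimpleCategorical:
    """Stand-in distribution exposing the same small API used above."""

    def __init__(self, logits, deterministic=False):
        # Row-wise softmax over the logits.
        e = np.exp(logits - logits.max(axis=-1, keepdims=True))
        self.probs = e / e.sum(axis=-1, keepdims=True)
        self.deterministic = deterministic

    @classmethod
    def from_logits(cls, logits):
        return cls(logits)

    def to_deterministic(self):
        return SimpleCategorical(np.log(self.probs), deterministic=True)

    def sample(self):
        if self.deterministic:
            # Greedy action (argmax) when not exploring.
            return self.probs.argmax(axis=-1)
        rng = np.random.default_rng()
        return np.array([rng.choice(len(p), p=p) for p in self.probs])

    def logp(self, actions):
        return np.log(self.probs[np.arange(len(actions)), actions])


logits = np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 3.0]])
explore = False

dist = SimpleCategorical.from_logits(logits)
if not explore:
    dist = dist.to_deterministic()
actions = dist.sample()  # -> array([0, 2]) in the deterministic case
action_logp = dist.logp(actions)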
@@ -93,3 +98,12 @@ def __call__( input_[SampleBatch.ACTION_LOGP] = action_dist.logp(actions) return input_ + + # @override(Connector) + # def serialize(self): + # return ClipActions.__name__, None + + # @staticmethod + # TODO + # def from_state(ctx: ConnectorContext, params: Any): + # return ClipActions(ctx) From 1de7ebbd155fd9be415c443348f1a64288f1e87a Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 14 Dec 2023 14:14:47 +0100 Subject: [PATCH 06/15] wip Signed-off-by: sven1977 --- rllib/connectors/connector_pipeline_v2.py | 29 +++++++++---- .../env_to_module/env_to_module_pipeline.py | 42 +++++++++++++------ .../learner/learner_connector_pipeline.py | 34 +++++++++------ .../module_to_env/module_to_env_pipeline.py | 28 +++++++++---- 4 files changed, 94 insertions(+), 39 deletions(-) diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index 6e45a5792c695..7f9336fe710d2 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -11,6 +11,7 @@ DefaultLearnerConnector ) from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import EpisodeType from ray.util.annotations import PublicAPI from ray.util.timer import _Timer @@ -25,11 +26,10 @@ class ConnectorPipelineV2(ConnectorV2): def __init__( self, *, - ctx: ConnectorContextV2, connectors: Optional[List[ConnectorV2]] = None, **kwargs, ): - super().__init__(ctx=ctx, **kwargs) + super().__init__(**kwargs) self.connectors = connectors or [] self._fix_input_output_types() @@ -39,9 +39,11 @@ def __init__( @override(ConnectorV2) def __call__( self, + rl_module: RLModule, input_: Any, episodes: List[EpisodeType], - ctx: ConnectorContextV2, + explore: Optional[bool] = None, + persistent_data: Optional[dict] = None, **kwargs, ) -> Any: """In a pipeline, we simply call each of our connector pieces after each other. @@ -55,7 +57,14 @@ def __call__( for connector in self.connectors: timer = self.timers[str(connector)] with timer: - ret = connector(input_=ret, episodes=episodes, ctx=ctx) + ret = connector( + rl_module=rl_module, + input_=ret, + episodes=episodes, + explore=explore, + persistent_data=persistent_data, + **kwargs, + ) return ret def remove(self, name_or_class: Union[str, Type]): @@ -66,19 +75,21 @@ def remove(self, name_or_class: Union[str, Type]): """ idx = -1 for i, c in enumerate(self.connectors): - if c.__class__.__name__ == name: + if c.__class__.__name__ == name_or_class: idx = i break if idx >= 0: del self.connectors[idx] self._fix_input_output_types() - logger.info(f"Removed connector {name} from {self.__class__.__name__}.") + logger.info(f"Removed connector {name_or_class} from {self.__class__.__name__}.") else: - logger.warning(f"Trying to remove a non-existent connector {name}.") + logger.warning( + f"Trying to remove a non-existent connector {name_or_class}." + ) def insert_before( self, - name_or_class: Union[str, Type], + name_or_class: Union[str, type], connector: ConnectorV2, ) -> ConnectorV2: """Insert a new connector piece before an existing piece (by name or class). 
@@ -257,6 +268,8 @@ def _fix_input_output_types(self):
         if len(self.connectors) > 0:
             self.input_type = self.connectors[0].input_type
             self.output_type = self.connectors[-1].output_type
+            #self.observation_space = self.connectors[-1].observation_space
+            #self.action_space = self.connectors[-1].action_space
         else:
             self.input_type = None
             self.output_type = None
diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py
index 3b985d3944886..e5b81c254589d 100644
--- a/rllib/connectors/env_to_module/env_to_module_pipeline.py
+++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py
@@ -1,8 +1,10 @@
 from typing import Any, List, Optional
 
-from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
+import gymnasium as gym
+
 from ray.rllib.connectors.connector_v2 import ConnectorV2
 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2
+from ray.rllib.core.rl_module.rl_module import RLModule
 from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.typing import EpisodeType
@@ -12,11 +14,21 @@ class EnvToModulePipeline(ConnectorPipelineV2):
     def __init__(
         self,
         *,
-        ctx: ConnectorContextV2,
         connectors: Optional[List[ConnectorV2]] = None,
+        input_observation_space: Optional[gym.Space],
+        input_action_space: Optional[gym.Space],
+        env: Optional[gym.Env] = None,
+        rl_module: Optional["RLModule"] = None,
         **kwargs,
     ):
-        super().__init__(ctx=ctx, connectors=connectors, **kwargs)
+        super().__init__(
+            connectors=connectors,
+            input_observation_space=input_observation_space,
+            input_action_space=input_action_space,
+            env=env,
+            rl_module=rl_module,
+            **kwargs,
+        )
         # Add the default final connector piece for env-to-module pipelines:
         # Extracting last obs from episodes and add them to input, iff this has not
         # happened in any connector piece in this pipeline before.
@@ -24,24 +36,30 @@ def __init__(
             len(self.connectors) == 0
             or type(self.connectors[-1]) is not DefaultEnvToModule
         ):
-            self.append(DefaultEnvToModule(ctx=ctx))
+            self.append(DefaultEnvToModule(
+                input_observation_space=self.observation_space,
+                input_action_space=self.action_space,
+                env=env,
+            ))
 
     @override(ConnectorPipelineV2)
     def __call__(
         self,
         *,
+        rl_module: RLModule,
        input_: Optional[Any] = None,
        episodes: List[EpisodeType],
-        ctx: ConnectorContextV2,
+        explore: bool,
+        persistent_data: Optional[dict] = None,
        **kwargs,
-    ) -> Any:
-
+    ):
+        # Make sure the user does not have to send an initial input into this
+        # pipeline; it might just be empty and will be populated from `episodes`.
         return super().__call__(
-            # Make sure user does not have to send initial `input_` into this env-to-module
-            # pipeline. This would be the expected behavior b/c after calling the env,
-            # we don't have any data dict yet, only a list of Episode objects.
- input_=input_ or {}, + rl_module=rl_module, + input_=input_ if input_ is not None else {}, episodes=episodes, - ctx=ctx, + explore=explore, + persistent_data=persistent_data, **kwargs, ) diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py index 5725f2a7a252e..78223f2c92f0e 100644 --- a/rllib/connectors/learner/learner_connector_pipeline.py +++ b/rllib/connectors/learner/learner_connector_pipeline.py @@ -1,6 +1,7 @@ -from typing import Any, List, Optional +from typing import List, Optional + +import gymnasium as gym -from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 from ray.rllib.connectors.learner.default_learner_connector import ( @@ -8,25 +9,34 @@ ) -class LearnerConnectorPipeline(ConnectorPipelineV2): - """The superclass for any module-to-env pipelines.""" - +class LearnerPipeline(ConnectorPipelineV2): def __init__( self, *, - ctx: ConnectorContextV2, connectors: Optional[List[ConnectorV2]] = None, + input_observation_space: Optional[gym.Space], + input_action_space: Optional[gym.Space], + env: Optional[gym.Env] = None, + rl_module: Optional["RLModule"] = None, **kwargs, ): - super().__init__(ctx=ctx, connectors=connectors, **kwargs) + super().__init__( + connectors=connectors, + input_observation_space=input_observation_space, + input_action_space=input_action_space, + env=env, + rl_module=rl_module, + **kwargs, + ) # Add the default final connector piece for learner pipelines: - # Makes sure observations from episodes are in the train batch as well as - # the correct state inputs in case the RLModule is stateful. In the latter case, - # also takes care of the time rank and zero padding. + # Making sure that we have - at the minimum - observations and that the data + # is time-ranked (if we have a stateful model) and properly zero-padded. if ( len(self.connectors) == 0 or type(self.connectors[-1]) is not DefaultLearnerConnector ): - # Append default learner connector piece at the end. 
- self.append(DefaultLearnerConnector(ctx=ctx)) + self.append(DefaultLearnerConnector( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + )) diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py index b1b3be1d35b48..a9621c3162c90 100644 --- a/rllib/connectors/module_to_env/module_to_env_pipeline.py +++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py @@ -1,22 +1,31 @@ -from typing import Any, List, Optional +from typing import List, Optional + +import gymnasium as gym -from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv class ModuleToEnvPipeline(ConnectorPipelineV2): - """The superclass for any module-to-env pipelines.""" - def __init__( self, *, - ctx: ConnectorContextV2, connectors: Optional[List[ConnectorV2]] = None, + input_observation_space: Optional[gym.Space], + input_action_space: Optional[gym.Space], + env: Optional[gym.Env] = None, + rl_module: Optional["RLModule"] = None, **kwargs, ): - super().__init__(ctx=ctx, connectors=connectors, **kwargs) + super().__init__( + connectors=connectors, + input_observation_space=input_observation_space, + input_action_space=input_action_space, + env=env, + rl_module=rl_module, + **kwargs, + ) # Add the default final connector piece for env-to-module pipelines: # Sampling actions from action_dist_inputs and add them to input, iff this has @@ -25,4 +34,9 @@ def __init__( len(self.connectors) == 0 or type(self.connectors[-1]) is not DefaultModuleToEnv ): - self.append(DefaultModuleToEnv(ctx=ctx)) + self.append(DefaultModuleToEnv( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + env=env, + rl_module=rl_module, + )) From a9acbee7dec50d2d6cbf948f2bce48d2b8f3c5a2 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 14 Dec 2023 15:54:00 +0100 Subject: [PATCH 07/15] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 116 +++++++++++++ rllib/connectors/connector_v2.py | 18 +- .../env_to_module/frame_stacking.py | 120 +++++++++++++ .../env_to_module/prev_action_prev_reward.py | 6 +- rllib/connectors/learner/frame_stacking.py | 8 + .../learner/learner_connector_pipeline.py | 2 +- rllib/env/wrappers/atari_wrappers.py | 20 ++- ..._CONNECTOR_EXAMPLES_TO_SEPARATE_FOLDER.txt | 0 .../connectors/connector_v2_frame_stacking.py | 164 ++++++++++++++++++ 9 files changed, 436 insertions(+), 18 deletions(-) create mode 100644 rllib/connectors/env_to_module/frame_stacking.py create mode 100644 rllib/connectors/learner/frame_stacking.py create mode 100644 rllib/examples/connectors/TODO_MOVE_OLD_CONNECTOR_EXAMPLES_TO_SEPARATE_FOLDER.txt create mode 100644 rllib/examples/connectors/connector_v2_frame_stacking.py diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 1ee0761ae49fd..4d3c7d4de7d8b 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -100,6 +100,7 @@ if TYPE_CHECKING: from ray.rllib.algorithms.algorithm import Algorithm + from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.core.learner import Learner from ray.rllib.evaluation.episode import Episode as OldEpisode @@ -327,6 +328,8 @@ def __init__(self, algo_class=None): 
self.num_envs_per_worker = 1 self.create_env_on_local_worker = False self.enable_connectors = True + self._env_to_module_connector = None + self._module_to_env_connector = None # TODO (sven): Rename into `sample_timesteps` (or `sample_duration` # and `sample_duration_unit` (replacing batch_mode), like we do it # in the evaluation config). @@ -374,6 +377,7 @@ def __init__(self, algo_class=None): except AttributeError: pass + self._learner_connector = None self.optimizer = {} self.max_requests_in_flight_per_sampler_worker = 2 self._learner_class = None @@ -1137,6 +1141,95 @@ class directly. Note that this arg can also be specified via logger_creator=self.logger_creator, ) + def build_env_to_module_connector(self, env): + custom_connectors = [] + + # Create an env-to-module connector pipeline (including RLlib's default + # env->module connector piece) and return it. + if self._env_to_module_connector is not None: + val_ = self._env_to_module_connector(env) + + from ray.rllib.connectors.connector_v2 import ConnectorV2 + from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 + + if ( + isinstance(val_, ConnectorV2) + and not isinstance(val_, ConnectorPipelineV2) + ): + custom_connectors = [val_] + else: + return val_ + + from ray.rllib.connectors.env_to_module.env_to_module_pipeline import ( + EnvToModulePipeline + ) + + return EnvToModulePipeline( + connectors=custom_connectors, + input_observation_space=env.single_observation_space, + input_action_space=env.single_action_space, + env=env, + ) + + def build_module_to_env_connector(self, env): + custom_connectors = [] + + # Create a module-to-env connector pipeline (including RLlib's default + # module->env connector piece) and return it. + if self._module_to_env_connector is not None: + val_ = self._module_to_env_connector(env) + + from ray.rllib.connectors.connector_v2 import ConnectorV2 + from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 + + if ( + isinstance(val_, ConnectorV2) + and not isinstance(val_, ConnectorPipelineV2) + ): + custom_connectors = [val_] + else: + return val_ + + from ray.rllib.connectors.module_to_env.module_to_env_pipeline import ( + ModuleToEnvPipeline + ) + + return ModuleToEnvPipeline( + connectors=custom_connectors, + input_observation_space=env.single_observation_space, + input_action_space=env.single_action_space, + env=env, + ) + + def build_learner_connector(self, input_observation_space, input_action_space): + custom_connectors = [] + + # Create a learner connector pipeline (including RLlib's default + # learner connector piece) and return it. 
+ if self._learner_connector is not None: + val_ = self._learner_connector(input_observation_space, input_action_space) + + from ray.rllib.connectors.connector_v2 import ConnectorV2 + from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 + + if ( + isinstance(val_, ConnectorV2) + and not isinstance(val_, ConnectorPipelineV2) + ): + custom_connectors = [val_] + else: + return val_ + + from ray.rllib.connectors.learner.learner_connector_pipeline import ( + LearnerConnectorPipeline + ) + + return LearnerConnectorPipeline( + connectors=custom_connectors, + input_observation_space=input_observation_space, + input_action_space=input_action_space, + ) + def python_environment( self, *, @@ -1477,6 +1570,12 @@ def rollouts( create_env_on_local_worker: Optional[bool] = NotProvided, sample_collector: Optional[Type[SampleCollector]] = NotProvided, enable_connectors: Optional[bool] = NotProvided, + env_to_module_connector: Optional[ + Callable[[EnvType], "ConnectorV2"] + ] = NotProvided, + module_to_env_connector: Optional[ + Callable[[EnvType, "RLModule"], "ConnectorV2"] + ] = NotProvided, use_worker_filter_stats: Optional[bool] = NotProvided, update_worker_filter_stats: Optional[bool] = NotProvided, rollout_fragment_length: Optional[Union[int, str]] = NotProvided, @@ -1522,6 +1621,11 @@ def rollouts( enable_connectors: Use connector based environment runner, so that all preprocessing of obs and postprocessing of actions are done in agent and action connectors. + env_to_module_connector: A callable taking an Env as input arg and returning + an env-to-module ConnectorV2 (might be a pipeline) object. + module_to_env_connector: A callable taking an Env and an RLModule as input + args and returning a module-to-env ConnectorV2 (might be a pipeline) + object. use_worker_filter_stats: Whether to use the workers in the WorkerSet to update the central filters (held by the local worker). If False, stats from the workers will not be used and discarded. @@ -1609,6 +1713,10 @@ def rollouts( self.create_env_on_local_worker = create_env_on_local_worker if enable_connectors is not NotProvided: self.enable_connectors = enable_connectors + if env_to_module_connector is not NotProvided: + self._env_to_module_connector = env_to_module_connector + if module_to_env_connector is not NotProvided: + self._module_to_env_connector = module_to_env_connector if use_worker_filter_stats is not NotProvided: self.use_worker_filter_stats = use_worker_filter_stats if update_worker_filter_stats is not NotProvided: @@ -1719,6 +1827,9 @@ def training( optimizer: Optional[dict] = NotProvided, max_requests_in_flight_per_sampler_worker: Optional[int] = NotProvided, learner_class: Optional[Type["Learner"]] = NotProvided, + learner_connector: Optional[ + Callable[["RLModule"], "ConnectorV2"] + ] = NotProvided, # Deprecated arg. _enable_learner_api: Optional[bool] = NotProvided, ) -> "AlgorithmConfig": @@ -1780,6 +1891,9 @@ def training( in your experiment of timesteps. learner_class: The `Learner` class to use for (distributed) updating of the RLModule. Only used when `_enable_new_api_stack=True`. + learner_connector: A callable taking an env observation space and an env + action space as inputs and returning a learner ConnectorV2 (might be + a pipeline) object. Returns: This updated AlgorithmConfig object. 
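As a usage sketch (hedged; it relies only on the hooks introduced in this patch plus the PrevRewardPrevActionEnvToModule piece added earlier in the series): a callable passed to rollouts(env_to_module_connector=...) may return a single piece, which build_env_to_module_connector() wraps into an EnvToModulePipeline and extends with the default piece, or a ready-made ConnectorPipelineV2, which is returned as-is.

from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.connectors.env_to_module.prev_action_prev_reward import (
    PrevRewardPrevActionEnvToModule,
)


def _env_to_module(env):
    # Return a single piece; RLlib wraps it into a pipeline and appends
    # the default env-to-module connector behind it.
    return PrevRewardPrevActionEnvToModule(
        input_observation_space=env.single_observation_space,
        input_action_space=env.single_action_space,
        env=env,
        n_prev_actions=2,
        n_prev_rewards=2,
    )


config = PPOConfig().rollouts(env_to_module_connector=_env_to_module)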
@@ -1824,6 +1938,8 @@ def training( ) if learner_class is not NotProvided: self._learner_class = learner_class + if learner_connector is not NotProvided: + self._learner_connector = learner_connector return self diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index b201c804ca2d0..89e4dca793305 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -79,19 +79,14 @@ def action_space(self, value): def __init__( self, *, - input_observation_space: Optional[gym.Space], - input_action_space: Optional[gym.Space], + input_observation_space: gym.Space, + input_action_space: gym.Space, env: Optional[gym.Env] = None, - #rl_module: Optional["RLModule"] = None, **kwargs, ): """Initializes a ConnectorV2 instance. Args: - env: An optional env object that the connector might need to know about. - Note that normally, env-to-module and module-to-env connectors get this - information at construction time, but learner connectors won't (b/c - Learner objects don't carry an environment object). input_observation_space: The (mandatory) input observation space. This is the space coming from a previous connector piece in the (env-to-module or learner) pipeline or it is directly defined within @@ -100,16 +95,15 @@ def __init__( is the space coming from a previous connector piece in the (module-to-env) pipeline or it is directly defined within the used gym.Env. - #rl_module: An optional RLModule object that the connector might need to know - # about. Note that normally, only module-to-env connectors get this - # information at construction time, but env-to-module and learner - # connectors won't (b/c they get constructed before the RLModule). + env: An optional env object that the connector might need to know about. + Note that normally, env-to-module and module-to-env connectors get this + information at construction time, but learner connectors won't (b/c + Learner objects don't carry an environment object). **kwargs: Forward API-compatibility kwargs. """ self.input_observation_space = input_observation_space self.input_action_space = input_action_space self.env = env - #self.rl_module = rl_module @abc.abstractmethod def __call__( diff --git a/rllib/connectors/env_to_module/frame_stacking.py b/rllib/connectors/env_to_module/frame_stacking.py new file mode 100644 index 0000000000000..7d2f2012dc78e --- /dev/null +++ b/rllib/connectors/env_to_module/frame_stacking.py @@ -0,0 +1,120 @@ +from functools import partial +import numpy as np +from typing import Any, List, Optional + +import gymnasium as gym +import tree # pip install dm_tree + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.spaces.space_utils import batch, get_base_struct_from_space +from ray.rllib.utils.typing import EpisodeType + + +class _FrameStackingConnector(ConnectorV2): + """A connector piece that stacks the previous n observations into one.""" + + def __init__( + self, + *, + # Base class constructor args. + input_observation_space: gym.Space, + input_action_space: gym.Space, + env: Optional[gym.Env] = None, + # Specific framestacking args. + num_frames: int = 1, + as_learner_connector: bool = False, + **kwargs, + ): + """Initializes a _FrameStackingConnector instance. 
+
+        Args:
+            num_frames: The number of observation frames to stack up (into a single
+                observation) for the RLModule's forward pass.
+            as_learner_connector: Whether this connector is part of a Learner connector
+                pipeline, as opposed to an env-to-module pipeline.
+        """
+        super().__init__(
+            input_observation_space=input_observation_space,
+            input_action_space=input_action_space,
+            env=env,
+            **kwargs,
+        )
+
+        self.num_frames = num_frames
+        self.as_learner_connector = as_learner_connector
+
+        # Some assumptions: Space is box AND last dim (the stacking one) is 1.
+        assert isinstance(self.observation_space, gym.spaces.Box)
+        assert self.observation_space.shape[-1] == 1
+
+        # Change our observation space according to the given stacking settings.
+        self.observation_space = gym.spaces.Box(
+            low=np.repeat(
+                self.observation_space.low, repeats=self.num_frames, axis=-1
+            ),
+            high=np.repeat(
+                self.observation_space.high, repeats=self.num_frames, axis=-1
+            ),
+            shape=list(self.observation_space.shape)[:-1] + [self.num_frames],
+            dtype=self.observation_space.dtype,
+        )
+
+    @override(ConnectorV2)
+    def __call__(
+        self,
+        *,
+        rl_module: RLModule,
+        input_: Optional[Any],
+        episodes: List[EpisodeType],
+        explore: Optional[bool] = None,
+        persistent_data: Optional[dict] = None,
+        **kwargs,
+    ) -> Any:
+        # This is a data-in-data-out connector, so we expect `input_` to be a dict
+        # with: key=column name, e.g. "obs" and value=[data to be processed by RLModule].
+        # We will add to `input_` the last n observations.
+
+        obs = []
+        for episode in episodes:
+
+            # Learner connector pipeline. Episodes have been finalized/numpy'ized.
+            if self.as_learner_connector:
+                # Loop through each timestep in the episode and add the previous n
+                # observations (based on that timestep) to the batch.
+                for ts in range(len(episode)):
+                    obs.append(
+                        episode.get_observations(
+                            # Extract n observations from `ts` to `ts - n`
+                            # (excluding `ts - n`).
+                            indices=slice(ts - self.num_frames + 1, ts + 1),
+                            # Make sure negative indices are NOT interpreted as "counting
+                            # from the end", but as absolute indices meaning they refer
+                            # to timesteps before 0 (which is the lookback buffer).
+                            neg_indices_left_of_zero=True,
+                            # In case we are at the very beginning of the episode, e.g.
+                            # ts==0, fill the left side with zero-observations.
+                            fill=0.0,
+                        )
+                    )
+            # Env-to-module pipeline. Episodes still operate on lists.
+            else:
+                assert not episode.is_finalized
+                obs.append(
+                    batch(
+                        episode.get_observations(
+                            indices=slice(-self.num_frames + 1, None),
+                            fill=0.0,
+                        )
+                    )
+                )
+
+        input_[SampleBatch.OBS] = batch(obs)
+        return input_
+
+
+FrameStackingEnvToModule = partial(
+    _FrameStackingConnector, as_learner_connector=False
+)
diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py
index cf11edba10298..cb381b6e5e466 100644
--- a/rllib/connectors/env_to_module/prev_action_prev_reward.py
+++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py
@@ -19,8 +19,8 @@ def __init__(
         self,
         *,
         # Base class constructor args.
-        input_observation_space: Optional[gym.Space],
-        input_action_space: Optional[gym.Space],
+        input_observation_space: gym.Space,
+        input_action_space: gym.Space,
         env: Optional[gym.Env] = None,
         # Specific prev. r/a args.
         n_prev_actions: int = 1,
@@ -28,7 +28,7 @@
         as_learner_connector: bool = False,
         **kwargs,
     ):
-        """Initializes a PrevRewardPrevActionConnector instance.
+        """Initializes a _PrevRewardPrevActionConnector instance.
Args: n_prev_actions: The number of previous actions to include in the output diff --git a/rllib/connectors/learner/frame_stacking.py b/rllib/connectors/learner/frame_stacking.py new file mode 100644 index 0000000000000..4eb0c09bd6e41 --- /dev/null +++ b/rllib/connectors/learner/frame_stacking.py @@ -0,0 +1,8 @@ +from functools import partial + +from ray.rllib.connectors.env_to_module.frame_stacking import _FrameStackingConnector + + +FrameStackingLearner = partial( + _FrameStackingConnector, as_learner_connector=True +) diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py index 78223f2c92f0e..acc9a9a1946a2 100644 --- a/rllib/connectors/learner/learner_connector_pipeline.py +++ b/rllib/connectors/learner/learner_connector_pipeline.py @@ -9,7 +9,7 @@ ) -class LearnerPipeline(ConnectorPipelineV2): +class LearnerConnectorPipeline(ConnectorPipelineV2): def __init__( self, *, diff --git a/rllib/env/wrappers/atari_wrappers.py b/rllib/env/wrappers/atari_wrappers.py index 0dfd74729efae..2919685cf6bc5 100644 --- a/rllib/env/wrappers/atari_wrappers.py +++ b/rllib/env/wrappers/atari_wrappers.py @@ -240,6 +240,22 @@ def reset(self, **kwargs): return self.env.reset(**kwargs) +class NormalizedImageEnv(gym.ObservationWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.observation_space = gym.spaces.Box( + -1.0, + 1.0, + shape=self.observation_space.shape, + dtype=np.float32, + ) + + # Divide by scale and center around 0.0, such that observations are in the range + # of -1.0 and 1.0. + def observation(self, observation): + return (observation.astype(np.float32) / 128.0) - 1.0 + + @PublicAPI class WarpFrame(gym.ObservationWrapper): def __init__(self, env, dim): @@ -266,8 +282,8 @@ def __init__(self, env, k): self.frames = deque([], maxlen=k) shp = env.observation_space.shape self.observation_space = spaces.Box( - low=0, - high=255, + low=np.repeat(env.observation_space.low, repeats=k, axis=-1), + high=np.repeat(env.observation_space.high, repeats=k, axis=-1), shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype, ) diff --git a/rllib/examples/connectors/TODO_MOVE_OLD_CONNECTOR_EXAMPLES_TO_SEPARATE_FOLDER.txt b/rllib/examples/connectors/TODO_MOVE_OLD_CONNECTOR_EXAMPLES_TO_SEPARATE_FOLDER.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py new file mode 100644 index 0000000000000..e2227fbb61be6 --- /dev/null +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -0,0 +1,164 @@ +import argparse +from functools import partial +import os + +import gymnasium as gym + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule +from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner +from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner +from ray.rllib.env.wrappers.atari_wrappers import ( + EpisodicLifeEnv, + # FrameStack, # <- we do not want env-based frame stacking + MaxAndSkipEnv, + NoopResetEnv, + NormalizedImageEnv, + WarpFrame, # gray + resize +) +from ray.rllib.utils.test_utils import check_learning_achieved +from ray import tune + + +parser = argparse.ArgumentParser() +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + 
help="The DL framework specifier.", +) +parser.add_argument( + "--num-frames", + type=int, + default=4, + help="The number of observation frames to stack.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=2000, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=1000000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=400.0, help="Reward at which we stop training." +) + + +if __name__ == "__main__": + import ray + from ray import air, tune + + args = parser.parse_args() + + ray.init() + + # Define our custom connector pipelines. + def _make_env_to_module_connector(env): + # Create the env-to-module connector. We return an individual connector piece + # here, which RLlib will then automatically integrate into a pipeline (and + # add its default connector piece to the end of that pipeline). + return FrameStackingEnvToModule( + input_observation_space=env.single_observation_space, + input_action_space=env.single_action_space, + env=env, + num_frames=args.num_frames, + ) + + def _make_learner_connector(input_observation_space, input_action_space): + # Create the learner connector. + return FrameStackingLearner( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + num_frames=args.num_frames, + ) + + # Create a custom Atari setup (w/o the usual Rllib-hard-coded framestacking in it). + # We would like our frame stacking connector to do this job. + tune.register_env( + "env", + ( + lambda cfg: ( + EpisodicLifeEnv( # each life is one episode + MaxAndSkipEnv( # frameskip=4 and take max over these 4 frames + NoopResetEnv( # perform n noops after a reset + # partial(FrameStack, k=4)( # <- no env-based framestacking! + NormalizedImageEnv( + partial(WarpFrame, dim=64)( # grayscale + resize + partial(gym.wrappers.TimeLimit, max_episode_steps=108000)( + gym.make("ALE/Pong-v5", **dict( + cfg, **{"render_mode": "rgb_array"} + )) + ) + ))))) + ) + ), + ) + + config = ( + PPOConfig() + .framework(args.framework) + .environment( + "env", + env_config={ + # Make analogous to old v4 + NoFrameskip. + "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + }, + clip_rewards=True, + ) + # Use new API stack ... + .experimental(_enable_new_api_stack=True) + .rollouts( + # ... new EnvRunner and our frame stacking env-to-module connector. + env_runner_cls=SingleAgentEnvRunner, + env_to_module_connector=_make_env_to_module_connector, + ) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + .training( + # Use our frame stacking learner connector. 
+ learner_connector=_make_learner_connector, + + lambda_=0.95, + kl_coeff=0.5, + clip_param=0.1, + vf_clip_param=10.0, + entropy_coeff=0.01, + num_sgd_iter=10, + lr=0.00025, # needs to be adjusted: `lr=0.00025*num_learner_workers` + grad_clip=100.0, + grad_clip_by="global_norm", + model={ + "vf_share_layers": True, + "conv_filters": [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], + "conv_activation": "relu", + "post_fcnet_hiddens": [256], + }, + ) + ) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + tuner = tune.Tuner( + config.algo_class, + param_space=config, + run_config=air.RunConfig(stop=stop), + tune_config=tune.TuneConfig(num_samples=1), + ) + results = tuner.fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() From 5fe97e1aab7f3af1169a0527eae2129b82a311f8 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 14 Dec 2023 15:55:47 +0100 Subject: [PATCH 08/15] LINT Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 21 ++++++-------- rllib/connectors/connector_pipeline_v2.py | 10 ++++--- .../env_to_module/env_to_module_pipeline.py | 12 ++++---- .../env_to_module/frame_stacking.py | 28 ++++++++---------- .../env_to_module/prev_action_prev_reward.py | 22 +++++++------- rllib/connectors/learner/frame_stacking.py | 4 +-- .../learner/learner_connector_pipeline.py | 10 ++++--- .../module_to_env/module_to_env_pipeline.py | 14 +++++---- .../connectors/connector_v2_frame_stacking.py | 29 +++++++++++-------- 9 files changed, 77 insertions(+), 73 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 4d3c7d4de7d8b..0c18a4dfc3fd9 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1152,16 +1152,15 @@ def build_env_to_module_connector(self, env): from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 - if ( - isinstance(val_, ConnectorV2) - and not isinstance(val_, ConnectorPipelineV2) + if isinstance(val_, ConnectorV2) and not isinstance( + val_, ConnectorPipelineV2 ): custom_connectors = [val_] else: return val_ from ray.rllib.connectors.env_to_module.env_to_module_pipeline import ( - EnvToModulePipeline + EnvToModulePipeline, ) return EnvToModulePipeline( @@ -1182,16 +1181,15 @@ def build_module_to_env_connector(self, env): from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 - if ( - isinstance(val_, ConnectorV2) - and not isinstance(val_, ConnectorPipelineV2) + if isinstance(val_, ConnectorV2) and not isinstance( + val_, ConnectorPipelineV2 ): custom_connectors = [val_] else: return val_ from ray.rllib.connectors.module_to_env.module_to_env_pipeline import ( - ModuleToEnvPipeline + ModuleToEnvPipeline, ) return ModuleToEnvPipeline( @@ -1212,16 +1210,15 @@ def build_learner_connector(self, input_observation_space, input_action_space): from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 - if ( - isinstance(val_, ConnectorV2) - and not isinstance(val_, ConnectorPipelineV2) + if isinstance(val_, ConnectorV2) and not isinstance( + val_, ConnectorPipelineV2 ): custom_connectors = [val_] else: return val_ from ray.rllib.connectors.learner.learner_connector_pipeline import ( - LearnerConnectorPipeline + 
LearnerConnectorPipeline, ) return LearnerConnectorPipeline( diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index 7f9336fe710d2..331e6294e5e58 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -8,7 +8,7 @@ from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv from ray.rllib.connectors.learner.default_learner_connector import ( - DefaultLearnerConnector + DefaultLearnerConnector, ) from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.utils.annotations import override @@ -81,7 +81,9 @@ def remove(self, name_or_class: Union[str, Type]): if idx >= 0: del self.connectors[idx] self._fix_input_output_types() - logger.info(f"Removed connector {name_or_class} from {self.__class__.__name__}.") + logger.info( + f"Removed connector {name_or_class} from {self.__class__.__name__}." + ) else: logger.warning( f"Trying to remove a non-existent connector {name_or_class}." @@ -268,8 +270,8 @@ def _fix_input_output_types(self): if len(self.connectors) > 0: self.input_type = self.connectors[0].input_type self.output_type = self.connectors[-1].output_type - #self.observation_space = self.connectors[-1].observation_space - #self.action_space = self.connectors[-1].action_space + # self.observation_space = self.connectors[-1].observation_space + # self.action_space = self.connectors[-1].action_space else: self.input_type = None self.output_type = None diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py index e5b81c254589d..b0f1027799a9f 100644 --- a/rllib/connectors/env_to_module/env_to_module_pipeline.py +++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py @@ -36,11 +36,13 @@ def __init__( len(self.connectors) == 0 or type(self.connectors[-1]) is not DefaultEnvToModule ): - self.append(DefaultEnvToModule( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - env=env, - )) + self.append( + DefaultEnvToModule( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + env=env, + ) + ) @override(ConnectorPipelineV2) def __call__( diff --git a/rllib/connectors/env_to_module/frame_stacking.py b/rllib/connectors/env_to_module/frame_stacking.py index 7d2f2012dc78e..090d9fcda2205 100644 --- a/rllib/connectors/env_to_module/frame_stacking.py +++ b/rllib/connectors/env_to_module/frame_stacking.py @@ -17,16 +17,16 @@ class _FrameStackingConnector(ConnectorV2): """A connector piece that stacks the previous n observations into one.""" def __init__( - self, - *, - # Base class constructor args. - input_observation_space: gym.Space, - input_action_space: gym.Space, - env: Optional[gym.Env] = None, - # Specific framestacking args. - num_frames: int = 1, - as_learner_connector: bool = False, - **kwargs, + self, + *, + # Base class constructor args. + input_observation_space: gym.Space, + input_action_space: gym.Space, + env: Optional[gym.Env] = None, + # Specific framestacking args. + num_frames: int = 1, + as_learner_connector: bool = False, + **kwargs, ): """Initializes a _FrameStackingConnector instance. @@ -52,9 +52,7 @@ def __init__( # Change our observation space according to the given stacking settings. 
self.observation_space = gym.spaces.Box( - low=np.repeat( - self.observation_space.low, repeats=self.num_frames, axis=-1 - ), + low=np.repeat(self.observation_space.low, repeats=self.num_frames, axis=-1), high=np.repeat( self.observation_space.high, repeats=self.num_frames, axis=-1 ), @@ -115,6 +113,4 @@ def __call__( return input_ -FrameStackingEnvToModule = partial( - _FrameStackingConnector, as_learner_connector=False -) +FrameStackingEnvToModule = partial(_FrameStackingConnector, as_learner_connector=False) diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py index cb381b6e5e466..a7284dd582377 100644 --- a/rllib/connectors/env_to_module/prev_action_prev_reward.py +++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py @@ -16,17 +16,17 @@ class _PrevRewardPrevActionConnector(ConnectorV2): """A connector piece that adds previous rewards and actions to the input.""" def __init__( - self, - *, - # Base class constructor args. - input_observation_space: gym.Space, - input_action_space: gym.Space, - env: Optional[gym.Env] = None, - # Specific prev. r/a args. - n_prev_actions: int = 1, - n_prev_rewards: int = 1, - as_learner_connector: bool = False, - **kwargs, + self, + *, + # Base class constructor args. + input_observation_space: gym.Space, + input_action_space: gym.Space, + env: Optional[gym.Env] = None, + # Specific prev. r/a args. + n_prev_actions: int = 1, + n_prev_rewards: int = 1, + as_learner_connector: bool = False, + **kwargs, ): """Initializes a _PrevRewardPrevActionConnector instance. diff --git a/rllib/connectors/learner/frame_stacking.py b/rllib/connectors/learner/frame_stacking.py index 4eb0c09bd6e41..f53a62bd6a726 100644 --- a/rllib/connectors/learner/frame_stacking.py +++ b/rllib/connectors/learner/frame_stacking.py @@ -3,6 +3,4 @@ from ray.rllib.connectors.env_to_module.frame_stacking import _FrameStackingConnector -FrameStackingLearner = partial( - _FrameStackingConnector, as_learner_connector=True -) +FrameStackingLearner = partial(_FrameStackingConnector, as_learner_connector=True) diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py index acc9a9a1946a2..766654815ce4c 100644 --- a/rllib/connectors/learner/learner_connector_pipeline.py +++ b/rllib/connectors/learner/learner_connector_pipeline.py @@ -36,7 +36,9 @@ def __init__( len(self.connectors) == 0 or type(self.connectors[-1]) is not DefaultLearnerConnector ): - self.append(DefaultLearnerConnector( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - )) + self.append( + DefaultLearnerConnector( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + ) + ) diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py index a9621c3162c90..130ed813f6f78 100644 --- a/rllib/connectors/module_to_env/module_to_env_pipeline.py +++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py @@ -34,9 +34,11 @@ def __init__( len(self.connectors) == 0 or type(self.connectors[-1]) is not DefaultModuleToEnv ): - self.append(DefaultModuleToEnv( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - env=env, - rl_module=rl_module, - )) + self.append( + DefaultModuleToEnv( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + env=env, + rl_module=rl_module, + ) + ) 
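To illustrate the append logic shared by all three pipeline classes (a sketch under the constructor signatures from this patch series): even a pipeline built with no custom pieces ends up with its default piece as the final entry.

import gymnasium as gym

from ray.rllib.connectors.env_to_module.env_to_module_pipeline import (
    EnvToModulePipeline,
)

env = gym.make("CartPole-v1")
pipeline = EnvToModulePipeline(
    connectors=[],  # no custom pieces given
    input_observation_space=env.observation_space,
    input_action_space=env.action_space,
    env=env,
)
# The constructor appended RLlib's default env-to-module piece.
assert type(pipeline.connectors[-1]).__name__ == "DefaultEnvToModule"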
diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py index e2227fbb61be6..93609f22cbd4f 100644 --- a/rllib/examples/connectors/connector_v2_frame_stacking.py +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -17,7 +17,6 @@ WarpFrame, # gray + resize ) from ray.rllib.utils.test_utils import check_learning_achieved -from ray import tune parser = argparse.ArgumentParser() @@ -86,17 +85,24 @@ def _make_learner_connector(input_observation_space, input_action_space): ( lambda cfg: ( EpisodicLifeEnv( # each life is one episode - MaxAndSkipEnv( # frameskip=4 and take max over these 4 frames - NoopResetEnv( # perform n noops after a reset - # partial(FrameStack, k=4)( # <- no env-based framestacking! - NormalizedImageEnv( - partial(WarpFrame, dim=64)( # grayscale + resize - partial(gym.wrappers.TimeLimit, max_episode_steps=108000)( - gym.make("ALE/Pong-v5", **dict( - cfg, **{"render_mode": "rgb_array"} - )) + MaxAndSkipEnv( # frameskip=4 and take max over these 4 frames + NoopResetEnv( # perform n noops after a reset + # partial(FrameStack, k=4)( # <- no env-based framestacking! + NormalizedImageEnv( + partial(WarpFrame, dim=64)( # grayscale + resize + partial( + gym.wrappers.TimeLimit, max_episode_steps=108000 + )( + gym.make( + "ALE/Pong-v5", + **dict(cfg, **{"render_mode": "rgb_array"}) + ) + ) + ) + ) + ) + ) ) - ))))) ) ), ) @@ -125,7 +131,6 @@ def _make_learner_connector(input_observation_space, input_action_space): .training( # Use our frame stacking learner connector. learner_connector=_make_learner_connector, - lambda_=0.95, kl_coeff=0.5, clip_param=0.1, From 213f0d122efe16325cd9275d32bc3b05f401eed1 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 14 Dec 2023 16:18:27 +0100 Subject: [PATCH 09/15] LINT Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 1 + rllib/connectors/connector_pipeline_v2.py | 15 +-- rllib/connectors/connector_v2.py | 2 +- .../env_to_module/frame_stacking.py | 14 +-- .../env_to_module/prev_action_prev_reward.py | 13 ++- .../learner/learner_connector_pipeline.py | 3 +- .../module_to_env/module_to_env_pipeline.py | 3 +- .../tests/test_from_module_connectors.py | 106 ------------------ .../connectors/connector_v2_frame_stacking.py | 2 +- 9 files changed, 27 insertions(+), 132 deletions(-) delete mode 100644 rllib/connectors/tests/test_from_module_connectors.py diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 0c18a4dfc3fd9..4c181be809ea8 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -102,6 +102,7 @@ from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.core.learner import Learner + from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.evaluation.episode import Episode as OldEpisode logger = logging.getLogger(__name__) diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index 331e6294e5e58..893dfcb57b49e 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -1,15 +1,8 @@ from collections import defaultdict import logging -from typing import Any, List, Optional, Union - -import gymnasium as gym +from typing import Any, Dict, List, Optional, Type, Union from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.connectors.env_to_module.default_env_to_module import 
DefaultEnvToModule -from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv -from ray.rllib.connectors.learner.default_learner_connector import ( - DefaultLearnerConnector, -) from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import EpisodeType @@ -215,7 +208,7 @@ def set_state(self, state: Dict[str, Any]): raise e return ConnectorPipelineV2(ctx, connectors) - def __str__(self, indentation: int = 0): + def __repr__(self, indentation: int = 0): return "\n".join( [" " * indentation + self.__class__.__name__] + [c.__str__(indentation + 4) for c in self.connectors] @@ -270,6 +263,10 @@ def _fix_input_output_types(self): if len(self.connectors) > 0: self.input_type = self.connectors[0].input_type self.output_type = self.connectors[-1].output_type + # TODO (sven): Create some examples for pipelines, in which the spaces + # are changed several times by the individual pieces. + # self.input_observation_space = self.connectors[0].input_observation_space + # self.input_action_space = self.connectors[0].input_action_space # self.observation_space = self.connectors[-1].observation_space # self.action_space = self.connectors[-1].action_space else: diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index 89e4dca793305..2ce20dd871b6f 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -1,5 +1,5 @@ import abc -from typing import Any, List, Optional +from typing import Any, Dict, List, Optional import gymnasium as gym diff --git a/rllib/connectors/env_to_module/frame_stacking.py b/rllib/connectors/env_to_module/frame_stacking.py index 090d9fcda2205..c6ac262da0ae7 100644 --- a/rllib/connectors/env_to_module/frame_stacking.py +++ b/rllib/connectors/env_to_module/frame_stacking.py @@ -3,13 +3,12 @@ from typing import Any, List, Optional import gymnasium as gym -import tree # pip install dm_tree from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override -from ray.rllib.utils.spaces.space_utils import batch, get_base_struct_from_space +from ray.rllib.utils.spaces.space_utils import batch from ray.rllib.utils.typing import EpisodeType @@ -72,8 +71,8 @@ def __call__( **kwargs, ) -> Any: # This is a data-in-data-out connector, so we expect `input_` to be a dict - # with: key=column name, e.g. "obs" and value=[data to be processed by RLModule]. - # We will add to `input_` the last n observations. + # with: key=column name, e.g. "obs" and value=[data to be processed by + # RLModule]. We will add to `input_` the last n observations. obs = [] for episode in episodes: @@ -88,9 +87,10 @@ def __call__( # Extract n observations from `ts` to `ts - n` # (excluding `ts - n`). indices=slice(ts - self.num_frames + 1, ts + 1), - # Make sure negative indices are NOT interpreted as "counting - # from the end", but as absolute indices meaning they refer - # to timesteps before 0 (which is the lookback buffer). + # Make sure negative indices are NOT interpreted as + # "counting from the end", but as absolute indices meaning + # they refer to timesteps before 0 (which is the lookback + # buffer). neg_indices_left_of_zero=True, # In case we are at the very beginning of the episode, e.g. # ts==0, fill the left side with zero-observations. 
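The neg_indices_left_of_zero / fill semantics that the comments above describe can be pictured with plain numpy; lookback_get below is a hypothetical stand-in for episode.get_observations(...), not an actual episode method.

import numpy as np


def lookback_get(seq, start, stop, fill=0.0):
    # Indices < 0 address timesteps *before* the episode start (the lookback
    # buffer) and are zero-filled here; they do NOT count from the end.
    return np.array([seq[i] if i >= 0 else fill for i in range(start, stop)])


obs = [10.0, 11.0, 12.0, 13.0]
num_frames, ts = 4, 1
# At ts=1, two of the four requested frames lie before the episode start:
stacked = lookback_get(obs, ts - num_frames + 1, ts + 1)
print(stacked)  # -> [ 0.  0. 10. 11.]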
diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py index a7284dd582377..7f0caea909e29 100644 --- a/rllib/connectors/env_to_module/prev_action_prev_reward.py +++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py @@ -62,9 +62,9 @@ def __call__( **kwargs, ) -> Any: # This is a data-in-data-out connector, so we expect `input_` to be a dict - # with: key=column name, e.g. "obs" and value=[data to be processed by RLModule]. - # We will just extract the most recent rewards and/or most recent actions from - # all episodes and store them inside the `input_` data dict. + # with: key=column name, e.g. "obs" and value=[data to be processed by + # RLModule]. We will just extract the most recent rewards and/or most recent + # actions from all episodes and store them inside the `input_` data dict. prev_a = [] prev_r = [] @@ -82,9 +82,10 @@ def __call__( episode.get_actions( # Extract n actions from `ts - n` to `ts` (excluding `ts`). indices=slice(ts - self.n_prev_actions, ts), - # Make sure negative indices are NOT interpreted as "counting - # from the end", but as absolute indices meaning they refer - # to timesteps before 0 (which is the lookback buffer). + # Make sure negative indices are NOT interpreted as + # "counting from the end", but as absolute indices meaning + # they refer to timesteps before 0 (which is the lookback + # buffer). neg_indices_left_of_zero=True, # In case we are at the very beginning of the episode, e.g. # ts==0, fill the left side with zero-actions. diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py index 766654815ce4c..88a1ad49c02d1 100644 --- a/rllib/connectors/learner/learner_connector_pipeline.py +++ b/rllib/connectors/learner/learner_connector_pipeline.py @@ -7,6 +7,7 @@ from ray.rllib.connectors.learner.default_learner_connector import ( DefaultLearnerConnector, ) +from ray.rllib.core.rl_module.rl_module import RLModule class LearnerConnectorPipeline(ConnectorPipelineV2): @@ -17,7 +18,7 @@ def __init__( input_observation_space: Optional[gym.Space], input_action_space: Optional[gym.Space], env: Optional[gym.Env] = None, - rl_module: Optional["RLModule"] = None, + rl_module: Optional[RLModule] = None, **kwargs, ): super().__init__( diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py index 130ed813f6f78..2abcecf439d57 100644 --- a/rllib/connectors/module_to_env/module_to_env_pipeline.py +++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py @@ -4,6 +4,7 @@ from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 +from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv @@ -15,7 +16,7 @@ def __init__( input_observation_space: Optional[gym.Space], input_action_space: Optional[gym.Space], env: Optional[gym.Env] = None, - rl_module: Optional["RLModule"] = None, + rl_module: Optional[RLModule] = None, **kwargs, ): super().__init__( diff --git a/rllib/connectors/tests/test_from_module_connectors.py b/rllib/connectors/tests/test_from_module_connectors.py deleted file mode 100644 index ac0844ff46f0f..0000000000000 --- a/rllib/connectors/tests/test_from_module_connectors.py +++ /dev/null @@ -1,106 +0,0 @@ -import unittest - -import gymnasium as gym -import numpy as np - -from 
ray.rllib.connectors.into_env.clip_actions import ClipActions -from ray.rllib.connectors.into_env.unsquash_actions import UnsquashActions -from ray.rllib.connectors.connector import ConnectorContextV2 -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.test_utils import check - - -class TestFromModuleConnectors(unittest.TestCase): - def test_connector_pipeline(self): - ctx = ConnectorContext() - connectors = [ConvertToNumpyConnector(ctx)] - pipeline = ActionConnectorPipeline(ctx, connectors) - name, params = pipeline.serialize() - restored = get_connector(name, ctx, params) - self.assertTrue(isinstance(restored, ActionConnectorPipeline)) - self.assertTrue(isinstance(restored.connectors[0], ConvertToNumpyConnector)) - # There should not be any timer yet - self.assertFalse(bool(pipeline.timers.values())) - pipeline(ActionConnectorDataType(0, 0, {}, ([1], [], None))) - # After a first input, there should be one timer - self.assertEquals(len(pipeline.timers.values()), 1) - - def test_clip_actions_connector(self): - ctx = ConnectorContextV2() - - connector = ClipActions( - action_space=gym.spaces.Box(low=0.0, high=6.0, shape=(1,)) - ) - - # name, params = connector.serialize() - # self.assertEqual(name, "ClipActions") - - # restored = get_connector(name, ctx, params) - # self.assertTrue(isinstance(restored, ClipActionsConnector)) - - for action in [8.8, 6.0, -0.2, 0.0, 5.9999, 3.2, 6.1]: - output = connector( - {SampleBatch.ACTIONS: np.array([action])}, - ctx, - ) - check(output[SampleBatch.ACTIONS], np.clip(action, 0.0, 6.0)) - - connector = ClipActions( - action_space=gym.spaces.Dict( - { - "a": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,)), - "b": gym.spaces.Discrete(3), - } - ) - ) - for action in [ - {"a": np.array([8.8, 8.9]), "b": 1}, - {"a": np.array([9.0, -1.0]), "b": 0}, - {"a": np.array([100.0, 200.0]), "b": 2}, - {"a": np.array([-1000, 0.0001]), "b": 2}, - {"a": np.array([0.4, 1.2]), "b": 0}, - {"a": np.array([1.0, -1.0]), "b": 1}, - ]: - output = connector({SampleBatch.ACTIONS: action}, ctx) - check( - output[SampleBatch.ACTIONS], - {"a": np.clip(action["a"], -1.0, 1.0), "b": action["b"]}, - ) - - def test_unsquash_actions_connector(self): - ctx = ConnectorContextV2() - - connector = UnsquashActions( - action_space=gym.spaces.Box(low=-2.0, high=6.0, shape=(2,)) - ) - - # name, params = connector.serialize() - # self.assertEqual(name, "UnsquashActions") - - # restored = get_connector(name, ctx, params) - # self.assertTrue(isinstance(restored, NormalizeActionsConnector)) - - for action in [ - [1.8, 1.8], - [1.0, -1.0], - [-1.0, 1.1], - [0.0, 0.0], - [10.0, 0.5], - [0.5, -0.5], - ]: - action = np.array(action) - output = connector( - {SampleBatch.ACTIONS: action}, - ctx, - ) - check( - output[SampleBatch.ACTIONS], - np.clip((action + 1.0) * 4.0 - 2.0, -2.0, 6.0), - ) - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py index 93609f22cbd4f..ab45623a5562a 100644 --- a/rllib/examples/connectors/connector_v2_frame_stacking.py +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -87,7 +87,7 @@ def _make_learner_connector(input_observation_space, input_action_space): EpisodicLifeEnv( # each life is one episode MaxAndSkipEnv( # frameskip=4 and take max over these 4 frames NoopResetEnv( # perform n noops after a reset - # partial(FrameStack, k=4)( # <- no env-based 
framestacking! + # partial(FrameStack, k=4)( # <- no env-based framestacking NormalizedImageEnv( partial(WarpFrame, dim=64)( # grayscale + resize partial( From 49585974b9061691eda5c1ca2eefe7c2e5ada582 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 18 Dec 2023 18:36:46 +0100 Subject: [PATCH 10/15] wip Signed-off-by: sven1977 --- rllib/connectors/connector_pipeline_v2.py | 12 +- rllib/connectors/connector_v2.py | 130 ++++++++++++++++------ 2 files changed, 101 insertions(+), 41 deletions(-) diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index 893dfcb57b49e..ce04db32a71b7 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -197,7 +197,7 @@ def get_state(self): return ConnectorPipelineV2.__name__, children @override(ConnectorV2) - def set_state(self, state: Dict[str, Any]): + def set_state(self, state: Dict[str, Any]) -> None: connectors = [] for state in params: try: @@ -265,10 +265,12 @@ def _fix_input_output_types(self): self.output_type = self.connectors[-1].output_type # TODO (sven): Create some examples for pipelines, in which the spaces # are changed several times by the individual pieces. - # self.input_observation_space = self.connectors[0].input_observation_space - # self.input_action_space = self.connectors[0].input_action_space - # self.observation_space = self.connectors[-1].observation_space - # self.action_space = self.connectors[-1].action_space + self.input_observation_space = self.connectors[0].input_observation_space + self.input_action_space = self.connectors[0].input_action_space + self._observation_space = self.connectors[-1].observation_space + self._action_space = self.connectors[-1].action_space else: self.input_type = None self.output_type = None + self._observation_space = None + self._action_space = None diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index 2ce20dd871b6f..c0796d8b94a0f 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -31,24 +31,28 @@ class ConnectorV2(abc.ABC): connectors used in Learner pipelines). From this input data, a ConnectorV2 then performs a transformation step. - There are 3 types of pipelines a ConnectorV2 can belong to: - 1) env-to-module: The connector transforms envrionment data before it gets to the - RLModule. This type of pipeline is used by an EnvRunner for transforming - env output data to RLModule readable data (for the next RLModule forward pass). - 2) module-to-env: The connector transforms RLModule outputs before they are sent - back to the environment (as actions). This type of pipeline is used by an EnvRunner - to transform RLModule output data to env readable actions (for the next - `env.step()` call). - 3) learner pipeline: The connector transforms data coming directly from an - environment sampling step or a replay buffer and will be sent into the RLModule's - `forward_train()` method afterwards to compute the loss inputs. This type of - pipeline is used by a Learner to transform raw training data (a batch or a list of - episodes) to RLModule readable training data (for the next RLModule - `forward_train()` call). + There are 3 types of pipelines any ConnectorV2 piece can belong to: + 1) EnvToModulePipeline: The connector transforms environment data before it gets to + the RLModule. This type of pipeline is used by an EnvRunner for transforming + env output data into RLModule readable data (for the next RLModule forward pass). 
+       For example, such a pipeline would include observation postprocessors and filters,
+       or any RNN preparation code related to time-sequences and zero-padding.
+    2) ModuleToEnvPipeline: This type of pipeline is used by an
+       EnvRunner to transform RLModule output data to env readable actions (for the next
+       `env.step()` call). For example, in case the RLModule only outputs action
+       distribution parameters (but not actual actions), the ModuleToEnvPipeline would
+       take care of sampling the actions to be sent back to the env from the
+       resulting distribution (made deterministic if exploration is off).
+    3) LearnerConnectorPipeline: This connector pipeline type transforms data coming
+       from an `EnvRunner.sample()` call or a replay buffer and will then be sent into the
+       RLModule's `forward_train()` method in order to compute loss function inputs.
+       This type of pipeline is used by a Learner worker to transform raw training data
+       (a batch or a list of episodes) to RLModule readable training data (for the next
+       RLModule `forward_train()` call).

    Some connectors might be stateful, for example for keeping track of observation
    filtering stats (mean and stddev values). Any Algorithm that uses connectors is
-    responsible for frequenly synchronizing the states of all connectors and connector
+    responsible for frequently synchronizing the states of all connectors and connector
    pipelines between the EnvRunners (owning the env-to-module and module-to-env
    pipelines) and the Learners (owning the Learner pipelines).
    """
@@ -62,49 +66,83 @@ class ConnectorV2(abc.ABC):

    @property
    def observation_space(self):
-        return self.input_observation_space
+        """Getter for our (output) observation space.
+
+        Logic: Use the user-provided space (if set via the `observation_space`
+        setter); otherwise, use the input space, assuming this connector piece
+        does not alter the space.
+        """
+        return self._observation_space or self.input_observation_space

    @observation_space.setter
    def observation_space(self, value):
-        self.observation_space = value
+        """Setter for our (output) observation space."""
+        self._observation_space = value

    @property
    def action_space(self):
-        return self.input_action_space
+        """Getter for our (output) action space.
+
+        Logic: Use the user-provided space (if set via the `action_space`
+        setter); otherwise, use the input space, assuming this connector piece
+        does not alter the space.
+        """
+        return self._action_space or self.input_action_space

    @action_space.setter
    def action_space(self, value):
-        self.action_space = value
+        """Setter for our (output) action space."""
+        self._action_space = value

    def __init__(
        self,
        *,
-        input_observation_space: gym.Space,
-        input_action_space: gym.Space,
+        input_observation_space: Optional[gym.Space] = None,
+        input_action_space: Optional[gym.Space] = None,
        env: Optional[gym.Env] = None,
        **kwargs,
    ):
        """Initializes a ConnectorV2 instance.

        Args:
-            input_observation_space: The (mandatory) input observation space. This
+            input_observation_space: An optional input observation space. This
                is the space coming from a previous connector piece in the
                (env-to-module or learner) pipeline or it is directly defined within
-                the used gym.Env.
+                the used gym.Env. If None, `env` must be provided.
-            input_action_space: The (mandatory) input action space. This
+            input_action_space: An optional input action space. This
                is the space coming from a previous connector piece in the
                (module-to-env) pipeline or it is directly defined within the used
-                gym.Env.
+                gym.Env.
If None, `env` must be provided. env: An optional env object that the connector might need to know about. Note that normally, env-to-module and module-to-env connectors get this information at construction time, but learner connectors won't (b/c Learner objects don't carry an environment object). **kwargs: Forward API-compatibility kwargs. """ + # Infer spaces from `env` argument if spaces are not explicitly provided. + if input_observation_space is None or input_action_space is None: + if env is None: + raise ValueError( + "`env` argument must be provided if `input_observation_space` or " + "`input_action_space` are None!" + ) + if input_observation_space is None: + input_observation_space = ( + env.single_observation_space if isinstance(env, gym.vector.Env) + else env.observation_space + ) + if input_action_space is None: + input_action_space = ( + env.single_action_space if isinstance(env, gym.vector.Env) + else env.action_space + ) self.input_observation_space = input_observation_space self.input_action_space = input_action_space self.env = env + self._observation_space = None + self._action_space = None + @abc.abstractmethod def __call__( self, @@ -140,25 +178,45 @@ def __call__( The transformed connector output abiding to `self.output_type`. """ - def __str__(self, indentation: int = 0): - return " " * indentation + self.__class__.__name__ - def get_state(self) -> Dict[str, Any]: - """Returns the current state of this ConnectorV2. - - Used for checkpointing (connectors may be stateful) as well as synchronization - between connectors that are run on the (distributed) EnvRunners vs those that - run on the (distributed) Learners. + """Returns the current state of this ConnectorV2 as a state dict. Returns: - A dict mapping str keys to state information. + A state dict mapping any string keys to their (state-defining) values. """ return {} def set_state(self, state: Dict[str, Any]) -> None: - """Sets the state of this connector to the provided one. + """Sets the state of this ConnectorV2 to the given value. Args: - state: The new state to set this connector to. + state: The state dict to define this ConnectorV2's new state. + """ + pass + + def reset_state(self) -> None: + """Resets the state of this ConnectorV2 to some initial value. + + Note that this may NOT be the exact state that this ConnectorV2 was originally + constructed with. """ pass + + @staticmethod + def merge_states(states: List[Dict[str, Any]]) -> Dict[str, Any]: + """Computes a resulting state given a list of other state dicts. + + Algorithms should use this method for synchronizing states between connectors + running on workers (of the same type, e.g. EnvRunner workers). + + Args: + states: The list of n other ConnectorV2 states to merge into a single + resulting state. + + Returns: + The resulting state dict. 
+ """ + return {} + + def __str__(self, indentation: int = 0): + return " " * indentation + self.__class__.__name__ From bdf803d47b7ff547753754d71786e603734e6db1 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 19 Dec 2023 21:20:21 +0100 Subject: [PATCH 11/15] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm.py | 3 +- rllib/algorithms/algorithm_config.py | 77 ++++++---- .../connectors/{utils => common}/__init__.py | 0 rllib/connectors/common/frame_stacking.py | 136 ++++++++++++++++++ rllib/connectors/connector_v2.py | 45 ++---- rllib/connectors/env_to_module/__init__.py | 4 + .../env_to_module/default_env_to_module.py | 2 +- .../env_to_module/env_to_module_pipeline.py | 39 +---- .../env_to_module/frame_stacking.py | 112 +-------------- .../env_to_module/prev_action_prev_reward.py | 2 - rllib/connectors/learner/__init__.py | 11 ++ rllib/connectors/learner/frame_stacking.py | 2 +- .../learner/learner_connector_pipeline.py | 42 +----- rllib/connectors/module_to_env/__init__.py | 9 ++ .../module_to_env/default_module_to_env.py | 68 +++++++-- .../module_to_env/module_to_env_pipeline.py | 42 +----- .../connectors/connector_v2_frame_stacking.py | 21 ++- 17 files changed, 306 insertions(+), 309 deletions(-) rename rllib/connectors/{utils => common}/__init__.py (100%) create mode 100644 rllib/connectors/common/frame_stacking.py diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 170675e0e3956..2d68b89e53ba6 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -751,7 +751,8 @@ def setup(self, config: AlgorithmConfig) -> None: ) # Only when using RolloutWorkers: Update also the worker set's - # `should_module_be_updated_fn` (analogous to is_policy_to_train). + # `is_policy_to_train` (analogous to LearnerGroup's + # `should_module_be_updated_fn`). # Note that with the new EnvRunner API in combination with the new stack, # this information only needs to be kept in the LearnerGroup and not on the # EnvRunners anymore. diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 88764c82204df..8b2620241198f 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1158,90 +1158,119 @@ class directly. Note that this arg can also be specified via ) def build_env_to_module_connector(self, env): - custom_connectors = [] + from ray.rllib.connectors.env_to_module import ( + EnvToModulePipeline, + DefaultEnvToModule, + ) + custom_connectors = [] # Create an env-to-module connector pipeline (including RLlib's default # env->module connector piece) and return it. 
if self._env_to_module_connector is not None: val_ = self._env_to_module_connector(env) from ray.rllib.connectors.connector_v2 import ConnectorV2 - from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 if isinstance(val_, ConnectorV2) and not isinstance( - val_, ConnectorPipelineV2 + val_, EnvToModulePipeline ): custom_connectors = [val_] + elif isinstance(val_, (list, tuple)): + custom_connectors = list(val_) else: return val_ - from ray.rllib.connectors.env_to_module.env_to_module_pipeline import ( - EnvToModulePipeline, - ) - - return EnvToModulePipeline( + pipeline = EnvToModulePipeline( connectors=custom_connectors, input_observation_space=env.single_observation_space, input_action_space=env.single_action_space, env=env, ) + pipeline.append( + DefaultEnvToModule( + input_observation_space=pipeline.observation_space, + input_action_space=pipeline.action_space, + env=env, + ) + ) + return pipeline def build_module_to_env_connector(self, env): - custom_connectors = [] + from ray.rllib.connectors.module_to_env import ( + DefaultModuleToEnv, + ModuleToEnvPipeline, + ) + + custom_connectors = [] # Create a module-to-env connector pipeline (including RLlib's default # module->env connector piece) and return it. if self._module_to_env_connector is not None: val_ = self._module_to_env_connector(env) from ray.rllib.connectors.connector_v2 import ConnectorV2 - from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 if isinstance(val_, ConnectorV2) and not isinstance( - val_, ConnectorPipelineV2 + val_, ModuleToEnvPipeline ): custom_connectors = [val_] + elif isinstance(val_, (list, tuple)): + custom_connectors = list(val_) else: return val_ - from ray.rllib.connectors.module_to_env.module_to_env_pipeline import ( - ModuleToEnvPipeline, - ) - - return ModuleToEnvPipeline( + pipeline = ModuleToEnvPipeline( connectors=custom_connectors, input_observation_space=env.single_observation_space, input_action_space=env.single_action_space, env=env, ) + pipeline.append( + DefaultModuleToEnv( + input_observation_space=pipeline.observation_space, + input_action_space=pipeline.action_space, + env=env, + normalize_actions=self.normalize_actions, + clip_actions=self.clip_actions, + ) + ) + return pipeline def build_learner_connector(self, input_observation_space, input_action_space): - custom_connectors = [] + from ray.rllib.connectors.learner import ( + DefaultLearnerConnector, + LearnerConnectorPipeline, + ) + custom_connectors = [] # Create a learner connector pipeline (including RLlib's default # learner connector piece) and return it. 
if self._learner_connector is not None: val_ = self._learner_connector(input_observation_space, input_action_space) from ray.rllib.connectors.connector_v2 import ConnectorV2 - from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 if isinstance(val_, ConnectorV2) and not isinstance( - val_, ConnectorPipelineV2 + val_, LearnerConnectorPipeline ): custom_connectors = [val_] + elif isinstance(val_, (list, tuple)): + custom_connectors = list(val_) else: return val_ - from ray.rllib.connectors.learner.learner_connector_pipeline import ( - LearnerConnectorPipeline, - ) - - return LearnerConnectorPipeline( + pipeline = LearnerConnectorPipeline( connectors=custom_connectors, input_observation_space=input_observation_space, input_action_space=input_action_space, ) + pipeline.append( + DefaultLearnerConnector( + input_observation_space=pipeline.observation_space, + input_action_space=pipeline.action_space, + ) + ) + return pipeline def build_learner_group( self, diff --git a/rllib/connectors/utils/__init__.py b/rllib/connectors/common/__init__.py similarity index 100% rename from rllib/connectors/utils/__init__.py rename to rllib/connectors/common/__init__.py diff --git a/rllib/connectors/common/frame_stacking.py b/rllib/connectors/common/frame_stacking.py new file mode 100644 index 0000000000000..2f587ee083f1e --- /dev/null +++ b/rllib/connectors/common/frame_stacking.py @@ -0,0 +1,136 @@ +import numpy as np +from typing import Any, List, Optional + +import gymnasium as gym +import tree # pip install dm_tree + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.spaces.space_utils import batch +from ray.rllib.utils.typing import EpisodeType + + +class _FrameStackingConnector(ConnectorV2): + """A connector piece that stacks the previous n observations into one.""" + + def __init__( + self, + *, + # Base class constructor args. + input_observation_space: gym.Space, + input_action_space: gym.Space, + # Specific framestacking args. + num_frames: int = 1, + as_learner_connector: bool = False, + **kwargs, + ): + """Initializes a _FrameStackingConnector instance. + + Args: + num_frames: The number of observation frames to stack up (into a single + observation) for the RLModule's forward pass. + as_preprocessor: Whether this connector should simply postprocess the + received observations from the env and store these directly in the + episode object. In this mode, the connector can only be used in + an `EnvToModulePipeline` and it will act as a classic + RLlib framestacking postprocessor. + as_learner_connector: Whether this connector is part of a Learner connector + pipeline, as opposed to an env-to-module pipeline. + """ + super().__init__( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + **kwargs, + ) + + self.num_frames = num_frames + self.as_learner_connector = as_learner_connector + + # Some assumptions: Space is box AND last dim (the stacking one) is 1. + assert isinstance(self.observation_space, gym.spaces.Box) + assert self.observation_space.shape[-1] == 1 + + # Change our observation space according to the given stacking settings. 
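+        # E.g., stacking with num_frames=4 turns a Box(0, 255, (84, 84, 1))
+        # into a Box(0, 255, (84, 84, 4)). (Illustrative Atari-style dims; the
+        # actual shape comes from the input space.)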
+ self.observation_space = gym.spaces.Box( + low=np.repeat(self.observation_space.low, repeats=self.num_frames, axis=-1), + high=np.repeat( + self.observation_space.high, repeats=self.num_frames, axis=-1 + ), + shape=list(self.observation_space.shape)[:-1] + [self.num_frames], + dtype=self.observation_space.dtype, + ) + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + input_: Optional[Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + persistent_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # This is a data-in-data-out connector, so we expect `input_` to be a dict + # with: key=column name, e.g. "obs" and value=[data to be processed by + # RLModule]. We will add to `input_` the last n observations. + observations = [] + + # Learner connector pipeline. Episodes have been finalized/numpy'ized. + if self.as_learner_connector: + for episode in episodes: + + def _map_fn(s): + # Squeeze out last dim. + s = np.squeeze(s, axis=-1) + # Calculate new shape and strides + new_shape = (len(episode), self.num_frames) + s.shape[1:] + new_strides = (s.strides[0],) + s.strides + # Create a strided view of the array. + return np.lib.stride_tricks.as_strided( + s, shape=new_shape, strides=new_strides + ) + + # Get all observations from the episode in one np array (except for + # the very last one, which is the final observation not needed for + # learning). + observations.append( + tree.map_structure( + _map_fn, + episode.get_observations( + indices=slice(-self.num_frames + 1, len(episode)), + neg_indices_left_of_zero=True, + fill=0.0, + ), + ) + ) + + # Move stack-dimension to the end and concatenate along batch axis. + input_[SampleBatch.OBS] = tree.map_structure( + lambda *s: np.transpose(np.concatenate(s, axis=0), axes=[0, 2, 3, 1]), + *observations, + ) + + # Env-to-module pipeline. Episodes still operate on lists. + else: + for episode in episodes: + assert not episode.is_finalized + # Get the list of observations to stack. + obs_stack = episode.get_observations( + indices=slice(-self.num_frames, None), + fill=0.0, + ) + # Observation components are (w, h, 1) + # -> stack to (w, h, [num_frames], 1), then squeeze out last dim to get + # (w, h, [num_frames]). + stacked_obs = tree.map_structure( + lambda *s: np.squeeze(np.stack(s, axis=2), axis=-1), + *obs_stack, + ) + observations.append(stacked_obs) + + input_[SampleBatch.OBS] = batch(observations) + + return input_ diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index c0796d8b94a0f..fae1223a83609 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -97,48 +97,25 @@ def action_space(self, value): def __init__( self, *, - input_observation_space: Optional[gym.Space] = None, - input_action_space: Optional[gym.Space] = None, - env: Optional[gym.Env] = None, + input_observation_space: gym.Space, + input_action_space: gym.Space, **kwargs, ): """Initializes a ConnectorV2 instance. Args: - input_observation_space: An optional input observation space. This - is the space coming from a previous connector piece in the + input_observation_space: The input observation space for this connector + piece. This is the space coming from a previous connector piece in the (env-to-module or learner) pipeline or it is directly defined within - the used gym.Env. If None, `env` must be provided. - input_action_space: An optional input action space. This + the used gym.Env. + input_action_space: The input action space for this connector piece. 
This is the space coming from a previous connector piece in the (module-to-env) pipeline or it is directly defined within the used - gym.Env. If None, `env` must be provided. - env: An optional env object that the connector might need to know about. - Note that normally, env-to-module and module-to-env connectors get this - information at construction time, but learner connectors won't (b/c - Learner objects don't carry an environment object). + gym.Env. **kwargs: Forward API-compatibility kwargs. """ - # Infer spaces from `env` argument if spaces are not explicitly provided. - if input_observation_space is None or input_action_space is None: - if env is None: - raise ValueError( - "`env` argument must be provided if `input_observation_space` or " - "`input_action_space` are None!" - ) - if input_observation_space is None: - input_observation_space = ( - env.single_observation_space if isinstance(env, gym.vector.Env) - else env.observation_space - ) - if input_action_space is None: - input_action_space = ( - env.single_action_space if isinstance(env, gym.vector.Env) - else env.action_space - ) self.input_observation_space = input_observation_space self.input_action_space = input_action_space - self.env = env self._observation_space = None self._action_space = None @@ -157,16 +134,16 @@ def __call__( """Method for transforming input data into output data. Args: + rl_module: An optional RLModule object that the connector might need to know + about. Note that normally, only module-to-env connectors get this + information at construction time, but env-to-module and learner + connectors won't (b/c they get constructed before the RLModule). input_: The input data abiding to `self.input_type` to be transformed by this connector. Transformations might either be done in-place or a new structure may be returned that matches `self.output_type`. episodes: The list of SingleAgentEpisode or MultiAgentEpisode objects, each corresponding to one slot in the vector env. Note that episodes should always be considered read-only and not be altered. - rl_module: An optional RLModule object that the connector might need to know - about. Note that normally, only module-to-env connectors get this - information at construction time, but env-to-module and learner - connectors won't (b/c they get constructed before the RLModule). explore: Whether `explore` is currently on. Per convention, if True, the RLModule's `forward_exploration` method should be called, if False, the EnvRunner should call `forward_inference` instead. diff --git a/rllib/connectors/env_to_module/__init__.py b/rllib/connectors/env_to_module/__init__.py index b86c2f9cb002f..c156044aa9213 100644 --- a/rllib/connectors/env_to_module/__init__.py +++ b/rllib/connectors/env_to_module/__init__.py @@ -1,5 +1,9 @@ from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule +from ray.rllib.connectors.env_to_module.env_to_module_pipeline import ( + EnvToModulePipeline, +) __all__ = [ "DefaultEnvToModule", + "EnvToModulePipeline", ] diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py index 8239b5f2c2ebd..f4be1c57c1412 100644 --- a/rllib/connectors/env_to_module/default_env_to_module.py +++ b/rllib/connectors/env_to_module/default_env_to_module.py @@ -46,7 +46,7 @@ def __call__( # Collect all most-recent observations from given episodes. 
observations = [] for episode in episodes: - observations.append(episode.get_observation(indices=-1)) + observations.append(episode.get_observations(indices=-1)) # Batch all collected observations together. input_[SampleBatch.OBS] = batch(observations) diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py index b0f1027799a9f..a3694492f89ba 100644 --- a/rllib/connectors/env_to_module/env_to_module_pipeline.py +++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py @@ -1,49 +1,14 @@ from typing import Any, List, Optional -import gymnasium as gym - -from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI +@PublicAPI(stability="alpha") class EnvToModulePipeline(ConnectorPipelineV2): - def __init__( - self, - *, - connectors: Optional[List[ConnectorV2]] = None, - input_observation_space: Optional[gym.Space], - input_action_space: Optional[gym.Space], - env: Optional[gym.Env] = None, - rl_module: Optional["RLModule"] = None, - **kwargs, - ): - super().__init__( - connectors=connectors, - input_observation_space=input_observation_space, - input_action_space=input_action_space, - env=env, - rl_module=rl_module, - **kwargs, - ) - # Add the default final connector piece for env-to-module pipelines: - # Extracting last obs from episodes and add them to input, iff this has not - # happened in any connector piece in this pipeline before. - if ( - len(self.connectors) == 0 - or type(self.connectors[-1]) is not DefaultEnvToModule - ): - self.append( - DefaultEnvToModule( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - env=env, - ) - ) - @override(ConnectorPipelineV2) def __call__( self, diff --git a/rllib/connectors/env_to_module/frame_stacking.py b/rllib/connectors/env_to_module/frame_stacking.py index c6ac262da0ae7..b05385b6c10e2 100644 --- a/rllib/connectors/env_to_module/frame_stacking.py +++ b/rllib/connectors/env_to_module/frame_stacking.py @@ -1,116 +1,6 @@ from functools import partial -import numpy as np -from typing import Any, List, Optional -import gymnasium as gym - -from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.annotations import override -from ray.rllib.utils.spaces.space_utils import batch -from ray.rllib.utils.typing import EpisodeType - - -class _FrameStackingConnector(ConnectorV2): - """A connector piece that stacks the previous n observations into one.""" - - def __init__( - self, - *, - # Base class constructor args. - input_observation_space: gym.Space, - input_action_space: gym.Space, - env: Optional[gym.Env] = None, - # Specific framestacking args. - num_frames: int = 1, - as_learner_connector: bool = False, - **kwargs, - ): - """Initializes a _FrameStackingConnector instance. - - Args: - num_frames: The number of observation frames to stack up (into a single - observation) for the RLModule's forward pass. - as_learner_connector: Whether this connector is part of a Learner connector - pipeline, as opposed to a env-to-module pipeline. 
- """ - super().__init__( - input_observation_space=input_observation_space, - input_action_space=input_action_space, - env=env, - **kwargs, - ) - - self.num_frames = num_frames - self.as_learner_connector = as_learner_connector - - # Some assumptions: Space is box AND last dim (the stacking one) is 1. - assert isinstance(self.observation_space, gym.spaces.Box) - assert self.observation_space.shape[-1] == 1 - - # Change our observation space according to the given stacking settings. - self.observation_space = gym.spaces.Box( - low=np.repeat(self.observation_space.low, repeats=self.num_frames, axis=-1), - high=np.repeat( - self.observation_space.high, repeats=self.num_frames, axis=-1 - ), - shape=list(self.observation_space.shape)[:-1] + [self.num_frames], - dtype=self.observation_space.dtype, - ) - - @override(ConnectorV2) - def __call__( - self, - *, - rl_module: RLModule, - input_: Optional[Any], - episodes: List[EpisodeType], - explore: Optional[bool] = None, - persistent_data: Optional[dict] = None, - **kwargs, - ) -> Any: - # This is a data-in-data-out connector, so we expect `input_` to be a dict - # with: key=column name, e.g. "obs" and value=[data to be processed by - # RLModule]. We will add to `input_` the last n observations. - - obs = [] - for episode in episodes: - - # Learner connector pipeline. Episodes have been finalized/numpy'ized. - if self.as_learner_connector: - # Loop through each timestep in the episode and add the previous n - # observations (based on that timestep) to the batch. - for ts in range(len(episode)): - obs.append( - episode.get_observations( - # Extract n observations from `ts` to `ts - n` - # (excluding `ts - n`). - indices=slice(ts - self.num_frames + 1, ts + 1), - # Make sure negative indices are NOT interpreted as - # "counting from the end", but as absolute indices meaning - # they refer to timesteps before 0 (which is the lookback - # buffer). - neg_indices_left_of_zero=True, - # In case we are at the very beginning of the episode, e.g. - # ts==0, fill the left side with zero-observations. - fill=0.0, - ) - ) - # Env-to-module pipeline. Episodes still operate on lists. - else: - assert not episode.is_finalized - obs.append( - batch( - episode.get_observations( - indices=slice(-self.num_frames + 1, None), - fill=0.0, - ) - ) - ) - - input_[SampleBatch.OBS] = batch(obs) - return input_ +from ray.rllib.connectors.common.frame_stacking import _FrameStackingConnector FrameStackingEnvToModule = partial(_FrameStackingConnector, as_learner_connector=False) diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py index 7f0caea909e29..0f66d2c8ade50 100644 --- a/rllib/connectors/env_to_module/prev_action_prev_reward.py +++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py @@ -21,7 +21,6 @@ def __init__( # Base class constructor args. input_observation_space: gym.Space, input_action_space: gym.Space, - env: Optional[gym.Env] = None, # Specific prev. r/a args. 
n_prev_actions: int = 1, n_prev_rewards: int = 1, @@ -42,7 +41,6 @@ def __init__( super().__init__( input_observation_space=input_observation_space, input_action_space=input_action_space, - env=env, **kwargs, ) diff --git a/rllib/connectors/learner/__init__.py b/rllib/connectors/learner/__init__.py index e69de29bb2d1d..dda5851866ebc 100644 --- a/rllib/connectors/learner/__init__.py +++ b/rllib/connectors/learner/__init__.py @@ -0,0 +1,11 @@ +from ray.rllib.connectors.learner.default_learner_connector import ( + DefaultLearnerConnector, +) +from ray.rllib.connectors.learner.learner_connector_pipeline import ( + LearnerConnectorPipeline, +) + +__all__ = [ + "DefaultLearnerConnector", + "LearnerConnectorPipeline", +] diff --git a/rllib/connectors/learner/frame_stacking.py b/rllib/connectors/learner/frame_stacking.py index f53a62bd6a726..9b4a9f53ad613 100644 --- a/rllib/connectors/learner/frame_stacking.py +++ b/rllib/connectors/learner/frame_stacking.py @@ -1,6 +1,6 @@ from functools import partial -from ray.rllib.connectors.env_to_module.frame_stacking import _FrameStackingConnector +from ray.rllib.connectors.common.frame_stacking import _FrameStackingConnector FrameStackingLearner = partial(_FrameStackingConnector, as_learner_connector=True) diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py index 88a1ad49c02d1..225b5a4436e06 100644 --- a/rllib/connectors/learner/learner_connector_pipeline.py +++ b/rllib/connectors/learner/learner_connector_pipeline.py @@ -1,45 +1,5 @@ -from typing import List, Optional - -import gymnasium as gym - -from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 -from ray.rllib.connectors.learner.default_learner_connector import ( - DefaultLearnerConnector, -) -from ray.rllib.core.rl_module.rl_module import RLModule class LearnerConnectorPipeline(ConnectorPipelineV2): - def __init__( - self, - *, - connectors: Optional[List[ConnectorV2]] = None, - input_observation_space: Optional[gym.Space], - input_action_space: Optional[gym.Space], - env: Optional[gym.Env] = None, - rl_module: Optional[RLModule] = None, - **kwargs, - ): - super().__init__( - connectors=connectors, - input_observation_space=input_observation_space, - input_action_space=input_action_space, - env=env, - rl_module=rl_module, - **kwargs, - ) - - # Add the default final connector piece for learner pipelines: - # Making sure that we have - at the minimum - observations and that the data - # is time-ranked (if we have a stateful model) and properly zero-padded. 
- if ( - len(self.connectors) == 0 - or type(self.connectors[-1]) is not DefaultLearnerConnector - ): - self.append( - DefaultLearnerConnector( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - ) - ) + pass diff --git a/rllib/connectors/module_to_env/__init__.py b/rllib/connectors/module_to_env/__init__.py index e69de29bb2d1d..b7ada36aebdbf 100644 --- a/rllib/connectors/module_to_env/__init__.py +++ b/rllib/connectors/module_to_env/__init__.py @@ -0,0 +1,9 @@ +from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv +from ray.rllib.connectors.module_to_env.module_to_env_pipeline import ( + ModuleToEnvPipeline, +) + +__all__ = [ + "DefaultModuleToEnv", + "ModuleToEnvPipeline", +] diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py index 395225f5d6a64..f27aba4999434 100644 --- a/rllib/connectors/module_to_env/default_module_to_env.py +++ b/rllib/connectors/module_to_env/default_module_to_env.py @@ -8,6 +8,12 @@ from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.spaces.space_utils import ( + clip_action, + get_base_struct_from_space, + unsquash_action, +) from ray.rllib.utils.typing import EpisodeType from ray.util.annotations import PublicAPI @@ -37,6 +43,41 @@ class DefaultModuleToEnv(ConnectorV2): in the input). """ + def __init__( + self, + *, + normalize_actions: bool, + clip_actions: bool, + **kwargs, + ): + """Initializes a DefaultModuleToEnv (connector piece) instance. + + Args: + normalize_actions: If True, actions coming from the RLModule's distribution + (or are directly computed by the RLModule w/o sampling) will + be assumed 0.0 centered with a small stddev (only affecting Box + components) and thus be unsquashed (and clipped, just in case) to the + bounds of the env's action space. For example, if the action space of + the environment is `Box(-2.0, -0.5, (1,))`, the model outputs + mean and stddev as 0.1 and exp(0.2), and we sample an action of 0.9 + from the resulting distribution, then this 0.9 will be unsquashed into + the [-2.0 -0.5] interval. If - after unsquashing - the action still + breaches the action space, it will simply be clipped. + clip_actions: If True, actions coming from the RLModule's distribution + (or are directly computed by the RLModule w/o sampling) will be clipped + such that they fit into the env's action space's bounds. + For example, if the action space of the environment is + `Box(-0.5, 0.5, (1,))`, the model outputs + mean and stddev as 0.1 and exp(0.2), and we sample an action of 0.9 + from the resulting distribution, then this 0.9 will be clipped to 0.5 + to fit into the [-0.5 0.5] interval. + """ + super().__init__(**kwargs) + + self._action_space_struct = get_base_struct_from_space(self.action_space) + self.normalize_actions = normalize_actions + self.clip_actions = clip_actions + @override(ConnectorV2) def __call__( self, @@ -90,20 +131,27 @@ def __call__( f"the '{SampleBatch.ACTION_DIST_INPUTS}' key in it (or both)!" ) actions = action_dist.sample() - input_[SampleBatch.ACTIONS] = actions # For convenience and if possible, compute action logp from distribution # and add to output. 
if action_dist is not None and SampleBatch.ACTION_LOGP not in input_: - input_[SampleBatch.ACTION_LOGP] = action_dist.logp(actions) + input_[SampleBatch.ACTION_LOGP] = convert_to_numpy( + action_dist.logp(actions) + ) - return input_ + actions = convert_to_numpy(actions) + + # Process actions according to Env's action space bounds, if necessary. + # Normalize actions. + if self.normalize_actions: + actions = unsquash_action(actions, self._action_space_struct) + # Clip actions. + elif self.clip_actions: + actions = clip_action(actions, self._action_space_struct) - # @override(Connector) - # def serialize(self): - # return ClipActions.__name__, None + input_[SampleBatch.ACTIONS] = actions - # @staticmethod - # TODO - # def from_state(ctx: ConnectorContext, params: Any): - # return ClipActions(ctx) + # Convert everything into numpy. + input_ = convert_to_numpy(input_) + + return input_ diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py index 2abcecf439d57..e0a11fdac4a63 100644 --- a/rllib/connectors/module_to_env/module_to_env_pipeline.py +++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py @@ -1,45 +1,5 @@ -from typing import List, Optional - -import gymnasium as gym - -from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 -from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv class ModuleToEnvPipeline(ConnectorPipelineV2): - def __init__( - self, - *, - connectors: Optional[List[ConnectorV2]] = None, - input_observation_space: Optional[gym.Space], - input_action_space: Optional[gym.Space], - env: Optional[gym.Env] = None, - rl_module: Optional[RLModule] = None, - **kwargs, - ): - super().__init__( - connectors=connectors, - input_observation_space=input_observation_space, - input_action_space=input_action_space, - env=env, - rl_module=rl_module, - **kwargs, - ) - - # Add the default final connector piece for env-to-module pipelines: - # Sampling actions from action_dist_inputs and add them to input, iff this has - # not happened in any connector piece in this pipeline before. - if ( - len(self.connectors) == 0 - or type(self.connectors[-1]) is not DefaultModuleToEnv - ): - self.append( - DefaultModuleToEnv( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - env=env, - rl_module=rl_module, - ) - ) + pass diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py index ab45623a5562a..9f5f2fb395fb0 100644 --- a/rllib/examples/connectors/connector_v2_frame_stacking.py +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -1,6 +1,5 @@ import argparse from functools import partial -import os import gymnasium as gym @@ -27,6 +26,12 @@ default="torch", help="The DL framework specifier.", ) +parser.add_argument( + "--num-gpus", + type=int, + default=0, + help="The number of GPUs (Learner workers) to use.", +) parser.add_argument( "--num-frames", type=int, @@ -43,10 +48,10 @@ "--stop-iters", type=int, default=2000, help="Number of iterations to train." ) parser.add_argument( - "--stop-timesteps", type=int, default=1000000, help="Number of timesteps to train." + "--stop-timesteps", type=int, default=2000000, help="Number of timesteps to train." 
) parser.add_argument( - "--stop-reward", type=float, default=400.0, help="Reward at which we stop training." + "--stop-reward", type=float, default=20.0, help="Reward at which we stop training." ) @@ -66,7 +71,6 @@ def _make_env_to_module_connector(env): return FrameStackingEnvToModule( input_observation_space=env.single_observation_space, input_action_space=env.single_action_space, - env=env, num_frames=args.num_frames, ) @@ -127,7 +131,11 @@ def _make_learner_connector(input_observation_space, input_action_space): env_runner_cls=SingleAgentEnvRunner, env_to_module_connector=_make_env_to_module_connector, ) - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + .resources( + num_learner_workers=args.num_gpus, + num_gpus_per_learner_worker=1 if args.num_gpus else 0, + num_cpus_for_local_worker=1, + ) .training( # Use our frame stacking learner connector. learner_connector=_make_learner_connector, @@ -137,7 +145,8 @@ def _make_learner_connector(input_observation_space, input_action_space): vf_clip_param=10.0, entropy_coeff=0.01, num_sgd_iter=10, - lr=0.00025, # needs to be adjusted: `lr=0.00025*num_learner_workers` + # Linearly adjust learning rate based on number of GPUs. + lr=0.00015 * (args.num_gpus or 1), grad_clip=100.0, grad_clip_by="global_norm", model={ From 2649e70043c53dba847b46977a848fa075fdd097 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 21 Dec 2023 09:00:41 +0100 Subject: [PATCH 12/15] wip Signed-off-by: sven1977 --- rllib/connectors/common/frame_stacking.py | 12 ++--- rllib/connectors/connector_pipeline_v2.py | 6 +-- rllib/connectors/connector_v2.py | 4 +- .../env_to_module/default_env_to_module.py | 16 +++---- .../env_to_module/env_to_module_pipeline.py | 4 +- .../env_to_module/prev_action_prev_reward.py | 12 ++--- .../learner/default_learner_connector.py | 45 +++++++++---------- .../module_to_env/default_module_to_env.py | 32 +++++++------ 8 files changed, 63 insertions(+), 68 deletions(-) diff --git a/rllib/connectors/common/frame_stacking.py b/rllib/connectors/common/frame_stacking.py index 2f587ee083f1e..b139ee21593fa 100644 --- a/rllib/connectors/common/frame_stacking.py +++ b/rllib/connectors/common/frame_stacking.py @@ -67,15 +67,15 @@ def __call__( self, *, rl_module: RLModule, - input_: Optional[Any], + data: Optional[Any], episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, **kwargs, ) -> Any: - # This is a data-in-data-out connector, so we expect `input_` to be a dict + # This is a data-in-data-out connector, so we expect `data` to be a dict # with: key=column name, e.g. "obs" and value=[data to be processed by - # RLModule]. We will add to `input_` the last n observations. + # RLModule]. We will add to `data` the last n observations. observations = [] # Learner connector pipeline. Episodes have been finalized/numpy'ized. @@ -108,7 +108,7 @@ def _map_fn(s): ) # Move stack-dimension to the end and concatenate along batch axis. 
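            # Per-episode arrays are (T_i, num_frames, w, h); concatenating
            # along axis 0 and transposing with axes=[0, 2, 3, 1] yields a
            # (B, w, h, num_frames) batch.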
- input_[SampleBatch.OBS] = tree.map_structure( + data[SampleBatch.OBS] = tree.map_structure( lambda *s: np.transpose(np.concatenate(s, axis=0), axes=[0, 2, 3, 1]), *observations, ) @@ -131,6 +131,6 @@ def _map_fn(s): ) observations.append(stacked_obs) - input_[SampleBatch.OBS] = batch(observations) + data[SampleBatch.OBS] = batch(observations) - return input_ + return data diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index ce04db32a71b7..75f993efba050 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -33,7 +33,7 @@ def __init__( def __call__( self, rl_module: RLModule, - input_: Any, + data: Any, episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, @@ -46,13 +46,13 @@ def __call__( """ # Loop through connector pieces and call each one with the output of the # previous one. Thereby, time each connector piece's call. - ret = input_ + ret = data for connector in self.connectors: timer = self.timers[str(connector)] with timer: ret = connector( rl_module=rl_module, - input_=ret, + data=ret, episodes=episodes, explore=explore, persistent_data=persistent_data, diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index fae1223a83609..4e6c08d958f59 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -125,7 +125,7 @@ def __call__( self, *, rl_module: RLModule, - input_: Any, + data: Any, episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, @@ -138,7 +138,7 @@ def __call__( about. Note that normally, only module-to-env connectors get this information at construction time, but env-to-module and learner connectors won't (b/c they get constructed before the RLModule). - input_: The input data abiding to `self.input_type` to be transformed by + data: The input data abiding to `self.input_type` to be transformed by this connector. Transformations might either be done in-place or a new structure may be returned that matches `self.output_type`. episodes: The list of SingleAgentEpisode or MultiAgentEpisode objects, diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py index f4be1c57c1412..1052ffab4d41b 100644 --- a/rllib/connectors/env_to_module/default_env_to_module.py +++ b/rllib/connectors/env_to_module/default_env_to_module.py @@ -34,7 +34,7 @@ def __call__( self, *, rl_module: RLModule, - input_: Optional[Any] = None, + data: Optional[Any] = None, episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, @@ -42,17 +42,17 @@ def __call__( ) -> Any: # If observations cannot be found in `input`, add the most recent ones (from all # episodes). - if SampleBatch.OBS not in input_: + if SampleBatch.OBS not in data: # Collect all most-recent observations from given episodes. observations = [] for episode in episodes: observations.append(episode.get_observations(indices=-1)) # Batch all collected observations together. - input_[SampleBatch.OBS] = batch(observations) + data[SampleBatch.OBS] = batch(observations) # If our module is stateful: - # - Add the most recent STATE_OUTs to `input_`. - # - Make all data in `input_` have a time rank (T=1). + # - Add the most recent STATE_OUTs to `data`. + # - Make all data in `data` have a time rank (T=1). 
if rl_module.is_stateful(): # Collect all most recently computed STATE_OUT (or use initial states from # RLModule if at beginning of episode). @@ -71,10 +71,10 @@ def __call__( states.append(state) # Make all other inputs have an additional T=1 axis. - input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) + data = tree.map_structure(lambda s: np.expand_dims(s, axis=1), data) # Batch states (from list of individual vector sub-env states). # Note that state ins should NOT have the extra time dimension. - input_[STATE_IN] = batch(states) + data[STATE_IN] = batch(states) - return input_ + return data diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py index a3694492f89ba..b2a39b8ecfc25 100644 --- a/rllib/connectors/env_to_module/env_to_module_pipeline.py +++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py @@ -14,7 +14,7 @@ def __call__( self, *, rl_module: RLModule, - input_: Optional[Any] = None, + data: Optional[Any] = None, episodes: List[EpisodeType], explore: bool, persistent_data: Optional[dict] = None, @@ -24,7 +24,7 @@ def __call__( # Might just be empty and to be populated from `episodes`. return super().__call__( rl_module=rl_module, - input_=input_ if input_ is not None else {}, + data=data if data is not None else {}, episodes=episodes, explore=explore, persistent_data=persistent_data, diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py index 0f66d2c8ade50..4c890cd3f5133 100644 --- a/rllib/connectors/env_to_module/prev_action_prev_reward.py +++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py @@ -53,16 +53,16 @@ def __call__( self, *, rl_module: RLModule, - input_: Optional[Any], + data: Optional[Any], episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, **kwargs, ) -> Any: - # This is a data-in-data-out connector, so we expect `input_` to be a dict + # This is a data-in-data-out connector, so we expect `data` to be a dict # with: key=column name, e.g. "obs" and value=[data to be processed by # RLModule]. We will just extract the most recent rewards and/or most recent - # actions from all episodes and store them inside the `input_` data dict. + # actions from all episodes and store them inside the `data` data dict. prev_a = [] prev_r = [] @@ -122,9 +122,9 @@ def __call__( ) ) - input_[SampleBatch.PREV_ACTIONS] = batch(prev_a) - input_[SampleBatch.PREV_REWARDS] = np.array(prev_r) - return input_ + data[SampleBatch.PREV_ACTIONS] = batch(prev_a) + data[SampleBatch.PREV_REWARDS] = np.array(prev_r) + return data PrevRewardPrevActionEnvToModule = partial( diff --git a/rllib/connectors/learner/default_learner_connector.py b/rllib/connectors/learner/default_learner_connector.py index 4216f4790b5f3..3d8dd6dd9415d 100644 --- a/rllib/connectors/learner/default_learner_connector.py +++ b/rllib/connectors/learner/default_learner_connector.py @@ -32,8 +32,8 @@ class DefaultLearnerConnector(ConnectorV2): will be zero-padded, if necessary. If the user wants to customize their own data under the given keys (e.g. obs, - actions, ...), they can extract from the episodes or recompute from `input_` - their own data and store it in `input_` under those keys. In this case, the default + actions, ...), they can extract from the episodes or recompute from `data` + their own data and store it in `data` under those keys. 
In this case, the default connector will not change the data under these keys and simply act as a pass-through. """ @@ -43,16 +43,16 @@ def __call__( self, *, rl_module: RLModule, - input_: Any, + data: Any, episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, **kwargs, ) -> Any: # If episodes are provided, extract the essential data from them, but only if - # respective keys are not present yet in `input_`. + # respective keys are not present yet in `data`. if not episodes: - return input_ + return data # Get the data dicts for all episodes. data_dicts = [episode.get_data_dict() for episode in episodes] @@ -60,10 +60,10 @@ def __call__( state_in = None T = rl_module.config.model_config_dict.get("max_seq_len") - # RLModule is stateful and STATE_IN is not found in `input_` (user's custom + # RLModule is stateful and STATE_IN is not found in `data` (user's custom # connectors have not provided this information yet) -> Perform separate # handling of STATE_OUT/STATE_IN keys: - if rl_module.is_stateful() and STATE_IN not in input_: + if rl_module.is_stateful() and STATE_IN not in data: if T is None: raise ValueError( "You are using a stateful RLModule and are not providing custom " @@ -104,11 +104,11 @@ def __call__( # Concatenate the individual episodes' STATE_INs. state_in = tree.map_structure(lambda *s: np.concatenate(s), *state_ins) - # Before adding anything else to the `input_`, add the time axis to existing + # Before adding anything else to the `data`, add the time axis to existing # data. - input_ = tree.map_structure( + data = tree.map_structure( lambda s: split_and_pad_single_record(s, episodes, T=T), - input_, + data, ) # Set the reduce function for all the data we might still have to extract @@ -125,8 +125,8 @@ def __call__( # episodes along the batch axis (axis=0). reduce_fn = np.concatenate - # Extract all data from the episodes and add to `input_`, if not already in - # `input_`. + # Extract all data from the episodes and add to `data`, if not already in + # `data`. for key in [ SampleBatch.OBS, SampleBatch.ACTIONS, @@ -136,35 +136,32 @@ def __call__( SampleBatch.T, # TODO: remove (normally not needed in train batch) *episodes[0].extra_model_outputs.keys(), ]: - if key not in input_ and key != STATE_OUT: + if key not in data and key != STATE_OUT: # Concatenate everything together (along B-axis=0). - input_[key] = tree.map_structure( + data[key] = tree.map_structure( lambda *s: reduce_fn(s), *[d[key] for d in data_dicts], ) # Handle infos (always lists, not numpy arrays). - if SampleBatch.INFOS not in input_: - input_[SampleBatch.INFOS] = sum( + if SampleBatch.INFOS not in data: + data[SampleBatch.INFOS] = sum( [d[SampleBatch.INFOS] for d in data_dicts], [], ) # Now that all "normal" fields are time-dim'd and zero-padded, add - # the STATE_IN column to `input_`. + # the STATE_IN column to `data`. if rl_module.is_stateful(): - input_[STATE_IN] = state_in + data[STATE_IN] = state_in # Also, create the loss mask (b/c of our now possibly zero-padded data) as - # well as the seq_lens array and add these to `input_` as well. - ( - input_["loss_mask"], - input_[SampleBatch.SEQ_LENS], - ) = create_mask_and_seq_lens( + # well as the seq_lens array and add these to `data` as well. 
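        # E.g., with T=4 and episode_lens=[3, 5], row-wise chunking presumably
        # yields seq_lens=[3, 4, 1] and a loss mask that is True for exactly
        # 3 + 5 = 8 of the 3 * 4 = 12 padded timesteps.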
+        (data["loss_mask"], data[SampleBatch.SEQ_LENS]) = create_mask_and_seq_lens(
             episode_lens=[len(episode) for episode in episodes],
             T=T,
         )

-        return input_
+        return data


 def split_and_pad(episodes_data, T):
diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py
index f27aba4999434..449316fb6b967 100644
--- a/rllib/connectors/module_to_env/default_module_to_env.py
+++ b/rllib/connectors/module_to_env/default_module_to_env.py
@@ -25,7 +25,7 @@ class DefaultModuleToEnv(ConnectorV2):

     If necessary, this connector samples actions, given action dist. inputs and a
     dist. class. The connector will only sample from the action distribution if the
-    SampleBatch.ACTIONS key cannot be found in `input_`. Otherwise, it'll behave
+    SampleBatch.ACTIONS key cannot be found in `data`. Otherwise, it will behave
     as a pass-through (no-op). If SampleBatch.ACTIONS is not present, but
     SampleBatch.ACTION_DIST_INPUTS are, the connector will create a new action
     distribution using the RLModule in the connector context and sample from this
@@ -83,7 +83,7 @@ def __call__(
         self,
         *,
         rl_module: RLModule,
-        input_: Any,
+        data: Any,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
         persistent_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:

         # Loop through all modules that created some output.
-        # for mid in input_.keys():
+        # for mid in data.keys():
         #     sa_module = ctx.rl_module.get_module(module_id=mid)

         # If our RLModule is stateful, remove the T=1 axis from all model outputs
         # (except the state outs, which never have this extra time axis).
         if rl_module.is_stateful():
-            state = input_.pop(STATE_OUT, None)
-            input_ = tree.map_structure(lambda s: np.squeeze(s, axis=1), input_)
+            state = data.pop(STATE_OUT, None)
+            data = tree.map_structure(lambda s: np.squeeze(s, axis=1), data)
             if state:
-                input_[STATE_OUT] = state
+                data[STATE_OUT] = state

         # ACTION_DIST_INPUTS field returned by `forward_exploration|inference()` ->
         # Create a new action distribution object.
         action_dist = None
-        if SampleBatch.ACTION_DIST_INPUTS in input_:
+        if SampleBatch.ACTION_DIST_INPUTS in data:
             if explore:
                 action_dist_class = rl_module.get_exploration_action_dist_cls()
             else:
                 action_dist_class = rl_module.get_inference_action_dist_cls()
             action_dist = action_dist_class.from_logits(
-                input_[SampleBatch.ACTION_DIST_INPUTS]
+                data[SampleBatch.ACTION_DIST_INPUTS]
             )

         # TODO (sven): Should this not already be taken care of by RLModule's
@@ -120,8 +120,8 @@ def __call__(
             action_dist = action_dist.to_deterministic()

         # If `forward_...()` returned actions, use them here as-is.
-        if SampleBatch.ACTIONS in input_:
-            actions = input_[SampleBatch.ACTIONS]
+        if SampleBatch.ACTIONS in data:
+            actions = data[SampleBatch.ACTIONS]
         # Otherwise, sample actions from the distribution.
         else:
             if action_dist is None:
@@ -134,10 +134,8 @@ def __call__(

         # For convenience and if possible, compute action logp from distribution
         # and add to output.
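         # (Only possible if a distribution object exists here; if the module
         # returned ACTIONS directly and no ACTION_DIST_INPUTS, the logp
         # computation below is skipped.)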
-        if action_dist is not None and SampleBatch.ACTION_LOGP not in input_:
-            input_[SampleBatch.ACTION_LOGP] = convert_to_numpy(
-                action_dist.logp(actions)
-            )
+        if action_dist is not None and SampleBatch.ACTION_LOGP not in data:
+            data[SampleBatch.ACTION_LOGP] = convert_to_numpy(action_dist.logp(actions))

         actions = convert_to_numpy(actions)

@@ -149,9 +147,9 @@ def __call__(
         elif self.clip_actions:
             actions = clip_action(actions, self._action_space_struct)

-        input_[SampleBatch.ACTIONS] = actions
+        data[SampleBatch.ACTIONS] = actions

         # Convert everything into numpy.
-        input_ = convert_to_numpy(input_)
+        data = convert_to_numpy(data)

-        return input_
+        return data

From b58ad312faa1843dd13ca9490024d2fc055752f3 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 21 Dec 2023 09:03:35 +0100
Subject: [PATCH 13/15] wip

Signed-off-by: sven1977
---
 rllib/connectors/common/frame_stacking.py                 | 2 +-
 rllib/connectors/connector_pipeline_v2.py                 | 4 ++--
 rllib/connectors/connector_v2.py                          | 4 ++--
 rllib/connectors/env_to_module/default_env_to_module.py   | 2 +-
 rllib/connectors/env_to_module/env_to_module_pipeline.py  | 4 ++--
 rllib/connectors/env_to_module/prev_action_prev_reward.py | 2 +-
 rllib/connectors/learner/default_learner_connector.py     | 2 +-
 rllib/connectors/module_to_env/default_module_to_env.py   | 2 +-
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/rllib/connectors/common/frame_stacking.py b/rllib/connectors/common/frame_stacking.py
index b139ee21593fa..3b7592b852a35 100644
--- a/rllib/connectors/common/frame_stacking.py
+++ b/rllib/connectors/common/frame_stacking.py
@@ -70,7 +70,7 @@ def __call__(
         data: Optional[Any],
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         # This is a data-in-data-out connector, so we expect `data` to be a dict
diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py
index 75f993efba050..76e6f952b91f2 100644
--- a/rllib/connectors/connector_pipeline_v2.py
+++ b/rllib/connectors/connector_pipeline_v2.py
@@ -36,7 +36,7 @@ def __call__(
         data: Any,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         """In a pipeline, we call each of our connector pieces one after the other.
@@ -55,7 +55,7 @@ def __call__(
                 data=ret,
                 episodes=episodes,
                 explore=explore,
-                persistent_data=persistent_data,
+                shared_data=shared_data,
                 **kwargs,
             )
         return ret
diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py
index 4e6c08d958f59..a4bad77b39da5 100644
--- a/rllib/connectors/connector_v2.py
+++ b/rllib/connectors/connector_v2.py
@@ -128,7 +128,7 @@ def __call__(
         data: Any,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         """Method for transforming input data into output data.
@@ -147,7 +147,7 @@ def __call__(
             explore: Whether `explore` is currently on. Per convention, if True, the
                 RLModule's `forward_exploration` method should be called; if False,
                 the EnvRunner should call `forward_inference` instead.
-            persistent_data: Optional additional context data that needs to be exchanged
+            shared_data: Optional additional context data that needs to be exchanged
                 between different connector pieces and pipelines.
             kwargs: Forward API-compatibility kwargs.
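
A minimal sketch of a custom connector piece written against this `__call__`
signature (the class name, the "obs" key handling, and the `get_observations(-1)`
episode accessor are assumptions for illustration, not confirmed API):

    from typing import Any, List, Optional

    from ray.rllib.connectors.connector_v2 import ConnectorV2
    from ray.rllib.core.rl_module.rl_module import RLModule
    from ray.rllib.utils.typing import EpisodeType


    class AddMostRecentObs(ConnectorV2):
        """Hypothetical env-to-module piece: puts each episode's latest obs
        into `data`."""

        def __call__(
            self,
            *,
            rl_module: RLModule,
            data: Any,
            episodes: List[EpisodeType],
            explore: Optional[bool] = None,
            shared_data: Optional[dict] = None,
            **kwargs,
        ) -> Any:
            # Act as a pass-through if an upstream piece already wrote the key.
            if "obs" not in data:
                # `get_observations(-1)` (most recent obs) is an assumed accessor.
                data["obs"] = [episode.get_observations(-1) for episode in episodes]
            # `shared_data` can hand information to downstream pieces within the
            # same pipeline call.
            if shared_data is not None:
                shared_data["num_episodes"] = len(episodes)
            return data
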
diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py
index 1052ffab4d41b..9a5813403036f 100644
--- a/rllib/connectors/env_to_module/default_env_to_module.py
+++ b/rllib/connectors/env_to_module/default_env_to_module.py
@@ -37,7 +37,7 @@ def __call__(
         data: Optional[Any] = None,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         # If observations cannot be found in `data`, add the most recent ones (from all
diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py
index b2a39b8ecfc25..5f790ec84a769 100644
--- a/rllib/connectors/env_to_module/env_to_module_pipeline.py
+++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py
@@ -17,7 +17,7 @@ def __call__(
         data: Optional[Any] = None,
         episodes: List[EpisodeType],
         explore: bool,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ):
         # The user does not have to provide an initial `data` dict to this pipeline.
@@ -27,6 +27,6 @@ def __call__(
             data=data if data is not None else {},
             episodes=episodes,
             explore=explore,
-            persistent_data=persistent_data,
+            shared_data=shared_data,
             **kwargs,
         )
diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py
index 4c890cd3f5133..cae717beee0b1 100644
--- a/rllib/connectors/env_to_module/prev_action_prev_reward.py
+++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py
@@ -56,7 +56,7 @@ def __call__(
         data: Optional[Any],
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         # This is a data-in-data-out connector, so we expect `data` to be a dict
diff --git a/rllib/connectors/learner/default_learner_connector.py b/rllib/connectors/learner/default_learner_connector.py
index 3d8dd6dd9415d..6e17beb82f52a 100644
--- a/rllib/connectors/learner/default_learner_connector.py
+++ b/rllib/connectors/learner/default_learner_connector.py
@@ -46,7 +46,7 @@ def __call__(
         data: Any,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         # If episodes are provided, extract the essential data from them, but only if
diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py
index 449316fb6b967..e36b4c4c4771b 100644
--- a/rllib/connectors/module_to_env/default_module_to_env.py
+++ b/rllib/connectors/module_to_env/default_module_to_env.py
@@ -86,7 +86,7 @@ def __call__(
         data: Any,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:

From 7bc0ac63b3086b00f853409157ceb52bce9cafd6 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 21 Dec 2023 12:03:10 +0100
Subject: [PATCH 14/15] wip

Signed-off-by: sven1977
---
 rllib/BUILD                                    | 13 +++++++------
 rllib/env/wrappers/atari_wrappers.py           |  1 +
 .../connectors/connector_v2_frame_stacking.py  |  2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/rllib/BUILD b/rllib/BUILD
index 6bbf72f3cc69a..0623e5455815e 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -781,12 +781,13 @@ py_test(

 # Tag: connector_v2
 # 
-------------------------------------------------------------------- -py_test( - name = "connectors/tests/test_connector_v2", - tags = ["team:rllib", "connector_v2"], - size = "small", - srcs = ["connectors/tests/test_connector_v2.py"] -) +# TODO (sven): Add these tests in a separate PR. +# py_test( +# name = "connectors/tests/test_connector_v2", +# tags = ["team:rllib", "connector_v2"], +# size = "small", +# srcs = ["connectors/tests/test_connector_v2.py"] +# ) # -------------------------------------------------------------------- # Env tests diff --git a/rllib/env/wrappers/atari_wrappers.py b/rllib/env/wrappers/atari_wrappers.py index 2919685cf6bc5..fb4fa762c819a 100644 --- a/rllib/env/wrappers/atari_wrappers.py +++ b/rllib/env/wrappers/atari_wrappers.py @@ -240,6 +240,7 @@ def reset(self, **kwargs): return self.env.reset(**kwargs) +@PublicAPI class NormalizedImageEnv(gym.ObservationWrapper): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py index 9f5f2fb395fb0..1119c2539bdd3 100644 --- a/rllib/examples/connectors/connector_v2_frame_stacking.py +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -82,7 +82,7 @@ def _make_learner_connector(input_observation_space, input_action_space): num_frames=args.num_frames, ) - # Create a custom Atari setup (w/o the usual Rllib-hard-coded framestacking in it). + # Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it). # We would like our frame stacking connector to do this job. tune.register_env( "env", From f7dde731627098199d1b21699dd941c3a314368c Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 21 Dec 2023 12:22:07 +0100 Subject: [PATCH 15/15] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm.py | 15 ++++----- rllib/algorithms/impala/impala.py | 7 ++-- rllib/algorithms/pg/pg.py | 5 ++- rllib/algorithms/ppo/ppo.py | 20 ++++++------ rllib/algorithms/ppo/tf/ppo_tf_rl_module.py | 20 ++++++------ .../ppo/torch/ppo_torch_rl_module.py | 20 +++++++----- rllib/connectors/connector_pipeline_v2.py | 32 +++++++------------ rllib/core/learner/torch/torch_learner.py | 1 + rllib/core/models/catalog.py | 11 ++++--- rllib/core/models/torch/encoder.py | 2 +- rllib/utils/filter_manager.py | 2 +- rllib/utils/numpy.py | 14 ++------ rllib/utils/tests/test_minibatch_utils.py | 8 ++--- rllib/utils/torch_utils.py | 4 +-- 14 files changed, 74 insertions(+), 87 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 2d68b89e53ba6..11ba31c794da3 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -564,6 +564,11 @@ def setup(self, config: AlgorithmConfig) -> None: config_obj.env = self._env_id self.config = config_obj + self._uses_new_env_runners = ( + self.config.env_runner_cls is not None + and not issubclass(self.config.env_runner_cls, RolloutWorker) + ) + # Set Algorithm's seed after we have - if necessary - enabled # tf eager-execution. update_global_seed_if_necessary(self.config.framework_str, self.config.seed) @@ -756,9 +761,7 @@ def setup(self, config: AlgorithmConfig) -> None: # Note that with the new EnvRunner API in combination with the new stack, # this information only needs to be kept in the LearnerGroup and not on the # EnvRunners anymore. 
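         # Old API stack only: push the LearnerGroup's "should this module be
         # updated?"-function down to all RolloutWorkers, so their local copies
         # stay in sync with the Learner side.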
- if self.config.env_runner_cls is None or issubclass( - self.config.env_runner_cls, RolloutWorker - ): + if not self._uses_new_env_runners: update_fn = self.learner_group.should_module_be_updated_fn self.workers.foreach_worker( lambda w: w.set_is_policy_to_train(update_fn), @@ -3031,11 +3034,7 @@ def _run_one_evaluation( """ eval_func_to_use = ( self._evaluate_async_with_env_runner - if ( - self.config.enable_async_evaluation - and self.config.env_runner_cls is not None - and not issubclass(self.config.env_runner_cls, RolloutWorker) - ) + if (self.config.enable_async_evaluation and self._uses_new_env_runners) else self._evaluate_async if self.config.enable_async_evaluation else self.evaluate diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index fabde3ee8eb4e..0f29ba3939d79 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -86,18 +86,17 @@ class ImpalaConfig(AlgorithmConfig): # Update the config object. config = config.training( - lr=tune.grid_search([0.0001, ]), grad_clip=20.0 + lr=tune.grid_search([0.0001, 0.0002]), grad_clip=20.0 ) config = config.resources(num_gpus=0) config = config.rollouts(num_rollout_workers=1) # Set the config object's env. config = config.environment(env="CartPole-v1") - # Use to_dict() to get the old-style python config dict - # when running with tune. + # Run with tune. tune.Tuner( "IMPALA", + param_space=config, run_config=air.RunConfig(stop={"training_iteration": 1}), - param_space=config.to_dict(), ).fit() .. testoutput:: diff --git a/rllib/algorithms/pg/pg.py b/rllib/algorithms/pg/pg.py index 390943f8fe143..b5cfa38044053 100644 --- a/rllib/algorithms/pg/pg.py +++ b/rllib/algorithms/pg/pg.py @@ -30,12 +30,11 @@ class PGConfig(AlgorithmConfig): >>> config = config.training(lr=tune.grid_search([0.001, 0.0001])) >>> # Set the config object's env. >>> config = config.environment(env="CartPole-v1") - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. + >>> # Run with tune. >>> tune.Tuner( # doctest: +SKIP ... "PG", ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), + ... param_space=config, ... ).fit() """ diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index c394b96914d83..9f9605312e2e3 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -253,13 +253,10 @@ def training( # Pass kwargs onto super's `training()` method. super().training(**kwargs) - # TODO (sven): Move to generic AlgorithmConfig. - if lr_schedule is not NotProvided: - self.lr_schedule = lr_schedule if use_critic is not NotProvided: self.use_critic = use_critic - # TODO (Kourosh) This is experimental. Set learner_hps parameters as - # well. Don't forget to remove .use_critic from algorithm config. + # TODO (Kourosh) This is experimental. + # Don't forget to remove .use_critic from algorithm config. if use_gae is not NotProvided: self.use_gae = use_gae if lambda_ is not NotProvided: @@ -280,8 +277,6 @@ def training( self.vf_loss_coeff = vf_loss_coeff if entropy_coeff is not NotProvided: self.entropy_coeff = entropy_coeff - if entropy_coeff_schedule is not NotProvided: - self.entropy_coeff_schedule = entropy_coeff_schedule if clip_param is not NotProvided: self.clip_param = clip_param if vf_clip_param is not NotProvided: @@ -289,6 +284,12 @@ def training( if grad_clip is not NotProvided: self.grad_clip = grad_clip + # TODO (sven): Remove these once new API stack is only option for PPO. 
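+        # (These two settings only take effect on the old API stack; keep
+        # accepting them so existing configs continue to work.)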
+        if lr_schedule is not NotProvided:
+            self.lr_schedule = lr_schedule
+        if entropy_coeff_schedule is not NotProvided:
+            self.entropy_coeff_schedule = entropy_coeff_schedule
+
         return self

     @override(AlgorithmConfig)
@@ -312,8 +313,8 @@ def validate(self) -> None:
             raise ValueError(
                 f"`sgd_minibatch_size` ({self.sgd_minibatch_size}) must be <= "
                 f"`train_batch_size` ({self.train_batch_size}). In PPO, the train batch"
-                f" is be split into {self.sgd_minibatch_size} chunks, each of which is "
-                f"iterated over (used for updating the policy) {self.num_sgd_iter} "
+                f" will be split into chunks of {self.sgd_minibatch_size} env steps "
+                f"each, and the entire batch is iterated over {self.num_sgd_iter} "
                 "times."
             )

@@ -476,7 +477,6 @@ def training_step(self) -> ResultDict:
             self.workers.local_worker().set_weights(weights)

         if self.config._enable_new_api_stack:
-
             kl_dict = {}
             if self.config.use_kl_loss:
                 for pid in policies_to_update:
diff --git a/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py b/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py
index 12856f9d0d8c0..2b30c810568da 100644
--- a/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py
+++ b/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py
@@ -20,13 +20,15 @@ class PPOTfRLModule(TfRLModule, PPORLModule):
     def _forward_inference(self, batch: NestedDict) -> Dict[str, Any]:
         output = {}

+        # Encoder forward pass.
         encoder_outs = self.encoder(batch)
         if STATE_OUT in encoder_outs:
             output[STATE_OUT] = encoder_outs[STATE_OUT]

-        # Actions
-        action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR])
-        output[SampleBatch.ACTION_DIST_INPUTS] = action_logits
+        # Pi head.
+        output[SampleBatch.ACTION_DIST_INPUTS] = self.pi(
+            encoder_outs[ENCODER_OUT][ACTOR]
+        )

         return output

@@ -34,8 +36,8 @@ def _forward_inference(self, batch: NestedDict) -> Dict[str, Any]:
     def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]:
         """PPO forward pass during exploration.

-        Besides the action distribution, this method also returns the parameters of the
-        policy distribution to be used for computing KL divergence between the old
+        Besides the action distribution, this method also returns the parameters of
+        the policy distribution to be used for computing KL divergence between the old
         policy and the new policy during training.
         """
         output = {}
@@ -51,7 +53,6 @@ def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]:

         # Policy head
         action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR])
-
         output[SampleBatch.ACTION_DIST_INPUTS] = action_logits

         return output
@@ -60,16 +61,17 @@ def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]:
     def _forward_train(self, batch: NestedDict):
         output = {}

-        # Shared encoder
+        # Shared encoder.
         encoder_outs = self.encoder(batch)
         if STATE_OUT in encoder_outs:
             output[STATE_OUT] = encoder_outs[STATE_OUT]

-        # Value head
+        # Value head.
         vf_out = self.vf(encoder_outs[ENCODER_OUT][CRITIC])
+        # Squeeze out last dim (value function node).
         output[SampleBatch.VF_PREDS] = tf.squeeze(vf_out, axis=-1)

-        # Policy head
+        # Policy head.
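+        # `pi` maps the actor branch of the encoder output to action dist inputs
+        # (e.g. logits for a categorical action distribution).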
action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) output[SampleBatch.ACTION_DIST_INPUTS] = action_logits diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py index 09010c872c896..745f45bb603f6 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py @@ -20,21 +20,24 @@ class PPOTorchRLModule(TorchRLModule, PPORLModule): def _forward_inference(self, batch: NestedDict) -> Dict[str, Any]: output = {} + # Encoder forward pass. encoder_outs = self.encoder(batch) if STATE_OUT in encoder_outs: output[STATE_OUT] = encoder_outs[STATE_OUT] - # Actions - action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) - output[SampleBatch.ACTION_DIST_INPUTS] = action_logits + # Pi head. + output[SampleBatch.ACTION_DIST_INPUTS] = self.pi( + encoder_outs[ENCODER_OUT][ACTOR] + ) return output @override(RLModule) def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]: """PPO forward pass during exploration. - Besides the action distribution, this method also returns the parameters of the - policy distribution to be used for computing KL divergence between the old + + Besides the action distribution, this method also returns the parameters of + the policy distribution to be used for computing KL divergence between the old policy and the new policy during training. """ output = {} @@ -58,16 +61,17 @@ def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]: def _forward_train(self, batch: NestedDict) -> Dict[str, Any]: output = {} - # Shared encoder + # Shared encoder. encoder_outs = self.encoder(batch) if STATE_OUT in encoder_outs: output[STATE_OUT] = encoder_outs[STATE_OUT] - # Value head + # Value head. vf_out = self.vf(encoder_outs[ENCODER_OUT][CRITIC]) + # Squeeze out last dim (value function node). output[SampleBatch.VF_PREDS] = vf_out.squeeze(-1) - # Policy head + # Policy head. action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) output[SampleBatch.ACTION_DIST_INPUTS] = action_logits diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index 76e6f952b91f2..86f0649d66a39 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -184,29 +184,21 @@ def append(self, connector: ConnectorV2) -> None: ) @override(ConnectorV2) - def get_state(self): - children = [] - for c in self.connectors: - state = c.serialize() - assert isinstance(state, tuple) and len(state) == 2, ( - "Serialized connector state must be in the format of " - f"Tuple[name: str, params: Any]. Instead we got {state}" - f"for connector {c.__name__}." 
- ) - children.append(state) - return ConnectorPipelineV2.__name__, children + def get_state(self) -> Dict[str, Any]: + states = {} + for i, connector in enumerate(self.connectors): + key = f"{i:03d}_{type(connector).__name__}" + state = connector.get_state() + states[key] = state + return states @override(ConnectorV2) def set_state(self, state: Dict[str, Any]) -> None: - connectors = [] - for state in params: - try: - name, subparams = state - connectors.append(get_connector(name, ctx, subparams)) - except Exception as e: - logger.error(f"Failed to de-serialize connector state: {state}") - raise e - return ConnectorPipelineV2(ctx, connectors) + for i, connector in enumerate(self.connectors): + key = f"{i:03d}_{type(connector).__name__}" + if key not in state: + raise KeyError(f"No state found in `state` for connector piece: {key}!") + connector.set_state(state[key]) def __repr__(self, indentation: int = 0): return "\n".join( diff --git a/rllib/core/learner/torch/torch_learner.py b/rllib/core/learner/torch/torch_learner.py index 6e229b5f299a8..c022909120794 100644 --- a/rllib/core/learner/torch/torch_learner.py +++ b/rllib/core/learner/torch/torch_learner.py @@ -215,6 +215,7 @@ def get_parameters(self, module: RLModule) -> Sequence[Param]: @override(Learner) def _convert_batch_type(self, batch: MultiAgentBatch) -> MultiAgentBatch: batch = convert_to_torch_tensor(batch.policy_batches, device=self._device) + # TODO (sven): This computation of `env_steps` is not accurate! length = max(len(b) for b in batch.values()) batch = MultiAgentBatch(batch, env_steps=length) return batch diff --git a/rllib/core/models/catalog.py b/rllib/core/models/catalog.py index b956343babae7..aaf0f1d9fb83d 100644 --- a/rllib/core/models/catalog.py +++ b/rllib/core/models/catalog.py @@ -55,7 +55,6 @@ class Catalog: from ray.rllib.core.models.configs import MLPHeadConfig from ray.rllib.core.models.catalog import Catalog - class MyCatalog(Catalog): def __init__( self, @@ -64,17 +63,19 @@ def __init__( model_config_dict: dict, ): super().__init__(observation_space, action_space, model_config_dict) - self.my_model_config_dict = MLPHeadConfig( + self.my_model_config = MLPHeadConfig( hidden_layer_dims=[64, 32], input_dims=[self.observation_space.shape[0]], ) def build_my_head(self, framework: str): - return self.my_model_config_dict.build(framework=framework) + return self.my_model_config.build(framework=framework) # With that, RLlib can build and use models from this catalog like this: catalog = MyCatalog(gym.spaces.Box(0, 1), gym.spaces.Box(0, 1), {}) - my_head = catalog.build_my_head("torch") + my_head = catalog.build_my_head(framework="torch") + + # Make a call to the built model. out = my_head(torch.Tensor([[1]])) """ @@ -348,7 +349,7 @@ def get_tokenizer_config( ) -> ModelConfig: """Returns a tokenizer config for the given space. - This is useful for recurrent / tranformer models that need to tokenize their + This is useful for recurrent / transformer models that need to tokenize their inputs. By default, RLlib uses the models supported by Catalog out of the box to tokenize. diff --git a/rllib/core/models/torch/encoder.py b/rllib/core/models/torch/encoder.py index dd90c5af02a35..5d5ee38ed8d5b 100644 --- a/rllib/core/models/torch/encoder.py +++ b/rllib/core/models/torch/encoder.py @@ -175,7 +175,7 @@ def __init__(self, config: RecurrentEncoderConfig) -> None: assert len(gru_input_dims) == 1 gru_input_dim = gru_input_dims[0] - # Create the torch LSTM layer. + # Create the torch GRU layer. 
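+        # (Input size is the single, flattened tokenizer output dim asserted
+        # above; the hidden size comes from the encoder config.)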
         self.gru = nn.GRU(
             gru_input_dim,
             config.hidden_dim,
diff --git a/rllib/utils/filter_manager.py b/rllib/utils/filter_manager.py
index e4b71af66d09e..8bcba09793421 100644
--- a/rllib/utils/filter_manager.py
+++ b/rllib/utils/filter_manager.py
@@ -29,7 +29,7 @@ def synchronize(

         Args:
             local_filters: Filters to be synchronized.
-            remotes: Remote evaluators with filters.
+            worker_set: The WorkerSet, whose remote EnvRunners hold the filters.
             update_remote: Whether to push updates from the local filters to the remote
                 workers' filters.
             timeout_seconds: How long to wait for filter to get or set filters
diff --git a/rllib/utils/numpy.py b/rllib/utils/numpy.py
index 9f040c8a0c286..944d4b758c8c4 100644
--- a/rllib/utils/numpy.py
+++ b/rllib/utils/numpy.py
@@ -7,11 +7,7 @@

 from ray.rllib.utils.annotations import PublicAPI
-from ray.rllib.utils.deprecation import (
-    DEPRECATED_VALUE,
-    deprecation_warning,
-    Deprecated,
-)
+from ray.rllib.utils.deprecation import Deprecated
 from ray.rllib.utils.framework import try_import_tf, try_import_torch
 from ray.rllib.utils.typing import SpaceStruct, TensorType, TensorStructType, Union

@@ -122,9 +118,7 @@ def concat_aligned(


 @PublicAPI
-def convert_to_numpy(
-    x: TensorStructType, reduce_type: bool = True, reduce_floats=DEPRECATED_VALUE
-):
+def convert_to_numpy(x: TensorStructType, reduce_type: bool = True) -> TensorStructType:
     """Converts values in `x` to non-Tensor numpy or python types.

     Args:
@@ -139,10 +133,6 @@ def convert_to_numpy(
         values converted to numpy arrays (on CPU).
     """

-    if reduce_floats != DEPRECATED_VALUE:
-        deprecation_warning(old="reduce_floats", new="reduce_types", error=True)
-        reduce_type = reduce_floats
-
     # The mapping function used to numpyize torch/tf Tensors (and move them
     # to the CPU beforehand).
     def mapping(item):
diff --git a/rllib/utils/tests/test_minibatch_utils.py b/rllib/utils/tests/test_minibatch_utils.py
index a8d8180d05129..0256e9ffab311 100644
--- a/rllib/utils/tests/test_minibatch_utils.py
+++ b/rllib/utils/tests/test_minibatch_utils.py
@@ -93,8 +93,8 @@ def test_minibatch_cyclic_iterator(self):
                     check(policy_batch.count, mini_batch_size)
                 iteration_counter += 1

-            # for each policy check that the last item in batch matches the expected
-            # values, i.e. iteration_counter * mini_batch_size % agent_steps - 1
+            # For each policy check that the last item in batch matches the expected
+            # values, i.e. iteration_counter * mini_batch_size % agent_steps - 1.
             total_steps = iteration_counter * mini_batch_size
             for policy_idx, policy_batch in enumerate(
                 batch.policy_batches.values()
@@ -104,8 +104,8 @@ def test_minibatch_cyclic_iterator(self):
                     expected_last_item = 0.0
                 check(policy_batch["obs"][-1], expected_last_item)

-            # check iteration counter (should be
-            # ceil(num_gsd_iter * max(agent_steps) / mini_batch_size))
+            # Check iteration counter (should be
+            # ceil(num_sgd_iter * max(agent_steps) / mini_batch_size)).
             expected_iteration_counter = np.ceil(
                 num_sgd_iter * max(agent_steps) / mini_batch_size
             )
diff --git a/rllib/utils/torch_utils.py b/rllib/utils/torch_utils.py
index 0a56abf83a502..68c8ebda458e3 100644
--- a/rllib/utils/torch_utils.py
+++ b/rllib/utils/torch_utils.py
@@ -217,8 +217,8 @@ def convert_to_torch_tensor(x: TensorStructType, device: Optional[str] = None):

     Returns:
         Any: A new struct with the same structure as `x`, but with all
-        values converted to torch Tensor types. This does not convert possibly
-        nested elements that are None because torch has no representation for that.
+        values converted to torch Tensor types. This does not convert
+        possibly-nested None elements, because torch has no representation for None.
     """

     def mapping(item):
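        # Maps one leaf of the input struct to a torch tensor on the requested
        # device (None leaves are returned unchanged, per the docstring above).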