From 57e79f9a759d5ad1a4894cf8acb58bf7088778f1 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 16 Nov 2023 19:25:10 +0100
Subject: [PATCH 01/15] wip

Signed-off-by: sven1977
---
 rllib/connectors/connector_context_v2.py | 66 +++++
 rllib/connectors/connector_pipeline_v2.py | 259 ++++++++++++++++++
 rllib/connectors/connector_v2.py | 93 +++++++
 rllib/connectors/env_to_module/__init__.py | 5 +
 .../env_to_module/default_env_to_module.py | 69 +++++
 rllib/connectors/input_output_types.py | 75 +++++
 rllib/connectors/learner/__init__.py | 0
 .../learner/default_learner_connector.py | 212 ++++++++++++++
 rllib/connectors/module_to_env/__init__.py | 0
 .../module_to_env/default_module_to_env.py | 95 +++++++
 .../tests/test_from_module_connectors.py | 106 +++++++
 11 files changed, 980 insertions(+)
 create mode 100644 rllib/connectors/connector_context_v2.py
 create mode 100644 rllib/connectors/connector_pipeline_v2.py
 create mode 100644 rllib/connectors/connector_v2.py
 create mode 100644 rllib/connectors/env_to_module/__init__.py
 create mode 100644 rllib/connectors/env_to_module/default_env_to_module.py
 create mode 100644 rllib/connectors/input_output_types.py
 create mode 100644 rllib/connectors/learner/__init__.py
 create mode 100644 rllib/connectors/learner/default_learner_connector.py
 create mode 100644 rllib/connectors/module_to_env/__init__.py
 create mode 100644 rllib/connectors/module_to_env/default_module_to_env.py
 create mode 100644 rllib/connectors/tests/test_from_module_connectors.py

diff --git a/rllib/connectors/connector_context_v2.py b/rllib/connectors/connector_context_v2.py
new file mode 100644
index 0000000000000..628691a9d28f9
--- /dev/null
+++ b/rllib/connectors/connector_context_v2.py
@@ -0,0 +1,66 @@
+from dataclasses import dataclass
+from typing import Any, Optional
+
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.utils.typing import AgentID, EnvType
+from ray.util.annotations import PublicAPI
+
+
+@PublicAPI(stability="alpha")
+@dataclass
+class ConnectorContextV2:
+    """Information that the pieces of a connector pipeline use to communicate.
+
+    ConnectorContextV2 will be passed to each connector (pipeline) call.
+    It may also contain references to the RLModule used, the Env, as well as whether
+    `explore` is True or False (whether forward_exploration or forward_inference was
+    used).
+
+    TODO: Describe use cases, e.g.
+    - state outs need to be fed back as state ins,
+    unless we would like to temporarily store the states in the episode.
+    - agent_to_policy_mappings need to be stored as they might be stochastic. Then the
+    to_env pipeline can properly map back from module (formerly known as policy) IDs
+    to agent IDs.
+
+    Attributes:
+        env: The Env object used to reset/step through in the current Env -> Module
+            setup.
+        rl_module: The RLModule used for forward passes in the current Env -> Module
+            setup.
+        explore: Whether `explore` is currently on. Per convention, if True, the
+            RLModule's `forward_exploration` method should be called, if False, the
+            EnvRunner should call `forward_inference` instead.
+        agent_id: The (optional) current agent ID that the connector should be
+            creating/extracting data for.
+        episode_index: The (optional) index within the list of SingleAgentEpisodes or
+            MultiAgentEpisodes, which each connector is given in a call, that belongs
+            to the given agent_id.
+        data: Optional additional context data that needs to be exchanged between
+            different Connector pieces and -pipelines.
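+
+    Example (an illustrative sketch only; assumes an already-built RLModule
+    `module` and a gym environment `env`, neither of which this snippet creates;
+    note that `data` must be passed explicitly as a dict for the helper methods
+    below to work):
+
+    .. code-block:: python
+
+        ctx = ConnectorContextV2(env=env, rl_module=module, explore=True, data={})
+        # Stash a value for a downstream connector piece (or pipeline) to read.
+        ctx.add_data("agent_to_module_map", {"agent_0": "module_0"})
+        assert ctx.get_data("agent_to_module_map") == {"agent_0": "module_0"}
+        ctx.del_data("agent_to_module_map")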
+ """ + + env: Optional[EnvType] = None + rl_module: Optional[RLModule] = None + explore: Optional[bool] = None + data: Optional[Any] = None + + # TODO (sven): Do these have to be here?? + agent_id: Optional[AgentID] = None + episode_index: Optional[int] = None + + def add_data(self, key, value): + assert key not in self.data + self.data[key] = value + + def get_data(self, key): + assert key in self.data + return self.data[key] + + def override_data(self, key, value): + assert key in self.data + self.data[key] = value + + def del_data(self, key): + assert key in self.data + del self.data[key] diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py new file mode 100644 index 0000000000000..f5c6c1c181b52 --- /dev/null +++ b/rllib/connectors/connector_pipeline_v2.py @@ -0,0 +1,259 @@ +from collections import defaultdict +import logging +from typing import Any, List, Optional, Union + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule +from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI +from ray.util.timer import _Timer + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="alpha") +class ConnectorPipelineV2(ConnectorV2): + """Utility class for quick manipulation of a connector pipeline.""" + + def __init__( + self, + *, + ctx: ConnectorContextV2, + connectors: Optional[List[ConnectorV2]] = None, + **kwargs, + ): + super().__init__(ctx=ctx, **kwargs) + + self.connectors = connectors or [] + self._fix_input_output_types() + + self.timers = defaultdict(_Timer) + + def remove(self, name: str): + """Remove a connector piece by . + + Args: + name: The name of the connector piece to be removed from the pipeline. + """ + idx = -1 + for i, c in enumerate(self.connectors): + if c.__class__.__name__ == name: + idx = i + break + if idx >= 0: + del self.connectors[idx] + self._fix_input_output_types() + logger.info(f"Removed connector {name} from {self.__class__.__name__}.") + else: + logger.warning(f"Trying to remove a non-existent connector {name}.") + + def insert_before(self, name: str, connector: ConnectorV2): + """Insert a new connector before connector + + Args: + name: name of the connector before which a new connector + will get inserted. + connector: a new connector to be inserted. + """ + idx = -1 + for idx, c in enumerate(self.connectors): + if c.__class__.__name__ == name: + break + if idx < 0: + raise ValueError(f"Can not find connector {name}") + self.connectors.insert(idx, connector) + self._fix_input_output_types() + + logger.info( + f"Inserted {connector.__class__.__name__} before {name} " + f"to {self.__class__.__name__}." + ) + + def insert_after(self, name: str, connector: ConnectorV2): + """Insert a new connector after connector + + Args: + name: name of the connector after which a new connector + will get inserted. + connector: a new connector to be inserted. 
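+
+        Example (sketch; `MyFilter` and `MyOtherPiece` are stand-in user
+        connector classes, not part of this PR):
+
+        .. code-block:: python
+
+            pipeline = ConnectorPipelineV2(ctx=ctx, connectors=[MyFilter(ctx=ctx)])
+            # Runs MyFilter first, then MyOtherPiece.
+            pipeline.insert_after("MyFilter", MyOtherPiece(ctx=ctx))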
+ """ + idx = -1 + for idx, c in enumerate(self.connectors): + if c.__class__.__name__ == name: + break + if idx < 0: + raise ValueError(f"Can not find connector {name}") + self.connectors.insert(idx + 1, connector) + self._fix_input_output_types() + + logger.info( + f"Inserted {connector.__class__.__name__} after {name} " + f"to {self.__class__.__name__}." + ) + + def prepend(self, connector: ConnectorV2): + """Append a new connector at the beginning of a connector pipeline. + + Args: + connector: a new connector to be appended. + """ + self.connectors.insert(0, connector) + self._fix_input_output_types() + + logger.info( + f"Added {connector.__class__.__name__} to the beginning of " + f"{self.__class__.__name__}." + ) + + def append(self, connector: ConnectorV2): + """Append a new connector at the end of a connector pipeline. + + Args: + connector: a new connector to be appended. + """ + self.connectors.append(connector) + self._fix_input_output_types() + + logger.info( + f"Added {connector.__class__.__name__} to the end of " + f"{self.__class__.__name__}." + ) + + def __call__( + self, + input_: Any, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + ) -> Any: + ret = input_ + for connector in self.connectors: + timer = self.timers[str(connector)] + with timer: + ret = connector(input_=ret, episodes=episodes, ctx=ctx) + return ret + + # @override(ConnectorV2) + # def serialize(self): + # children = [] + # for c in self.connectors: + # state = c.serialize() + # assert isinstance(state, tuple) and len(state) == 2, ( + # "Serialized connector state must be in the format of " + # f"Tuple[name: str, params: Any]. Instead we got {state}" + # f"for connector {c.__name__}." + # ) + # children.append(state) + # return ConnectorPipelineV2.__name__, children + # + # @override(ConnectorV2) + # @staticmethod + # def from_state(ctx: ConnectorContextV2, params: List[Any]): + # assert ( + # type(params) == list + # ), "AgentConnectorPipeline takes a list of connector params." + # connectors = [] + # for state in params: + # try: + # name, subparams = state + # connectors.append(get_connector(name, ctx, subparams)) + # except Exception as e: + # logger.error(f"Failed to de-serialize connector state: {state}") + # raise e + # return ConnectorPipelineV2(ctx, connectors) + + def __str__(self, indentation: int = 0): + return "\n".join( + [" " * indentation + self.__class__.__name__] + + [c.__str__(indentation + 4) for c in self.connectors] + ) + + def __getitem__(self, key: Union[str, int, type]): + """Returns a list of connectors that fit 'key'. + + If key is a number n, we return a list with the nth element of this pipeline. + If key is a Connector class or a string matching the class name of a + Connector class, we return a list of all connectors in this pipeline matching + the specified class. + + Args: + key: The key to index by + + Returns: The Connector at index `key`. + """ + # In case key is a class + if not isinstance(key, str): + if isinstance(key, slice): + raise NotImplementedError( + "Slicing of ConnectorPipeline is currently not supported." 
+ ) + elif isinstance(key, int): + return [self.connectors[key]] + elif isinstance(key, type): + results = [] + for c in self.connectors: + if issubclass(c.__class__, key): + results.append(c) + return results + else: + raise NotImplementedError( + "Indexing by {} is currently not supported.".format(type(key)) + ) + + results = [] + for c in self.connectors: + if c.__class__.__name__ == key: + results.append(c) + + return results + + def _fix_input_output_types(self): + if len(self.connectors) > 0: + self.input_type = self.connectors[0].input_type + self.output_type = self.connectors[-1].output_type + else: + self.input_type = None + self.output_type = None + + +class EnvToModulePipeline(ConnectorPipelineV2): + def __init__( + self, *, ctx, connectors: Optional[List[ConnectorV2]] = None, **kwargs + ): + super().__init__(ctx=ctx, connectors=connectors, **kwargs) + # Add the default final connector piece for env-to-module pipelines: + # Extracting last obs from episodes and add them to input, iff this has not + # happened in any connector piece in this pipeline before. + if ( + len(self.connectors) == 0 + or type(self.connectors[-1]) is not DefaultEnvToModule + ): + self.append(DefaultEnvToModule(ctx=ctx)) + + def __call__(self, *, input_: Optional[Any] = None, episodes, ctx, **kwargs): + # Make sure user does not necessarily send initial input into this pipeline. + # Might just be empty and to be populated from `episodes`. + return super().__call__( + input_=input_ or {}, + episodes=episodes, + ctx=ctx, + **kwargs, + ) + + +class ModuleToEnvPipeline(ConnectorPipelineV2): + def __init__( + self, *, ctx, connectors: Optional[List[ConnectorV2]] = None, **kwargs + ): + super().__init__(ctx=ctx, connectors=connectors, **kwargs) + + # Add the default final connector piece for env-to-module pipelines: + # Sampling actions from action_dist_inputs and add them to input, iff this has + # not happened in any connector piece in this pipeline before. + if ( + len(self.connectors) == 0 + or type(self.connectors[-1]) is not DefaultModuleToEnv + ): + self.append(DefaultModuleToEnv(ctx=ctx)) diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py new file mode 100644 index 0000000000000..ba18a422b36e4 --- /dev/null +++ b/rllib/connectors/connector_v2.py @@ -0,0 +1,93 @@ +import abc +from typing import Any, List, Tuple + +from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.connectors.input_output_types import INPUT_OUTPUT_TYPES +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class ConnectorV2(abc.ABC): + """Connector base class. + + A connector performs a transformation step, either on envrionment data before it + gets to the RLModule, or on RLModule output before it is sent back to the + environment. + + Connectors may be training-aware, for example, behave slightly differently + during training and inference. + + All connectors are required to be serializable and implement the `serialize()` method. + """ + + # Set these in ALL subclasses. + input_type = INPUT_OUTPUT_TYPES.DATA + output_type = INPUT_OUTPUT_TYPES.DATA + + def __init__(self, *, ctx: ConnectorContextV2, **kwargs): + """Initializes a ConnectorV2 instance. + + Args: + ctx: The current ConnectorContextV2. + **kwargs: Forward API-compatibility kwargs. 
+ """ + self.ctx = ctx + + @abc.abstractmethod + def __call__( + self, + *, + input_: Any, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + **kwargs, + ) -> Any: + """Method for transforming input data into output data. + + Args: + input_: The input data abiding to `self.input_type` to be transformed by + this connector. Transformations might either be done in-place or a new + structure may be returned that matches `self.output_type`. + episodes: The list of SingleAgentEpisode or MultiAgentEpisode objects, + each corresponding to one slot in the vector env. Note that episodes + should always be considered read-only and not be altered. + ctx: The ConnectorContext that might be used to pass along other important + information in between connector pieces (even across pipelines). + kwargs: Forward API-compatibility kwargs. + + Returns: + The transformed connector output abiding to `self.output_type`. + """ + + def __str__(self, indentation: int = 0): + return " " * indentation + self.__class__.__name__ + + # @abc.abstractmethod + # def serialize(self) -> Tuple[str, Any]: + # """Serialize a connector into a JSON serializable Tuple. + + # `serialize()` is required, so that all Connectors are serializable. + + # Returns: + # A tuple of connector's name and its serialized states. + # String should match the name used to register the connector, + # while state can be any single data structure that contains the + # serialized state of the connector. If a connector is stateless, + # state can simply be None. + # """ + + # @staticmethod + # @abc.abstractmethod + # def from_state(ctx: ConnectorContextV2, params: Any) -> "ConnectorV2": + # """De-serialize a JSON params back into a Connector. + + # `from_state()` is required, so that all Connectors are serializable. + + # Args: + # ctx: ConnectorContextV2 for constructing this connector. + # params: Serialized states of the connector to be recovered. + + # Returns: + # De-serialized connector. + # """ diff --git a/rllib/connectors/env_to_module/__init__.py b/rllib/connectors/env_to_module/__init__.py new file mode 100644 index 0000000000000..b86c2f9cb002f --- /dev/null +++ b/rllib/connectors/env_to_module/__init__.py @@ -0,0 +1,5 @@ +from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule + +__all__ = [ + "DefaultEnvToModule", +] diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py new file mode 100644 index 0000000000000..0b9eb2d8669a5 --- /dev/null +++ b/rllib/connectors/env_to_module/default_env_to_module.py @@ -0,0 +1,69 @@ +from typing import Any, List + +import numpy as np + +import tree +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.core.models.base import STATE_IN, STATE_OUT +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.spaces.space_utils import batch +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class DefaultEnvToModule(ConnectorV2): + """Default env-to-module-connector always in the pipeline at the very end. + + Makes sure that there is at least an observation (the most recent one) for each + agent as well as a state - in case the RLModule is recurrent. Doesn't do anything + in case other pieces in the pipeline already take care of populating these fields. 
+ + TODO: Generalize to MultiAgentEpisodes. + """ + + @override(ConnectorV2) + def __call__( + self, + input_: Any, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + **kwargs, + ): + # If obs are not already part of the input, add the most recent ones (from all + # single-agent episodes). + if SampleBatch.OBS not in input_: + observations = [] + for episode in episodes: + # Make sure, we have at least one observation in the episode. + assert len(episode.observations) > 0 + observations.append(episode.observations[-1]) + input_[SampleBatch.OBS] = batch(observations) + + # If our module is recurrent: + # - Add the most recent states to the inputs. + # - Make all inputs have T=1. + if ctx.rl_module.is_stateful(): + states = [] + for episode in episodes: + # Make sure, we have at least one observation in the episode. + assert episode.observations + + # TODO: Generalize to MultiAgentEpisodes. + # Episode just started, get initial state from our RLModule. + if len(episode) == 0: + state = ctx.rl_module.get_initial_state() + else: + state = episode.extra_model_outputs[STATE_OUT][-1] + states.append(state) + + # Make all other inputs have an additional T=1 axis. + input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) + + # Batch states (from list of individual vector sub-env states). + # Note that state ins should NOT have the extra time dimension. + input_[STATE_IN] = batch(states) + + return input_ diff --git a/rllib/connectors/input_output_types.py b/rllib/connectors/input_output_types.py new file mode 100644 index 0000000000000..da9343c040678 --- /dev/null +++ b/rllib/connectors/input_output_types.py @@ -0,0 +1,75 @@ +from enum import Enum + + +class INPUT_OUTPUT_TYPES(Enum): + """Definitions of possible datatypes being processed by individual connectors. + + TODO: Make sure this is valid: + Each connector will always receive a list of Episodes (MultiAgentEpisodes or + SingleAgentEpisodes, depending on the setup and EnvRunner used). In addition, the + output of the previous connector (or an empty dict at the beginnnig) will be + received. + An IntoModule connector pipeline should eventually output a dict mapping module IDs + to SampleBatches + + Typical env-module-env pipeline: + env.step(List[Data]) -> List[MultiAgentEpisode] + + connector: auto-agent-extraction: List[MultiAgentEpisode] -> dict[AgentID, Data] + connector: auto-broadcast: Data -> Data (legacy postprocessing and filtering) + under the hood: dict[AgentID, Data] -> dict[AgentID, Data] + connector: auto-policy-mapping: dict[AgentID, Data] -> dict[ModuleID, Data] + + module.forward_exploration() -> dict[ModuleID, Data] + + connector: auto-action-sampling: dict[ModuleID, Data] -> dict[ModuleID, Data] + connector: action-clipping: Data -> Data + under the hood: dict[ModuleID, Data] -> dict[ModuleID, Data] + connector: auto-policy-unmapping: dict[ModuleID, Data] -> dict[AgentID, Data] + (using information stored in connector ctx) + connector: auto-action-sorting (using information stored in connector ctx): + dict[AgentID, Data] -> List[Data] + + env.step(List[Data]) ... 
repeats
+
+    Typical training pipeline:
+
+
+    Default env-module-env pipeline, picked by RLlib if no connector is defined by
+    the user AND the module is an RNN:
+    env.step(List[Data]) -> List[MultiAgentEpisode]
+
+    connector: auto-agent-extraction: List[MultiAgentEpisode] -> dict[AgentID, Data]
+    connector: auto-policy-mapping: dict[AgentID, Data] -> dict[ModuleID, Data]
+    connector: auto-state-handling: dict[ModuleID, Data] ->
+        dict[ModuleID, Data + state] (using information stored in connector ctx)
+
+    module.forward_exploration() -> dict[ModuleID, Data + state]
+
+    connector: auto-state-handling: dict[ModuleID, Data + state] ->
+        dict[ModuleID, Data] (state was stored in ctx)
+    connector: auto-policy-unmapping: dict[ModuleID, Data] ->
+        dict[AgentID, Data] (using information stored in connector ctx)
+    connector: auto-action-sorting (using information stored in connector ctx):
+        dict[AgentID, Data] -> List[Data]
+
+    env.step(List[Data]) ... repeats
+    """
+
+    # Normally, after `env.step()`, we have a list (vector env) of MultiAgentEpisodes
+    # as a starting point.
+    LIST_OF_MULTI_AGENT_EPISODES = 0
+    # In the simplified case, there might be a list of SingleAgentEpisodes, instead.
+    LIST_OF_SINGLE_AGENT_EPISODES = 1
+
+    # From each MultiAgentEpisode, we might extract a dict, mapping agent IDs to data.
+    LIST_OF_DICTS_MAPPING_AGENT_IDS_TO_DATA = 10
+    # Eventually boiling down to simply one dict mapping agent IDs to data.
+    #
+    DICT_MAPPING_AGENT_IDS_TO_DATA = 11
+
+    # Right after the module's forward pass, we usually have a single dict mapping
+    # Module IDs to data (model outputs).
+    DICT_MAPPING_MODULE_IDS_TO_DATA = 12
+
+    DATA = 13  # Note: must be a unique value; a repeated one would create an enum alias.
diff --git a/rllib/connectors/learner/__init__.py b/rllib/connectors/learner/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/rllib/connectors/learner/default_learner_connector.py b/rllib/connectors/learner/default_learner_connector.py
new file mode 100644
index 0000000000000..592faa711a4a6
--- /dev/null
+++ b/rllib/connectors/learner/default_learner_connector.py
@@ -0,0 +1,212 @@
+from functools import partial
+from typing import Any
+
+import numpy as np
+import tree
+
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
+from ray.rllib.core.models.base import STATE_IN, STATE_OUT
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.utils.numpy import convert_to_numpy
+
+
+class DefaultLearnerConnector(ConnectorV2):
+    """Connector added by default by RLlib to the end of the learner connector pipeline.
+
+    If provided with `episodes` data, this connector piece makes sure that the final
+    train batch going into the RLModule for updating (`forward_train()` call) contains
+    at the minimum:
+    - Observations: From all episodes under the SampleBatch.OBS key.
+    - Actions, rewards, terminal/truncation flags: From all episodes under the
+      respective keys.
+    - All data inside the episodes' `extra_model_outs` property, e.g. action logp and
+      action probs.
+    - States: If the RLModule is stateful, the episodes' STATE_OUTS will be extracted
+    and restructured under a new STATE_IN key in such a way that the resulting STATE_IN
+    batch has the shape (B', ...). Here, B' is the sum of splits we have to do over
+    the given episodes, such that each chunk is at most `max_seq_len` long (T-axis).
+    Also, all other data will be properly reshaped into (B, T=max_seq_len, ...) and
+    will be zero-padded, if necessary.
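+
+    Padding sketch (illustrative numbers only): with max_seq_len=4 and two
+    episodes of lengths 5 and 2, the data is split into row-chunks of lengths
+    [4, 1] and [2], then zero-padded to T=4, giving B'=3:
+
+    .. code-block:: python
+
+        # loss_mask (B'=3, T=4); True = valid timestep, False = zero-padding:
+        # [[ True,  True,  True,  True],
+        #  [ True, False, False, False],
+        #  [ True,  True, False, False]]
+        # seq_lens: [4, 1, 2]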
+ + If the user wants to customize their own data under the given keys (e.g. obs, + actions, ...), they can extract from the episodes or recompute from `input_` + their own data and store it under those keys (in `input_`). In such a case, this + connector will not touch the data under these keys. + """ + + def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs): + # If episodes are provided, extract the essential data from them, but only if + # this data is not present yet in `input_`. + if not episodes: + return input_ + + # Get data dicts for all episodes. + data_dicts = [episode.get_data_dict() for episode in episodes] + + state_in = None + T = ctx.rl_module.config.model_config_dict.get("max_seq_len") + + # Special handling of STATE_OUT/STATE_IN keys: + if ctx.rl_module.is_stateful() and STATE_IN not in input_: + if T is None: + raise ValueError( + "You are using a stateful RLModule and are not providing custom " + f"'{STATE_IN}' data through your connector(s)! Therefore, you need " + "to provide the 'max_seq_len' key inside your model config dict. " + "You can set this dict and/or override keys in it via " + "`config.training(model={'max_seq_len': x})`." + ) + # Get model init state. + init_state = convert_to_numpy(ctx.rl_module.get_initial_state()) + # Get STATE_OUTs for all episodes and only keep those (as STATE_INs) that + # are located at the `max_seq_len` edges (state inputs to RNNs only have a + # B-axis, no T-axis). + state_ins = [] + for episode, data_dict in zip(episodes, data_dicts): + # Remove state outs (should not be part of the T-axis rearrangements). + state_outs = data_dict.pop(STATE_OUT) + state_ins.append( + tree.map_structure( + # [::T] = only keep every Tth (max_seq_len) state in. + # [:-1] = shift state outs by one (ignore very last state out, but + # therefore add the init state at the beginning). + lambda i, o: np.concatenate([[i], o[:-1]])[::T], + ( + # Episode has a (reset) beginning -> Prepend initial state. + init_state + if episode.t_started == 0 + # Episode starts somewhere in the middle (is a cut continuation + # chunk) -> Use previous chunk's last STATE_OUT as initial state. + else episode.get_extra_model_outputs( + key=STATE_OUT, indices=-len(episode) - 1 + ) + ), + state_outs, + ) + ) + # Concatenate the individual episodes' state ins. + state_in = tree.map_structure(lambda *s: np.concatenate(s), *state_ins) + + # Before adding anything else to the `input_`, add the time axis to existing + # data. + input_ = tree.map_structure( + lambda s: split_and_pad_single_record(s, episodes, T=T), + input_, + ) + + # Set the reduce function for all the data we might still have to extract + # from our list of episodes. This function takes a list of data (e.g. obs) + # with each item in the list representing one episode and properly + # splits along the time axis and zero-pads if necessary (based on + # max_seq_len). + reduce_fn = partial(split_and_pad, T=T) + + # No stateful module, normal batch (w/o T-axis or zero-padding). + else: + # Set the reduce function for all the data we might still have to extract + # from our list of episodes. Simply concatenate the data from the different + # episodes along the batch axis (axis=0). + reduce_fn = np.concatenate + + # Extract all data from the episodes, if not already in `input_`. 
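+        # (Illustrative, with assumed sizes: for T=4 and two episodes of
+        # lengths 5 and 2, the stateful path's `split_and_pad` yields arrays of
+        # shape (3, 4, ...), while the stateless `np.concatenate` path yields
+        # shape (7, ...).)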
+ for key in [ + SampleBatch.OBS, + SampleBatch.ACTIONS, + SampleBatch.REWARDS, + SampleBatch.TERMINATEDS, + SampleBatch.TRUNCATEDS, + SampleBatch.T, # TODO: remove (normally not needed in train batch) + *episodes[0].extra_model_outputs.keys(), + ]: + if key not in input_ and key != STATE_OUT: + # Concatenate everything together (along B-axis=0). + input_[key] = tree.map_structure( + lambda *s: reduce_fn(s), + *[d[key] for d in data_dicts], + ) + + # Infos (always as lists). + # TODO:uncomment if SampleBatch.INFOS not in input_: + # input_[SampleBatch.INFOS] = sum( + # [d[SampleBatch.INFOS] for d in data_dicts], + # [], + # ) + + if ctx.rl_module.is_stateful(): + # Now that all "normal" fields are time-dim'd and zero-padded, add + # the STATE_IN column to `input_`. + input_[STATE_IN] = state_in + # Create the zero-padding loss mask. + ( + input_["loss_mask"], + input_[SampleBatch.SEQ_LENS], + ) = create_mask_and_seq_lens( + episode_lens=[len(episode) for episode in episodes], + T=T, + ) + + return input_ + + +def split_and_pad(episodes_data, T): + all_chunks = [] + + for data in episodes_data: + num_chunks = int(np.ceil(data.shape[0] / T)) + + for i in range(num_chunks): + start_index = i * T + end_index = start_index + T + + # Extract the chunk + chunk = data[start_index:end_index] + + # Pad the chunk if it's shorter than T + if chunk.shape[0] < T: + padding_shape = [(0, T - chunk.shape[0])] + [ + (0, 0) for _ in range(chunk.ndim - 1) + ] + chunk = np.pad(chunk, pad_width=padding_shape, mode="constant") + + all_chunks.append(chunk) + + # Combine all chunks into a single array + result = np.concatenate(all_chunks, axis=0) + + # Reshape the array to include the time dimension T + # The new shape should be (-1, T) + original dimensions (excluding the batch dimension) + result = result.reshape((-1, T) + result.shape[1:]) + + return result + + +def split_and_pad_single_record(data, episodes, T): + episodes_data = [] + idx = 0 + for episode in episodes: + len_ = len(episode) + episodes_data.append(data[idx : idx + len_]) + idx += len_ + return split_and_pad(episodes_data, T) + + +def create_mask_and_seq_lens(episode_lens, T): + mask = [] + seq_lens = [] + for episode_len in episode_lens: + len_ = min(episode_len, T) + seq_lens.append(len_) + row = [1] * len_ + [0] * (T - len_) + mask.append(row) + + # Handle sequence lengths greater than T. 
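+        # (E.g., episode_len=10 and T=4 -> three rows with seq_lens [4, 4, 2].)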
+ overflow = episode_len - T + while overflow > 0: + len_ = min(overflow, T) + seq_lens.append(len_) + extra_row = [1] * len_ + [0] * (T - len_) + mask.append(extra_row) + overflow -= T + + return np.array(mask, dtype=np.bool_), np.array(seq_lens, dtype=np.int32) diff --git a/rllib/connectors/module_to_env/__init__.py b/rllib/connectors/module_to_env/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py new file mode 100644 index 0000000000000..5bf0a2af0c8a4 --- /dev/null +++ b/rllib/connectors/module_to_env/default_module_to_env.py @@ -0,0 +1,95 @@ +from typing import Any + +import numpy as np +import tree # pip install dm_tree + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.core.models.base import STATE_OUT +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class DefaultModuleToEnv(ConnectorV2): + """A connector that samples actions given action dist. inputs and a dist. class. + + The connector will only sample from the distribution, if the ACTIONS key + cannot be found in the connector's input. Otherwise, it'll behave simply as pass + through (noop). If ACTIONS is not present, but ACTION_DIST_INPUTS are, will create + a distribution from the RLModule and sample from it (deterministically, if + we are not exploring, stochastically, if we are). + + input_type: INPUT_OUTPUT_TYPES.DICT_OF_MODULE_IDS_TO_DATA + Operates per RLModule as it will have to pull the action distribution from each + in order to sample actions if necessary. Searches for the ACTIONS and + ACTION_DIST_INPUTS keys in a module's outputs and - should ACTIONS not be found - + sample actions from the module's action distribution. + output_type: INPUT_OUTPUT_TYPES.DICT_OF_MODULE_IDS_TO_DATA (same as input: data in, + data out, however, data + out might contain an additional ACTIONS key if it was not previously present + in the input). + """ + + @override(ConnectorV2) + def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2) -> Any: + + # Loop through all modules that created some output. + # for mid in input_.keys(): + # sa_module = ctx.rl_module.get_module(module_id=mid) + + # If our RLModule is stateful, remove the T=1 axis from all model outputs + # (except the state outs, which never have this extra time axis). + if ctx.rl_module.is_stateful(): + state = input_.pop(STATE_OUT, None) + input_ = tree.map_structure(lambda s: np.squeeze(s, axis=1), input_) + if state: + input_[STATE_OUT] = state + + # ACTION_DIST_INPUTS field returned by `forward_exploration()` -> + # Create a distribution object. + action_dist = None + # The RLModule has already computed actions. + if ( + SampleBatch.ACTION_DIST_INPUTS in input_ + and SampleBatch.ACTION_LOGP not in input_ + ): + dist_inputs = input_[SampleBatch.ACTION_DIST_INPUTS] + if ctx.explore: + action_dist_class = ctx.rl_module.get_exploration_action_dist_cls() + else: + action_dist_class = ctx.rl_module.get_inference_action_dist_cls() + action_dist = action_dist_class.from_logits(dist_inputs) + if not ctx.explore: + action_dist = action_dist.to_deterministic() + + # If `forward_...()` returned actions, use them here as-is. 
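+        # (E.g., a module whose `forward_exploration()` already returns an
+        # "actions" key passes through unchanged here; one that only returns
+        # "action_dist_inputs" gets its actions sampled below.)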
+ if SampleBatch.ACTIONS in input_: + actions = input_[SampleBatch.ACTIONS] + # Otherwise, sample actions from the distribution. + else: + if action_dist is None: + raise KeyError( + "Your RLModule's `forward_[explore|inference]()` methods must " + f"return a dict with either the {SampleBatch.ACTIONS} key or " + f"the {SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + ) + actions = action_dist.sample() + input_[SampleBatch.ACTIONS] = actions + + # Compute action-logp and action-prob from distribution and add to + # output, if possible. + if action_dist is not None and SampleBatch.ACTION_LOGP not in input_: + input_[SampleBatch.ACTION_LOGP] = action_dist.logp(actions) + + return input_ + + # @override(Connector) + # def serialize(self): + # return ClipActions.__name__, None + + # @staticmethod + # TODO + # def from_state(ctx: ConnectorContext, params: Any): + # return ClipActions(ctx) diff --git a/rllib/connectors/tests/test_from_module_connectors.py b/rllib/connectors/tests/test_from_module_connectors.py new file mode 100644 index 0000000000000..ac0844ff46f0f --- /dev/null +++ b/rllib/connectors/tests/test_from_module_connectors.py @@ -0,0 +1,106 @@ +import unittest + +import gymnasium as gym +import numpy as np + +from ray.rllib.connectors.into_env.clip_actions import ClipActions +from ray.rllib.connectors.into_env.unsquash_actions import UnsquashActions +from ray.rllib.connectors.connector import ConnectorContextV2 +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.test_utils import check + + +class TestFromModuleConnectors(unittest.TestCase): + def test_connector_pipeline(self): + ctx = ConnectorContext() + connectors = [ConvertToNumpyConnector(ctx)] + pipeline = ActionConnectorPipeline(ctx, connectors) + name, params = pipeline.serialize() + restored = get_connector(name, ctx, params) + self.assertTrue(isinstance(restored, ActionConnectorPipeline)) + self.assertTrue(isinstance(restored.connectors[0], ConvertToNumpyConnector)) + # There should not be any timer yet + self.assertFalse(bool(pipeline.timers.values())) + pipeline(ActionConnectorDataType(0, 0, {}, ([1], [], None))) + # After a first input, there should be one timer + self.assertEquals(len(pipeline.timers.values()), 1) + + def test_clip_actions_connector(self): + ctx = ConnectorContextV2() + + connector = ClipActions( + action_space=gym.spaces.Box(low=0.0, high=6.0, shape=(1,)) + ) + + # name, params = connector.serialize() + # self.assertEqual(name, "ClipActions") + + # restored = get_connector(name, ctx, params) + # self.assertTrue(isinstance(restored, ClipActionsConnector)) + + for action in [8.8, 6.0, -0.2, 0.0, 5.9999, 3.2, 6.1]: + output = connector( + {SampleBatch.ACTIONS: np.array([action])}, + ctx, + ) + check(output[SampleBatch.ACTIONS], np.clip(action, 0.0, 6.0)) + + connector = ClipActions( + action_space=gym.spaces.Dict( + { + "a": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,)), + "b": gym.spaces.Discrete(3), + } + ) + ) + for action in [ + {"a": np.array([8.8, 8.9]), "b": 1}, + {"a": np.array([9.0, -1.0]), "b": 0}, + {"a": np.array([100.0, 200.0]), "b": 2}, + {"a": np.array([-1000, 0.0001]), "b": 2}, + {"a": np.array([0.4, 1.2]), "b": 0}, + {"a": np.array([1.0, -1.0]), "b": 1}, + ]: + output = connector({SampleBatch.ACTIONS: action}, ctx) + check( + output[SampleBatch.ACTIONS], + {"a": np.clip(action["a"], -1.0, 1.0), "b": action["b"]}, + ) + + def test_unsquash_actions_connector(self): + ctx = ConnectorContextV2() + + connector = UnsquashActions( + 
action_space=gym.spaces.Box(low=-2.0, high=6.0, shape=(2,))
+        )
+
+        # name, params = connector.serialize()
+        # self.assertEqual(name, "UnsquashActions")
+
+        # restored = get_connector(name, ctx, params)
+        # self.assertTrue(isinstance(restored, NormalizeActionsConnector))
+
+        for action in [
+            [1.8, 1.8],
+            [1.0, -1.0],
+            [-1.0, 1.1],
+            [0.0, 0.0],
+            [10.0, 0.5],
+            [0.5, -0.5],
+        ]:
+            action = np.array(action)
+            output = connector(
+                {SampleBatch.ACTIONS: action},
+                ctx,
+            )
+            check(
+                output[SampleBatch.ACTIONS],
+                np.clip((action + 1.0) * 4.0 - 2.0, -2.0, 6.0),
+            )
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+
+    sys.exit(pytest.main(["-v", __file__]))

From 99d9019b735258376235c85832514fe916081b0a Mon Sep 17 00:00:00 2001
From: sven1977
Date: Fri, 17 Nov 2023 11:47:52 +0100
Subject: [PATCH 02/15] wip

Signed-off-by: sven1977
---
 rllib/connectors/connector_context_v2.py | 16 +-
 rllib/connectors/connector_pipeline_v2.py | 82 +++------
 rllib/connectors/connector_v2.py | 102 +++++++-----
 .../env_to_module/default_env_to_module.py | 46 +++---
 .../env_to_module/env_to_module_pipeline.py | 47 ++++++
 .../learner/default_learner_connector.py | 148 ++++++-----------
 .../module_to_env/default_module_to_env.py | 68 ++++----
 .../module_to_env/module_to_env_pipeline.py | 27 ++++
 rllib/connectors/utils/__init__.py | 0
 rllib/connectors/utils/zero_padding.py | 135 ++++++++++++++++
 10 files changed, 413 insertions(+), 258 deletions(-)
 create mode 100644 rllib/connectors/env_to_module/env_to_module_pipeline.py
 create mode 100644 rllib/connectors/module_to_env/module_to_env_pipeline.py
 create mode 100644 rllib/connectors/utils/__init__.py
 create mode 100644 rllib/connectors/utils/zero_padding.py

diff --git a/rllib/connectors/connector_context_v2.py b/rllib/connectors/connector_context_v2.py
index 628691a9d28f9..fff114618dd15 100644
--- a/rllib/connectors/connector_context_v2.py
+++ b/rllib/connectors/connector_context_v2.py
@@ -24,13 +24,15 @@ class ConnectorContextV2:
     to agent IDs.
 
     Attributes:
-        env: The Env object used to reset/step through in the current Env -> Module
-            setup.
-        rl_module: The RLModule used for forward passes in the current Env -> Module
-            setup.
+        env: The Env object used to reset/step through in the current Env->Module
+            setup. This will be None in contexts used in a Learner connector pipeline.
+        rl_module: The RLModule used for either action computing forward passes
+            (`forward_exploration|inference()`) in the current Env->Module setup
+            or `forward_train()` calls in a Learner connector pipeline.
         explore: Whether `explore` is currently on. Per convention, if True, the
-            RLModule's `forward_exploration` method should be called, if False, the
-            EnvRunner should call `forward_inference` instead.
+            RLModule's `forward_exploration()` method should be called, if False, the
+            EnvRunner should call `forward_inference()` instead. Should be None inside
+            Learner connector pipelines.
         agent_id: The (optional) current agent ID that the connector should be
             creating/extracting data for.
         episode_index: The (optional) index within the list of SingleAgentEpisodes or
@@ -38,6 +40,8 @@ class ConnectorContextV2:
             to the given agent_id.
         data: Optional additional context data that needs to be exchanged between
             different Connector pieces and -pipelines.
+
+    TODO (sven): Maybe we should have the AlgorithmConfig here as well.
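+
+    Example (sketch): a Learner-side context would typically only carry the
+    RLModule, e.g. `ConnectorContextV2(rl_module=module, data={})`, with `env`
+    and `explore` both left as None.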
""" env: Optional[EnvType] = None diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index f5c6c1c181b52..f3d9c36508682 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -1,6 +1,6 @@ from collections import defaultdict import logging -from typing import Any, List, Optional, Union +from typing import Any, List, Optional, Type, Union from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 @@ -32,8 +32,24 @@ def __init__( self.timers = defaultdict(_Timer) - def remove(self, name: str): - """Remove a connector piece by . + @override(ConnectorV2) + def __call__( + self, + input_: Any, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + **kwargs, + ) -> Any: + """""" + ret = input_ + for connector in self.connectors: + timer = self.timers[str(connector)] + with timer: + ret = connector(input_=ret, episodes=episodes, ctx=ctx) + return ret + + def remove(self, name_or_class: Union[str, Type]): + """Remove a single connector piece in this pipeline by its name or class. Args: name: The name of the connector piece to be removed from the pipeline. @@ -50,7 +66,7 @@ def remove(self, name: str): else: logger.warning(f"Trying to remove a non-existent connector {name}.") - def insert_before(self, name: str, connector: ConnectorV2): + def insert_before(self, name_or_class: Union[str, Type], connector: ConnectorV2): """Insert a new connector before connector Args: @@ -72,7 +88,7 @@ def insert_before(self, name: str, connector: ConnectorV2): f"to {self.__class__.__name__}." ) - def insert_after(self, name: str, connector: ConnectorV2): + def insert_after(self, name_or_class: Union[str, Type], connector: ConnectorV2): """Insert a new connector after connector Args: @@ -122,19 +138,6 @@ def append(self, connector: ConnectorV2): f"{self.__class__.__name__}." ) - def __call__( - self, - input_: Any, - episodes: List[EpisodeType], - ctx: ConnectorContextV2, - ) -> Any: - ret = input_ - for connector in self.connectors: - timer = self.timers[str(connector)] - with timer: - ret = connector(input_=ret, episodes=episodes, ctx=ctx) - return ret - # @override(ConnectorV2) # def serialize(self): # children = [] @@ -179,7 +182,7 @@ def __getitem__(self, key: Union[str, int, type]): the specified class. Args: - key: The key to index by + key: The key to index by. Returns: The Connector at index `key`. """ @@ -216,44 +219,3 @@ def _fix_input_output_types(self): else: self.input_type = None self.output_type = None - - -class EnvToModulePipeline(ConnectorPipelineV2): - def __init__( - self, *, ctx, connectors: Optional[List[ConnectorV2]] = None, **kwargs - ): - super().__init__(ctx=ctx, connectors=connectors, **kwargs) - # Add the default final connector piece for env-to-module pipelines: - # Extracting last obs from episodes and add them to input, iff this has not - # happened in any connector piece in this pipeline before. - if ( - len(self.connectors) == 0 - or type(self.connectors[-1]) is not DefaultEnvToModule - ): - self.append(DefaultEnvToModule(ctx=ctx)) - - def __call__(self, *, input_: Optional[Any] = None, episodes, ctx, **kwargs): - # Make sure user does not necessarily send initial input into this pipeline. - # Might just be empty and to be populated from `episodes`. 
- return super().__call__( - input_=input_ or {}, - episodes=episodes, - ctx=ctx, - **kwargs, - ) - - -class ModuleToEnvPipeline(ConnectorPipelineV2): - def __init__( - self, *, ctx, connectors: Optional[List[ConnectorV2]] = None, **kwargs - ): - super().__init__(ctx=ctx, connectors=connectors, **kwargs) - - # Add the default final connector piece for env-to-module pipelines: - # Sampling actions from action_dist_inputs and add them to input, iff this has - # not happened in any connector piece in this pipeline before. - if ( - len(self.connectors) == 0 - or type(self.connectors[-1]) is not DefaultModuleToEnv - ): - self.append(DefaultModuleToEnv(ctx=ctx)) diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index ba18a422b36e4..0c80ab64d2228 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -1,5 +1,5 @@ import abc -from typing import Any, List, Tuple +from typing import Any, Dict, List, Tuple from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 from ray.rllib.connectors.input_output_types import INPUT_OUTPUT_TYPES @@ -9,19 +9,45 @@ @PublicAPI(stability="alpha") class ConnectorV2(abc.ABC): - """Connector base class. - - A connector performs a transformation step, either on envrionment data before it - gets to the RLModule, or on RLModule output before it is sent back to the - environment. - - Connectors may be training-aware, for example, behave slightly differently - during training and inference. - - All connectors are required to be serializable and implement the `serialize()` method. + """Base class defining the API for an individual "connector piece". + + A ConnectorV2 ("connector piece") is usually part of a series of pieces within + a "connector pipeline", which in itself also abides to this very API. + For example, you might have a connector pipeline consisting of two connector pieces, + A and B, both instances of subclasses of ConnectorV2 and each one performing a + particular transformation on their input data. The resulting connector pipeline + (A->B) itself also abides to this very ConnectorV2 API and could thus be part of yet + another, higher-level connector pipeline. + + Any ConnectorV2 instances (individual pieces or several connector pieces in a + pipeline) must be callable by overriding their `__call__()` method. When called, + they take the outputs of a previous connector piece (or an empty dict if there are + no previous pieces) as well as all the data collected thus far in the ongoing + episode(s) (only applies to connectors used in EnvRunners) or retrieved from a + replay buffer or from an environment sampling step (only applies to connectors used + in Learner pipelines). From this data (previous piece's output and possibly + episodes), a ConnectorV2 then performs a transformation step. + + There are 3 types of pipelines a ConnectorV2 can belong to: + 1) env-to-module: The connector transforms envrionment data before it gets to the + RLModule. + 2) module-to-env: The connector transforms RLModule outputs before they are sent + back to the environment (as actions). + 3) learner pipeline: The connector transforms data coming directly from an + environment sampling step or a replay buffer and will be sent into the RLModule's + `forward_train()` method afterwards to compute the loss inputs. + + Some connectors might be stateful, for example for keeping track of observation + filtering stats (mean and stddev values). 
States of all connectors and connector + pipelines are frequently being synchronized between the EnvRunners (owning the + env-to-module and module-to-env pipelines) and the Learners (owning the Learner + pipelines). """ # Set these in ALL subclasses. + # TODO (sven): Irrelevant for single-agent cases. Once multi-agent is supported + # by ConnectorV2, we need to elaborate more on the different input/output types. + # For single-agent, the types should always be just INPUT_OUTPUT_TYPES.DATA. input_type = INPUT_OUTPUT_TYPES.DATA output_type = INPUT_OUTPUT_TYPES.DATA @@ -29,7 +55,7 @@ def __init__(self, *, ctx: ConnectorContextV2, **kwargs): """Initializes a ConnectorV2 instance. Args: - ctx: The current ConnectorContextV2. + ctx: The initial ConnectorContextV2. **kwargs: Forward API-compatibility kwargs. """ self.ctx = ctx @@ -48,13 +74,14 @@ def __call__( Args: input_: The input data abiding to `self.input_type` to be transformed by this connector. Transformations might either be done in-place or a new - structure may be returned that matches `self.output_type`. + structure may be returned. The returned data must match + `self.output_type`. episodes: The list of SingleAgentEpisode or MultiAgentEpisode objects, - each corresponding to one slot in the vector env. Note that episodes - should always be considered read-only and not be altered. - ctx: The ConnectorContext that might be used to pass along other important - information in between connector pieces (even across pipelines). - kwargs: Forward API-compatibility kwargs. + each corresponding to one slot in a gym.vector.Env. + ctx: The ConnectorContextV2, containing the current Env, RLModule, and other + context-relevant information. It can also be used to pass along + information between connector pieces (even across different pipelines). + **kwargs: Forward API-compatibility kwargs. Returns: The transformed connector output abiding to `self.output_type`. @@ -63,31 +90,22 @@ def __call__( def __str__(self, indentation: int = 0): return " " * indentation + self.__class__.__name__ - # @abc.abstractmethod - # def serialize(self) -> Tuple[str, Any]: - # """Serialize a connector into a JSON serializable Tuple. - - # `serialize()` is required, so that all Connectors are serializable. + def get_state(self) -> Dict[str, Any]: + """Returns the current state of this ConnectorV2. - # Returns: - # A tuple of connector's name and its serialized states. - # String should match the name used to register the connector, - # while state can be any single data structure that contains the - # serialized state of the connector. If a connector is stateless, - # state can simply be None. - # """ + Used for checkpointing (connectors may be stateful) as well as synchronization + between connectors that are run on the (distributed) EnvRunners vs those that + run on the (distributed) Learners. - # @staticmethod - # @abc.abstractmethod - # def from_state(ctx: ConnectorContextV2, params: Any) -> "ConnectorV2": - # """De-serialize a JSON params back into a Connector. - - # `from_state()` is required, so that all Connectors are serializable. + Returns: + A dict mapping str keys to state information. + """ + return {} - # Args: - # ctx: ConnectorContextV2 for constructing this connector. - # params: Serialized states of the connector to be recovered. + def set_state(self, state: Dict[str, Any]) -> None: + """Sets the state of this connector to the provided one. - # Returns: - # De-serialized connector. - # """ + Args: + state: The new state to set this connector to. 
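+
+        Example (sketch of the sync this enables; `pipeline_a` and `pipeline_b`
+        are assumed connector (pipeline) instances, e.g. one on a Learner and
+        one on an EnvRunner):
+
+        .. code-block:: python
+
+            # Bring pipeline_b up to date with pipeline_a's current state.
+            pipeline_b.set_state(pipeline_a.get_state())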
+ """ + pass diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py index 0b9eb2d8669a5..9d7616011b8c7 100644 --- a/rllib/connectors/env_to_module/default_env_to_module.py +++ b/rllib/connectors/env_to_module/default_env_to_module.py @@ -15,13 +15,18 @@ @PublicAPI(stability="alpha") class DefaultEnvToModule(ConnectorV2): - """Default env-to-module-connector always in the pipeline at the very end. + """Default connector piece added by RLlib to the end of any env-to-module pipeline. - Makes sure that there is at least an observation (the most recent one) for each - agent as well as a state - in case the RLModule is recurrent. Doesn't do anything - in case other pieces in the pipeline already take care of populating these fields. + Makes sure that the output data will have at the minimum: + a) An observation (the most recent one returned by `env.step()`) under the + SampleBatch.OBS key for each agent and + b) In case the RLModule is stateful, a STATE_IN key populated with the most recently + computed STATE_OUT. - TODO: Generalize to MultiAgentEpisodes. + The connector will not add any new data in case other connector pieces in the + pipeline already take care of populating these fields (obs and state in). + + TODO (sven): Generalize to MultiAgentEpisodes. """ @override(ConnectorV2) @@ -31,37 +36,40 @@ def __call__( episodes: List[EpisodeType], ctx: ConnectorContextV2, **kwargs, - ): - # If obs are not already part of the input, add the most recent ones (from all - # single-agent episodes). + ) -> Any: + # If observations cannot be found in `input`, add the most recent ones (from all + # episodes). if SampleBatch.OBS not in input_: + # Collect all most-recent observations from given episodes. observations = [] for episode in episodes: - # Make sure, we have at least one observation in the episode. - assert len(episode.observations) > 0 - observations.append(episode.observations[-1]) + observations.append(episode.get_observation(indices=-1)) + # Batch all collected observations together. input_[SampleBatch.OBS] = batch(observations) - # If our module is recurrent: - # - Add the most recent states to the inputs. - # - Make all inputs have T=1. + # If our module is stateful: + # - Add the most recent STATE_OUTs to `input_`. + # - Make all data in `input_` have a time rank (T=1). if ctx.rl_module.is_stateful(): + # Make all other inputs have an additional T=1 axis. + input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) + + # Collect all most recently computed STATE_OUT (or use initial states from + # RLModule if at beginning of episode). states = [] for episode in episodes: # Make sure, we have at least one observation in the episode. assert episode.observations - # TODO: Generalize to MultiAgentEpisodes. - # Episode just started, get initial state from our RLModule. + # TODO (sven): Generalize to MultiAgentEpisodes. + # Episode just started -> Get initial state from our RLModule. if len(episode) == 0: state = ctx.rl_module.get_initial_state() + # Episode is already ongoing -> Use most recent STATE_OUT. else: state = episode.extra_model_outputs[STATE_OUT][-1] states.append(state) - # Make all other inputs have an additional T=1 axis. - input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) - # Batch states (from list of individual vector sub-env states). # Note that state ins should NOT have the extra time dimension. 
input_[STATE_IN] = batch(states) diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py new file mode 100644 index 0000000000000..63630229e57bc --- /dev/null +++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py @@ -0,0 +1,47 @@ +from typing import Any, List, Optional + +from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 +from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType + + +class EnvToModulePipeline(ConnectorPipelineV2): + def __init__( + self, + *, + ctx: ConnectorContextV2, + connectors: Optional[List[ConnectorV2]] = None, + **kwargs, + ): + super().__init__(ctx=ctx, connectors=connectors, **kwargs) + # Add the default final connector piece for env-to-module pipelines: + # Extracting last obs from episodes and add them to input, iff this has not + # happened in any connector piece in this pipeline before. + if ( + len(self.connectors) == 0 + or type(self.connectors[-1]) is not DefaultEnvToModule + ): + self.append(DefaultEnvToModule(ctx=ctx)) + + @override(ConnectorPipelineV2) + def __call__( + self, + *, + input_: Optional[Any] = None, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + **kwargs, + ) -> Any: + # Make sure user does not have to send initial input into this pipeline. + # Might just be empty and to be populated from `episodes`. + return super().__call__( + input_=input_ or {}, + episodes=episodes, + ctx=ctx, + **kwargs, + ) + + diff --git a/rllib/connectors/learner/default_learner_connector.py b/rllib/connectors/learner/default_learner_connector.py index 592faa711a4a6..9a636a0fc0d9c 100644 --- a/rllib/connectors/learner/default_learner_connector.py +++ b/rllib/connectors/learner/default_learner_connector.py @@ -1,18 +1,24 @@ from functools import partial -from typing import Any +from typing import Any, List import numpy as np import tree from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 +from ray.rllib.connectors.utils.zero_padding import ( + create_mask_and_seq_lens, + split_and_pad, + split_and_pad_single_record, +) from ray.rllib.core.models.base import STATE_IN, STATE_OUT from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import EpisodeType class DefaultLearnerConnector(ConnectorV2): - """Connector added by default by RLlib to the end of the learner connector pipeline. + """Connector added by default by RLlib to the end of any learner connector pipeline. If provided with `episodes` data, this connector piece makes sure that the final train batch going into the RLModule for updating (`forward_train()` call) contains @@ -21,7 +27,7 @@ class DefaultLearnerConnector(ConnectorV2): - Actions, rewards, terminal/truncation flags: From all episodes under the respective keys. - All data inside the episodes' `extra_model_outs` property, e.g. action logp and - action probs. + action probs under the respective keys. - States: If the RLModule is stateful, the episodes' STATE_OUTS will be extracted and restructured under a new STATE_IN key in such a way that the resulting STATE_IN batch has the shape (B', ...). 
       Here, B' is the sum of splits we have to do over
@@ -31,23 +37,32 @@ class DefaultLearnerConnector(ConnectorV2):
 
     If the user wants to customize their own data under the given keys (e.g. obs,
     actions, ...), they can extract from the episodes or recompute from `input_`
-    their own data and store it under those keys (in `input_`). In such a case, this
-    connector will not touch the data under these keys.
+    their own data and store it in `input_` under those keys. In this case, the
+    default connector does not change the data under these keys and simply acts as a
+    pass-through.
     """
 
-    def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs):
+    def __call__(
+        self,
+        input_: Any,
+        episodes: List[EpisodeType],
+        ctx: ConnectorContextV2,
+        **kwargs,
+    ) -> Any:
         # If episodes are provided, extract the essential data from them, but only if
-        # this data is not present yet in `input_`.
+        # the respective keys are not yet present in `input_`.
         if not episodes:
            return input_
 
-        # Get data dicts for all episodes.
+        # Get the data dicts for all episodes.
         data_dicts = [episode.get_data_dict() for episode in episodes]
 
         state_in = None
         T = ctx.rl_module.config.model_config_dict.get("max_seq_len")
 
-        # Special handling of STATE_OUT/STATE_IN keys:
+        # RLModule is stateful and STATE_IN is not found in `input_` (user's custom
+        # connectors have not provided this information yet) -> Perform separate
+        # handling of STATE_OUT/STATE_IN keys:
         if ctx.rl_module.is_stateful() and STATE_IN not in input_:
             if T is None:
                 raise ValueError(
@@ -57,11 +72,17 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs):
                     "You can set this dict and/or override keys in it via "
                     "`config.training(model={'max_seq_len': x})`."
                 )
-            # Get model init state.
-            init_state = convert_to_numpy(ctx.rl_module.get_initial_state())
+
+            # Before adding anything to `input_`, add the time axis to existing data.
+            # Note that `split_and_pad_single_record` expects the episodes' lengths,
+            # not the episode objects themselves.
+            episode_lens = [len(episode) for episode in episodes]
+            input_ = tree.map_structure(
+                lambda s: split_and_pad_single_record(s, episode_lens, T=T),
+                input_,
+            )
+
             # Get STATE_OUTs for all episodes and only keep those (as STATE_INs) that
             # are located at the `max_seq_len` edges (state inputs to RNNs only have a
             # B-axis, no T-axis).
+            init_state = convert_to_numpy(ctx.rl_module.get_initial_state())
             state_ins = []
             for episode, data_dict in zip(episodes, data_dicts):
                 # Remove state outs (should not be part of the T-axis rearrangements).
@@ -69,15 +90,16 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs):
                 state_ins.append(
                     tree.map_structure(
                         # [::T] = only keep every Tth (max_seq_len) state in.
-                        # [:-1] = shift state outs by one (ignore very last state out, but
-                        # therefore add the init state at the beginning).
+                        # [:-1] = shift state outs by one (ignore very last state out,
+                        # but therefore add the init state at the beginning).
                         lambda i, o: np.concatenate([[i], o[:-1]])[::T],
                         (
                             # Episode has a (reset) beginning -> Prepend initial state.
                             init_state
                             if episode.t_started == 0
-                            # Episode starts somewhere in the middle (is a cut continuation
-                            # chunk) -> Use previous chunk's last STATE_OUT as initial state.
+                            # Episode starts somewhere in the middle (is a cut
+                            # continuation chunk) -> Use previous chunk's last
+                            # STATE_OUT as initial state.
                             else episode.get_extra_model_outputs(
                                 key=STATE_OUT, indices=-len(episode) - 1
                             )
@@ -85,21 +107,14 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs):
                         state_outs,
                    )
                )
-            # Concatenate the individual episodes' state ins.
+ # Concatenate the individual episodes' STATE_INs. state_in = tree.map_structure(lambda *s: np.concatenate(s), *state_ins) - # Before adding anything else to the `input_`, add the time axis to existing - # data. - input_ = tree.map_structure( - lambda s: split_and_pad_single_record(s, episodes, T=T), - input_, - ) - # Set the reduce function for all the data we might still have to extract # from our list of episodes. This function takes a list of data (e.g. obs) # with each item in the list representing one episode and properly # splits along the time axis and zero-pads if necessary (based on - # max_seq_len). + # T=max_seq_len). reduce_fn = partial(split_and_pad, T=T) # No stateful module, normal batch (w/o T-axis or zero-padding). @@ -109,7 +124,8 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs): # episodes along the batch axis (axis=0). reduce_fn = np.concatenate - # Extract all data from the episodes, if not already in `input_`. + # Extract all data from the episodes and add to `input_`, if not already in + # `input_`. for key in [ SampleBatch.OBS, SampleBatch.ACTIONS, @@ -126,18 +142,19 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs): *[d[key] for d in data_dicts], ) - # Infos (always as lists). - # TODO:uncomment if SampleBatch.INFOS not in input_: - # input_[SampleBatch.INFOS] = sum( - # [d[SampleBatch.INFOS] for d in data_dicts], - # [], - # ) + # Handle infos (always lists, not numpy arrays). + if SampleBatch.INFOS not in input_: + input_[SampleBatch.INFOS] = sum( + [d[SampleBatch.INFOS] for d in data_dicts], + [], + ) + # Now that all "normal" fields are time-dim'd and zero-padded, add + # the STATE_IN column to `input_`. if ctx.rl_module.is_stateful(): - # Now that all "normal" fields are time-dim'd and zero-padded, add - # the STATE_IN column to `input_`. input_[STATE_IN] = state_in - # Create the zero-padding loss mask. + # Also, create the loss mask (b/c of our now possibly zero-padded data) as + # well as the seq_lens array and add these to `input_` as well. 
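+            # (Illustration: episode lengths [5, 3] with max_seq_len=4 result in
+            # seq_lens=[4, 1, 3] and a loss mask of shape (3, 4).)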
( input_["loss_mask"], input_[SampleBatch.SEQ_LENS], @@ -147,66 +164,3 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2, **kwargs): ) return input_ - - -def split_and_pad(episodes_data, T): - all_chunks = [] - - for data in episodes_data: - num_chunks = int(np.ceil(data.shape[0] / T)) - - for i in range(num_chunks): - start_index = i * T - end_index = start_index + T - - # Extract the chunk - chunk = data[start_index:end_index] - - # Pad the chunk if it's shorter than T - if chunk.shape[0] < T: - padding_shape = [(0, T - chunk.shape[0])] + [ - (0, 0) for _ in range(chunk.ndim - 1) - ] - chunk = np.pad(chunk, pad_width=padding_shape, mode="constant") - - all_chunks.append(chunk) - - # Combine all chunks into a single array - result = np.concatenate(all_chunks, axis=0) - - # Reshape the array to include the time dimension T - # The new shape should be (-1, T) + original dimensions (excluding the batch dimension) - result = result.reshape((-1, T) + result.shape[1:]) - - return result - - -def split_and_pad_single_record(data, episodes, T): - episodes_data = [] - idx = 0 - for episode in episodes: - len_ = len(episode) - episodes_data.append(data[idx : idx + len_]) - idx += len_ - return split_and_pad(episodes_data, T) - - -def create_mask_and_seq_lens(episode_lens, T): - mask = [] - seq_lens = [] - for episode_len in episode_lens: - len_ = min(episode_len, T) - seq_lens.append(len_) - row = [1] * len_ + [0] * (T - len_) - mask.append(row) - - # Handle sequence lengths greater than T. - overflow = episode_len - T - while overflow > 0: - len_ = min(overflow, T) - seq_lens.append(len_) - extra_row = [1] * len_ + [0] * (T - len_) - mask.append(extra_row) - overflow -= T - - return np.array(mask, dtype=np.bool_), np.array(seq_lens, dtype=np.int32) diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py index 5bf0a2af0c8a4..b3b8f8e181b1a 100644 --- a/rllib/connectors/module_to_env/default_module_to_env.py +++ b/rllib/connectors/module_to_env/default_module_to_env.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, List import numpy as np import tree # pip install dm_tree @@ -8,24 +8,29 @@ from ray.rllib.core.models.base import STATE_OUT from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType from ray.util.annotations import PublicAPI @PublicAPI(stability="alpha") class DefaultModuleToEnv(ConnectorV2): - """A connector that samples actions given action dist. inputs and a dist. class. + """Default connector piece added by RLlib to the end of any module-to-env pipeline. - The connector will only sample from the distribution, if the ACTIONS key - cannot be found in the connector's input. Otherwise, it'll behave simply as pass - through (noop). If ACTIONS is not present, but ACTION_DIST_INPUTS are, will create - a distribution from the RLModule and sample from it (deterministically, if - we are not exploring, stochastically, if we are). + If necessary, this connector samples actions, given action dist. inputs and a + dist. class. + The connector will only sample from the action distribution, if the + SampleBatch.ACTIONS key cannot be found in `input_`. Otherwise, it'll behave + as pass through (noop). 
If SampleBatch.ACTIONS is not present, but + SampleBatch.ACTION_DIST_INPUTS are, the connector will create a new action + distribution using the RLModule in the connector context and sample from this + distribution (deterministically, if we are not exploring, stochastically, if we + are). input_type: INPUT_OUTPUT_TYPES.DICT_OF_MODULE_IDS_TO_DATA Operates per RLModule as it will have to pull the action distribution from each in order to sample actions if necessary. Searches for the ACTIONS and - ACTION_DIST_INPUTS keys in a module's outputs and - should ACTIONS not be found - - sample actions from the module's action distribution. + ACTION_DIST_INPUTS keys in a module's outputs and - should ACTIONS not be + found - sample actions from the module's action distribution. output_type: INPUT_OUTPUT_TYPES.DICT_OF_MODULE_IDS_TO_DATA (same as input: data in, data out, however, data out might contain an additional ACTIONS key if it was not previously present @@ -33,8 +38,12 @@ class DefaultModuleToEnv(ConnectorV2): """ @override(ConnectorV2) - def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2) -> Any: - + def __call__( + self, + input_: Any, + episodes: List[EpisodeType], + ctx: ConnectorContextV2, + ) -> Any: # Loop through all modules that created some output. # for mid in input_.keys(): # sa_module = ctx.rl_module.get_module(module_id=mid) @@ -47,20 +56,20 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2) -> Any: if state: input_[STATE_OUT] = state - # ACTION_DIST_INPUTS field returned by `forward_exploration()` -> - # Create a distribution object. + # ACTION_DIST_INPUTS field returned by `forward_exploration|inference()` -> + # Create a new action distribution object. action_dist = None - # The RLModule has already computed actions. - if ( - SampleBatch.ACTION_DIST_INPUTS in input_ - and SampleBatch.ACTION_LOGP not in input_ - ): - dist_inputs = input_[SampleBatch.ACTION_DIST_INPUTS] + if SampleBatch.ACTION_DIST_INPUTS in input_: if ctx.explore: action_dist_class = ctx.rl_module.get_exploration_action_dist_cls() else: action_dist_class = ctx.rl_module.get_inference_action_dist_cls() - action_dist = action_dist_class.from_logits(dist_inputs) + action_dist = action_dist_class.from_logits( + input_[SampleBatch.ACTION_DIST_INPUTS] + ) + + # TODO (sven): Should this not already be taken care of by RLModule's + # `get_...action_dist_cls()` methods? if not ctx.explore: action_dist = action_dist.to_deterministic() @@ -71,25 +80,16 @@ def __call__(self, input_: Any, episodes, ctx: ConnectorContextV2) -> Any: else: if action_dist is None: raise KeyError( - "Your RLModule's `forward_[explore|inference]()` methods must " - f"return a dict with either the {SampleBatch.ACTIONS} key or " - f"the {SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + "Your RLModule's `forward_[exploration|inference]()` methods must " + f"return a dict with either the '{SampleBatch.ACTIONS}' key or " + f"the '{SampleBatch.ACTION_DIST_INPUTS}' key in it (or both)!" ) actions = action_dist.sample() input_[SampleBatch.ACTIONS] = actions - # Compute action-logp and action-prob from distribution and add to - # output, if possible. + # For convenience and if possible, compute action logp from distribution + # and add to output. 
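+        # (E.g., PPO-style losses later require the log-probs of the sampled
+        # actions for computing their importance ratios.)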
         if action_dist is not None and SampleBatch.ACTION_LOGP not in input_:
             input_[SampleBatch.ACTION_LOGP] = action_dist.logp(actions)
 
         return input_
-
-    # @override(Connector)
-    # def serialize(self):
-    #     return ClipActions.__name__, None
-
-    # @staticmethod
-    # TODO
-    # def from_state(ctx: ConnectorContext, params: Any):
-    #     return ClipActions(ctx)
diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py
new file mode 100644
index 0000000000000..9b4685db8cfb8
--- /dev/null
+++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py
@@ -0,0 +1,27 @@
+from typing import Any, List, Optional
+
+from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2
+from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv
+
+
+class ModuleToEnvPipeline(ConnectorPipelineV2):
+    """The superclass for any module-to-env pipelines."""
+    def __init__(
+        self,
+        *,
+        ctx: ConnectorContextV2,
+        connectors: Optional[List[ConnectorV2]] = None,
+        **kwargs,
+    ):
+        super().__init__(ctx=ctx, connectors=connectors, **kwargs)
+
+        # Add the default final connector piece for module-to-env pipelines:
+        # Sampling actions from ACTION_DIST_INPUTS and adding them to the input, iff
+        # this has not happened in any connector piece of this pipeline before.
+        if (
+            len(self.connectors) == 0
+            or type(self.connectors[-1]) is not DefaultModuleToEnv
+        ):
+            self.append(DefaultModuleToEnv(ctx=ctx))
diff --git a/rllib/connectors/utils/__init__.py b/rllib/connectors/utils/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/rllib/connectors/utils/zero_padding.py b/rllib/connectors/utils/zero_padding.py
new file mode 100644
index 0000000000000..e34c0eab85cc5
--- /dev/null
+++ b/rllib/connectors/utils/zero_padding.py
@@ -0,0 +1,135 @@
+from typing import List, Tuple
+
+import numpy as np
+from numpy.typing import NDArray
+
+
+def create_mask_and_seq_lens(
+    episode_lens: List[int],
+    T: int,
+) -> Tuple[NDArray, NDArray]:
+    """Creates a loss mask and a seq_lens array, given episode lengths and T.
+
+    Args:
+        episode_lens: A list of episode lengths to infer the loss mask and seq_lens
+            array from.
+        T: The maximum number of timesteps in each "row", also known as the maximum
+            sequence length (max_seq_len). Episodes are split into chunks that are at
+            most `T` long and remaining timesteps will be zero-padded (and masked out).
+
+    Returns:
+        Tuple consisting of a) the loss mask to use (masking out areas that lie past
+        the end of an episode (or rollout) but had to be zero-padded due to the extra
+        time rank of length T) and b) the array of sequence lengths resulting from
+        splitting the given episodes into chunks of at most `T` timesteps.
+    """
+    mask = []
+    seq_lens = []
+    for episode_len in episode_lens:
+        len_ = min(episode_len, T)
+        seq_lens.append(len_)
+        row = [1] * len_ + [0] * (T - len_)
+        mask.append(row)
+
+        # Handle sequence lengths greater than T.
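+        # (E.g., an episode of length 10 with T=4 is split into three rows with
+        # seq_lens 4, 4, and 2, the last row being zero-padded by two timesteps.)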
+        overflow = episode_len - T
+        while overflow > 0:
+            len_ = min(overflow, T)
+            seq_lens.append(len_)
+            extra_row = [1] * len_ + [0] * (T - len_)
+            mask.append(extra_row)
+            overflow -= T
+
+    return np.array(mask, dtype=np.bool_), np.array(seq_lens, dtype=np.int32)
+
+
+def split_and_pad(data_chunks: List[NDArray], T: int) -> NDArray:
+    """Splits and zero-pads episode data into a single ndarray with a fixed T-axis.
+
+    Processes each data chunk in `data_chunks` (each coming from one episode) by
+    splitting the chunk into smaller sub-chunks, each of a maximum size `T`. If a
+    sub-chunk is smaller than `T`, it is right-padded with zeros to match the desired
+    size T. All sub-chunks are then re-combined (concatenated) into a single ndarray,
+    which is reshaped to include the new time dimension `T` as axis 1 (axis 0 is the
+    batch axis). The resulting output array has dimensions (B=number of sub-chunks,
+    T, ...), where '...' represents the original dimensions of the input data
+    (excluding the batch dimension).
+
+    Args:
+        data_chunks: A list where each element is a NumPy array representing
+            an episode. Each array's shape should be (episode_length, ...)
+            where '...' represents any number of additional dimensions.
+        T: The desired time dimension size for each chunk.
+
+    Returns:
+        A np.ndarray containing the reshaped and padded chunks. The shape of the
+        array will be (B, T, ...) where B is automatically determined by the number
+        of chunks in `data_chunks` and `T`.
+        '...' represents the original dimensions of the input data, excluding the
+        batch dimension.
+    """
+    all_chunks = []
+
+    for data_chunk in data_chunks:
+        num_sub_chunks = int(np.ceil(data_chunk.shape[0] / T))
+
+        for i in range(num_sub_chunks):
+            start_index = i * T
+            end_index = start_index + T
+
+            # Extract the chunk.
+            sub_chunk = data_chunk[start_index:end_index]
+
+            # Pad the chunk if it's shorter than T.
+            if sub_chunk.shape[0] < T:
+                padding_shape = [(0, T - sub_chunk.shape[0])] + [
+                    (0, 0) for _ in range(sub_chunk.ndim - 1)
+                ]
+                sub_chunk = np.pad(sub_chunk, pad_width=padding_shape, mode="constant")
+
+            all_chunks.append(sub_chunk)
+
+    # Combine all chunks into a single array.
+    result = np.concatenate(all_chunks, axis=0)
+
+    # Reshape the array to include the time dimension T.
+    # The new shape should be (-1, T) + original dimensions (excluding the
+    # batch dimension).
+    result = result.reshape((-1, T) + result.shape[1:])
+
+    return result
+
+
+def split_and_pad_single_record(
+    data: NDArray, episode_lengths: List[int], T: int
+) -> NDArray:
+    """Same as `split_and_pad`, but for data already concatenated over episodes.
+
+    Given an np.ndarray of data that is the result of a concatenation of data chunks
+    coming from different episodes, the lengths of these episodes, as well as the
+    maximum time dimension, split and possibly right-zero-pad this input data, such
+    that the resulting shape of the returned np.ndarray is (B', T, ...), where B' is
+    the number of generated sub-chunks and ... is the original shape of the data
+    (excluding the batch dim). T is the size of the newly inserted time axis (on
+    which zero-padding is applied if necessary).
+
+    Args:
+        data: The single np.ndarray input data to be split, zero-padded, and reshaped.
+        episode_lengths: The list of episode lengths, from which `data` was originally
+            concat'd.
+        T: The maximum number of timesteps on the T-axis in the resulting np.ndarray.
+ + Returns: + A single np.ndarray, which contains the same data as `data`, but split into sub- + chunks of max. size T (zero-padded if necessary at the end of individual + episodes), then reshaped to (B', T, ...). + """ + # Chop up `data` into chunks of max len=T, based on the lengths of the episodes + # where this data came from. + episodes_data = [] + idx = 0 + for episode_len in episode_lengths: + episodes_data.append(data[idx : idx + episode_len]) + idx += episode_len + # Send everything through `split_and_pad` to perform the actual splitting into + # sub-chunks of max len=T and zero-padding. + return split_and_pad(episodes_data, T) From d3dca2f5ec7e018ba57ff4d98c7c246df6904a27 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 17 Nov 2023 12:29:30 +0100 Subject: [PATCH 03/15] wip Signed-off-by: sven1977 --- rllib/BUILD | 16 +- rllib/connectors/connector_pipeline_v2.py | 207 +++++++++++------- .../env_to_module/env_to_module_pipeline.py | 6 +- .../learner/learner_connector_pipeline.py | 32 +++ 4 files changed, 177 insertions(+), 84 deletions(-) create mode 100644 rllib/connectors/learner/learner_connector_pipeline.py diff --git a/rllib/BUILD b/rllib/BUILD index d66ee968470d1..a34b26ce1a7b9 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -696,7 +696,7 @@ py_test( # -------------------------------------------------------------------- -# Connector tests +# Connector(V1) tests # rllib/connector/ # # Tag: connector @@ -723,6 +723,20 @@ py_test( srcs = ["connectors/tests/test_agent.py"] ) +# -------------------------------------------------------------------- +# ConnectorV2 tests +# rllib/connector/ +# +# Tag: connector_v2 +# -------------------------------------------------------------------- + +py_test( + name = "connectors/tests/test_connector_v2", + tags = ["team:rllib", "connector_v2"], + size = "small", + srcs = ["connectors/tests/test_connector_v2.py"] +) + # -------------------------------------------------------------------- # Env tests # rllib/env/ diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index f3d9c36508682..7ae84d0b08c8f 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -40,7 +40,13 @@ def __call__( ctx: ConnectorContextV2, **kwargs, ) -> Any: - """""" + """In a pipeline, we simply call each of our connector pieces after each other. + + Each connector piece receives as input the output of the previous connector + piece in the pipeline. + """ + # Loop through connector pieces and call each one with the output of the + # previous one. Thereby, time each connector piece's call. ret = input_ for connector in self.connectors: timer = self.timers[str(connector)] @@ -66,55 +72,92 @@ def remove(self, name_or_class: Union[str, Type]): else: logger.warning(f"Trying to remove a non-existent connector {name}.") - def insert_before(self, name_or_class: Union[str, Type], connector: ConnectorV2): - """Insert a new connector before connector + def insert_before( + self, + name_or_class: Union[str, Type], + connector: ConnectorV2, + ) -> ConnectorV2: + """Insert a new connector piece before an existing piece (by name or class). Args: - name: name of the connector before which a new connector + name_or_class: Name or class of the connector piece before which `connector` will get inserted. - connector: a new connector to be inserted. + connector: The new connector piece to be inserted. + + Returns: + The ConnectorV2 before which `connector` has been inserted. 
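+
+        Example (illustrative only; `MyPiece` and `SomeExistingPiece` are made-up
+        connector classes, not part of RLlib):
+            pipeline.insert_before("SomeExistingPiece", MyPiece(ctx=ctx))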
""" idx = -1 for idx, c in enumerate(self.connectors): - if c.__class__.__name__ == name: + if ( + ( + isinstance(name_or_class, str) + and c.__class__.__name__ == name_or_class + ) + or (isinstance(name_or_class, type) and c.__class__ is name_or_class) + ): break if idx < 0: - raise ValueError(f"Can not find connector {name}") + raise ValueError( + f"Can not find connector with name or type '{name_or_class}'!" + ) + next_connector = self.connectors[idx] + self.connectors.insert(idx, connector) self._fix_input_output_types() logger.info( - f"Inserted {connector.__class__.__name__} before {name} " + f"Inserted {connector.__class__.__name__} before {name_or_class} " f"to {self.__class__.__name__}." ) + return next_connector - def insert_after(self, name_or_class: Union[str, Type], connector: ConnectorV2): - """Insert a new connector after connector + def insert_after( + self, + name_or_class: Union[str, Type], + connector: ConnectorV2, + ) -> ConnectorV2: + """Insert a new connector piece after an existing piece (by name or class). Args: - name: name of the connector after which a new connector + name_or_class: Name or class of the connector piece after which `connector` will get inserted. - connector: a new connector to be inserted. + connector: The new connector piece to be inserted. + + Returns: + The ConnectorV2 after which `connector` has been inserted. """ idx = -1 for idx, c in enumerate(self.connectors): - if c.__class__.__name__ == name: + if ( + ( + isinstance(name_or_class, str) + and c.__class__.__name__ == name_or_class + ) + or (isinstance(name_or_class, type) and c.__class__ is name_or_class) + ): break if idx < 0: - raise ValueError(f"Can not find connector {name}") + raise ValueError( + f"Can not find connector with name or type '{name_or_class}'!" + ) + prev_connector = self.connectors[idx] + self.connectors.insert(idx + 1, connector) self._fix_input_output_types() logger.info( - f"Inserted {connector.__class__.__name__} after {name} " + f"Inserted {connector.__class__.__name__} after {name_or_class} " f"to {self.__class__.__name__}." ) - def prepend(self, connector: ConnectorV2): - """Append a new connector at the beginning of a connector pipeline. + return prev_connector + + def prepend(self, connector: ConnectorV2) -> None: + """Prepend a new connector at the beginning of a connector pipeline. Args: - connector: a new connector to be appended. + connector: The new connector piece to be prepended to this pipeline. """ self.connectors.insert(0, connector) self._fix_input_output_types() @@ -124,11 +167,11 @@ def prepend(self, connector: ConnectorV2): f"{self.__class__.__name__}." ) - def append(self, connector: ConnectorV2): + def append(self, connector: ConnectorV2) -> None: """Append a new connector at the end of a connector pipeline. Args: - connector: a new connector to be appended. + connector: The new connector piece to be appended to this pipeline. """ self.connectors.append(connector) self._fix_input_output_types() @@ -138,34 +181,30 @@ def append(self, connector: ConnectorV2): f"{self.__class__.__name__}." ) - # @override(ConnectorV2) - # def serialize(self): - # children = [] - # for c in self.connectors: - # state = c.serialize() - # assert isinstance(state, tuple) and len(state) == 2, ( - # "Serialized connector state must be in the format of " - # f"Tuple[name: str, params: Any]. Instead we got {state}" - # f"for connector {c.__name__}." 
-    #         )
-    #         children.append(state)
-    #     return ConnectorPipelineV2.__name__, children
-    #
-    # @override(ConnectorV2)
-    # @staticmethod
-    # def from_state(ctx: ConnectorContextV2, params: List[Any]):
-    #     assert (
-    #         type(params) == list
-    #     ), "AgentConnectorPipeline takes a list of connector params."
-    #     connectors = []
-    #     for state in params:
-    #         try:
-    #             name, subparams = state
-    #             connectors.append(get_connector(name, ctx, subparams))
-    #         except Exception as e:
-    #             logger.error(f"Failed to de-serialize connector state: {state}")
-    #             raise e
-    #     return ConnectorPipelineV2(ctx, connectors)
+    @override(ConnectorV2)
+    def get_state(self):
+        # TODO (sven): `serialize()` is still the old ConnectorV1 API; port this
+        #  method over to the new get_state/set_state protocol.
+        children = []
+        for c in self.connectors:
+            state = c.serialize()
+            assert isinstance(state, tuple) and len(state) == 2, (
+                "Serialized connector state must be in the format of "
+                f"Tuple[name: str, params: Any]. Instead we got {state} "
+                f"for connector {c.__class__.__name__}."
+            )
+            children.append(state)
+        return ConnectorPipelineV2.__name__, children
+
+    @override(ConnectorV2)
+    def set_state(self, state: List[Any]):
+        # TODO (sven): `get_connector` and `ctx` are old ConnectorV1 concepts; port
+        #  this method over to the new get_state/set_state protocol.
+        connectors = []
+        for connector_state in state:
+            try:
+                name, subparams = connector_state
+                connectors.append(get_connector(name, ctx, subparams))
+            except Exception as e:
+                logger.error(
+                    f"Failed to de-serialize connector state: {connector_state}"
+                )
+                raise e
+        self.connectors = connectors
+        self._fix_input_output_types()
 
     def __str__(self, indentation: int = 0):
         return "\n".join(
@@ -173,44 +212,50 @@ def __str__(self, indentation: int = 0):
             + [c.__str__(indentation + 4) for c in self.connectors]
         )
 
-    def __getitem__(self, key: Union[str, int, type]):
-        """Returns a list of connectors that fit 'key'.
+    def __getitem__(
+        self,
+        key: Union[str, int, Type],
+    ) -> Union[ConnectorV2, List[ConnectorV2]]:
+        """Returns a single ConnectorV2 or list of ConnectorV2s that fit `key`.
 
-        If key is a number n, we return a list with the nth element of this pipeline.
-        If key is a Connector class or a string matching the class name of a
-        Connector class, we return a list of all connectors in this pipeline matching
-        the specified class.
+        If key is an int, we return the single ConnectorV2 at that index in this
+        pipeline.
+        If key is a ConnectorV2 type or a string matching the class name of a
+        ConnectorV2 in this pipeline, we return a list of all ConnectorV2s in this
+        pipeline matching the specified class.
 
         Args:
-            key: The key to index by.
+            key: The key to find or to index by.
 
-        Returns: The Connector at index `key`.
+        Returns:
+            A single ConnectorV2 or a list of ConnectorV2s matching `key`.
         """
-        # In case key is a class
-        if not isinstance(key, str):
-            if isinstance(key, slice):
-                raise NotImplementedError(
-                    "Slicing of ConnectorPipeline is currently not supported."
-                )
-            elif isinstance(key, int):
-                return [self.connectors[key]]
-            elif isinstance(key, type):
-                results = []
-                for c in self.connectors:
-                    if issubclass(c.__class__, key):
-                        results.append(c)
-                return results
-            else:
-                raise NotImplementedError(
-                    "Indexing by {} is currently not supported.".format(type(key))
-                )
-
-        results = []
-        for c in self.connectors:
-            if c.__class__.__name__ == key:
-                results.append(c)
-
-        return results
+        # Key is an int -> Index into pipeline and return.
+        if isinstance(key, int):
+            return self.connectors[key]
+        # Key is a class.
+        elif isinstance(key, type):
+            results = []
+            for c in self.connectors:
+                if issubclass(c.__class__, key):
+                    results.append(c)
+            return results
+        # Key is a string -> Find connector(s) by name.
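+        # (E.g., `pipeline["MyFilter"]` returns a list of all pieces in this
+        # pipeline whose class is named "MyFilter" - an illustrative name here.)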
+        elif isinstance(key, str):
+            results = []
+            for c in self.connectors:
+                if c.__class__.__name__ == key:
+                    results.append(c)
+            return results
+        # Slicing not supported (yet).
+        elif isinstance(key, slice):
+            raise NotImplementedError(
+                "Slicing of ConnectorPipelineV2 is currently not supported!"
+            )
+        else:
+            raise NotImplementedError(
+                f"Indexing ConnectorPipelineV2 by {type(key)} is currently not "
+                f"supported!"
+            )
 
     def _fix_input_output_types(self):
         if len(self.connectors) > 0:
diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py
index 63630229e57bc..6d03242a8a38f 100644
--- a/rllib/connectors/env_to_module/env_to_module_pipeline.py
+++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py
@@ -35,9 +35,11 @@ def __call__(
         ctx: ConnectorContextV2,
         **kwargs,
     ) -> Any:
-        # Make sure user does not have to send initial input into this pipeline.
-        # Might just be empty and to be populated from `episodes`.
+
         return super().__call__(
+            # Make sure the user does not have to send an initial `input_` into this
+            # env-to-module pipeline. This would be the expected behavior b/c after
+            # calling the env, we don't have any data dict yet, only a list of
+            # Episode objects.
            input_=input_ or {},
            episodes=episodes,
            ctx=ctx,
diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py
new file mode 100644
index 0000000000000..dce1180516d7a
--- /dev/null
+++ b/rllib/connectors/learner/learner_connector_pipeline.py
@@ -0,0 +1,32 @@
+from typing import Any, List, Optional
+
+from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2
+from ray.rllib.connectors.learner.default_learner_connector import (
+    DefaultLearnerConnector
+)
+
+
+class LearnerConnectorPipeline(ConnectorPipelineV2):
+    """The superclass for any learner connector pipelines."""
+    def __init__(
+        self,
+        *,
+        ctx: ConnectorContextV2,
+        connectors: Optional[List[ConnectorV2]] = None,
+        **kwargs,
+    ):
+        super().__init__(ctx=ctx, connectors=connectors, **kwargs)
+
+        # Add the default final connector piece for learner pipelines:
+        # Makes sure observations from episodes are in the train batch as well as
+        # the correct state inputs in case the RLModule is stateful. In the latter
+        # case, also takes care of the time rank and zero padding.
+        if (
+            len(self.connectors) == 0
+            or type(self.connectors[-1]) is not DefaultLearnerConnector
+        ):
+            # Append default learner connector piece at the end.
+            self.append(DefaultLearnerConnector(ctx=ctx))
+
From b0b3c377def4536db32b6bd2826210e4f79bd449 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 14 Dec 2023 12:28:00 +0100
Subject: [PATCH 04/15] LINT

Signed-off-by: sven1977
---
 rllib/connectors/connector_pipeline_v2.py     | 16 ++++------------
 .../env_to_module/env_to_module_pipeline.py   |  2 --
 .../learner/learner_connector_pipeline.py     |  4 ++--
 .../module_to_env/module_to_env_pipeline.py   |  1 +
 4 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py
index 7ae84d0b08c8f..d53708b190eac 100644
--- a/rllib/connectors/connector_pipeline_v2.py
+++ b/rllib/connectors/connector_pipeline_v2.py
@@ -90,12 +90,8 @@ def insert_before(
         idx = -1
         for idx, c in enumerate(self.connectors):
             if (
-                (
-                    isinstance(name_or_class, str)
-                    and c.__class__.__name__ == name_or_class
-                )
-                or (isinstance(name_or_class, type) and c.__class__ is name_or_class)
-            ):
+                isinstance(name_or_class, str) and c.__class__.__name__ == name_or_class
+            ) or (isinstance(name_or_class, type) and c.__class__ is name_or_class):
                 break
         if idx < 0:
             raise ValueError(
@@ -130,12 +126,8 @@ def insert_after(
         idx = -1
         for idx, c in enumerate(self.connectors):
             if (
-                (
-                    isinstance(name_or_class, str)
-                    and c.__class__.__name__ == name_or_class
-                )
-                or (isinstance(name_or_class, type) and c.__class__ is name_or_class)
-            ):
+                isinstance(name_or_class, str) and c.__class__.__name__ == name_or_class
+            ) or (isinstance(name_or_class, type) and c.__class__ is name_or_class):
                 break
         if idx < 0:
             raise ValueError(
diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py
index 6d03242a8a38f..3b985d3944886 100644
--- a/rllib/connectors/env_to_module/env_to_module_pipeline.py
+++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py
@@ -45,5 +45,3 @@ def __call__(
             ctx=ctx,
             **kwargs,
         )
-
-
diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py
index dce1180516d7a..5725f2a7a252e 100644
--- a/rllib/connectors/learner/learner_connector_pipeline.py
+++ b/rllib/connectors/learner/learner_connector_pipeline.py
@@ -4,12 +4,13 @@
 from ray.rllib.connectors.connector_v2 import ConnectorV2
 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2
 from ray.rllib.connectors.learner.default_learner_connector import (
-    DefaultLearnerConnector
+    DefaultLearnerConnector,
 )
 
 
 class LearnerConnectorPipeline(ConnectorPipelineV2):
     """The superclass for any learner connector pipelines."""
+
     def __init__(
         self,
         *,
@@ -29,4 +30,3 @@ def __init__(
         ):
             # Append default learner connector piece at the end.
self.append(DefaultLearnerConnector(ctx=ctx)) - diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py index 9b4685db8cfb8..b1b3be1d35b48 100644 --- a/rllib/connectors/module_to_env/module_to_env_pipeline.py +++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py @@ -8,6 +8,7 @@ class ModuleToEnvPipeline(ConnectorPipelineV2): """The superclass for any module-to-env pipelines.""" + def __init__( self, *, From 4df7dfef805e86f11a2da1b539ce7420f44f76fe Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 14 Dec 2023 13:14:25 +0100 Subject: [PATCH 05/15] wip Signed-off-by: sven1977 --- rllib/connectors/connector_context_v2.py | 70 --------- rllib/connectors/connector_pipeline_v2.py | 10 +- rllib/connectors/connector_v2.py | 119 ++++++++++++---- .../env_to_module/default_env_to_module.py | 21 +-- .../env_to_module/prev_action_prev_reward.py | 133 ++++++++++++++++++ .../learner/default_learner_connector.py | 105 +++++++++++--- .../module_to_env/default_module_to_env.py | 30 ++-- 7 files changed, 348 insertions(+), 140 deletions(-) delete mode 100644 rllib/connectors/connector_context_v2.py create mode 100644 rllib/connectors/env_to_module/prev_action_prev_reward.py diff --git a/rllib/connectors/connector_context_v2.py b/rllib/connectors/connector_context_v2.py deleted file mode 100644 index fff114618dd15..0000000000000 --- a/rllib/connectors/connector_context_v2.py +++ /dev/null @@ -1,70 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Optional - -from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.utils.typing import AgentID, EnvType -from ray.util.annotations import PublicAPI - - -@PublicAPI(stability="alpha") -@dataclass -class ConnectorContextV2: - """Information needed by pieces of connector pipeline to communicate with each other. - - ConnectorContextV2 will be passed to each connector (pipeline) call. - Also might contain references to the RLModule used, the Env, as well as whether - `explore` is True or False (whether forward_exploration or forward_inference was - used). - - TODO: Describe use cases, e.g. - - state out need to be fed back as state ins. - Unless we would like to temporarily store the states in the episode. - - agent_to_policy_mappings need to be stored as they might be stochastic. Then the - to_env pipeline can properly map back from module (formerly known as policy) IDs - to agent IDs. - - Attributes: - env: The Env object used to reset/step through in the current Env->Module - setup. This will be None in contexts used in a Learner connector pipeline. - rl_module: The RLModule used for either action computing forward passes - (`forward_exploration|inference()`) in the current Env->Module setup - or `forward_train()` calls in a Learner connector pipeline. - explore: Whether `explore` is currently on. Per convention, if True, the - RLModule's `forward_exploration()` method should be called, if False, the - EnvRunner should call `forward_inference()` instead. Should be None inside - Learner connector pipelines. - agent_id: The (optional) current agent ID that the connector should be - creating/extracting data for. - episode_index: The (optional) index within the list of SingleAgentEpisodes or - MultiAgentEpisodes, which each connector is given in a call, that belongs - to the given agent_id. - data: Optional additional context data that needs to be exchanged between - different Connector pieces and -pipelines. 
-
-    TODO (sven): Maybe we should have to AlgorithmConfig here as well.
-    """
-
-    env: Optional[EnvType] = None
-    rl_module: Optional[RLModule] = None
-    explore: Optional[bool] = None
-    data: Optional[Any] = None
-
-    # TODO (sven): Do these have to be here??
-    agent_id: Optional[AgentID] = None
-    episode_index: Optional[int] = None
-
-    def add_data(self, key, value):
-        assert key not in self.data
-        self.data[key] = value
-
-    def get_data(self, key):
-        assert key in self.data
-        return self.data[key]
-
-    def override_data(self, key, value):
-        assert key in self.data
-        self.data[key] = value
-
-    def del_data(self, key):
-        assert key in self.data
-        del self.data[key]
diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py
index d53708b190eac..6e45a5792c695 100644
--- a/rllib/connectors/connector_pipeline_v2.py
+++ b/rllib/connectors/connector_pipeline_v2.py
@@ -1,12 +1,16 @@
 from collections import defaultdict
 import logging
 from typing import Any, List, Optional, Type, Union
+
+import gymnasium as gym
 
 from ray.rllib.connectors.connector_v2 import ConnectorV2
-from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
 from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule
 from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv
+from ray.rllib.connectors.learner.default_learner_connector import (
+    DefaultLearnerConnector,
+)
+from ray.rllib.core.rl_module.rl_module import RLModule
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.typing import EpisodeType
 from ray.util.annotations import PublicAPI
 from ray.util.timer import _Timer
diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py
index 0c80ab64d2228..b201c804ca2d0 100644
--- a/rllib/connectors/connector_v2.py
+++ b/rllib/connectors/connector_v2.py
@@ -1,8 +1,10 @@
 import abc
-from typing import Any, Dict, List, Tuple
+from typing import Any, List, Optional
+
+import gymnasium as gym
 
-from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
 from ray.rllib.connectors.input_output_types import INPUT_OUTPUT_TYPES
+from ray.rllib.core.rl_module.rl_module import RLModule
 from ray.rllib.utils.typing import EpisodeType
 from ray.util.annotations import PublicAPI
 
@@ -11,37 +13,44 @@ class ConnectorV2(abc.ABC):
     """Base class defining the API for an individual "connector piece".
 
-    A ConnectorV2 ("connector piece") is usually part of a series of pieces within
-    a "connector pipeline", which in itself also abides to this very API.
+    A ConnectorV2 ("connector piece") is usually part of a whole series of connector
+    pieces within a so-called connector pipeline, which in itself also abides to this
+    very API.
     For example, you might have a connector pipeline consisting of two connector
     pieces, A and B, both instances of subclasses of ConnectorV2 and each one
     performing a particular transformation on their input data. The resulting
     connector pipeline (A->B) itself also abides to this very ConnectorV2 API and
     could thus be part of yet another, higher-level connector pipeline.
 
-    Any ConnectorV2 instances (individual pieces or several connector pieces in a
-    pipeline) must be callable by overriding their `__call__()` method.
When called,
-    they take the outputs of a previous connector piece (or an empty dict if there are
-    no previous pieces) as well as all the data collected thus far in the ongoing
-    episode(s) (only applies to connectors used in EnvRunners) or retrieved from a
-    replay buffer or from an environment sampling step (only applies to connectors used
-    in Learner pipelines). From this data (previous piece's output and possibly
-    episodes), a ConnectorV2 then performs a transformation step.
+    Any ConnectorV2 instance (an individual piece or several connector pieces in a
+    pipeline) is callable, and subclasses should override its `__call__()` method.
+    When called, they take the outputs of a previous connector piece (or an empty dict
+    if there are no previous pieces) as well as all the data collected thus far in the
+    ongoing episode(s) (only applies to connectors used in EnvRunners) or retrieved
+    from a replay buffer or from an environment sampling step (only applies to
+    connectors used in Learner pipelines). From this input data, a ConnectorV2 then
+    performs a transformation step.
 
     There are 3 types of pipelines a ConnectorV2 can belong to:
     1) env-to-module: The connector transforms environment data before it gets to the
-        RLModule.
+        RLModule. This type of pipeline is used by an EnvRunner for transforming
+        env output data to RLModule-readable data (for the next RLModule forward pass).
     2) module-to-env: The connector transforms RLModule outputs before they are sent
-        back to the environment (as actions).
+        back to the environment (as actions). This type of pipeline is used by an
+        EnvRunner to transform RLModule output data to env-readable actions (for the
+        next `env.step()` call).
     3) learner pipeline: The connector transforms data coming directly from an
     environment sampling step or a replay buffer and will be sent into the RLModule's
-    `forward_train()` method afterwards to compute the loss inputs.
+    `forward_train()` method afterwards to compute the loss inputs. This type of
+    pipeline is used by a Learner to transform raw training data (a batch or a list of
+    episodes) to RLModule-readable training data (for the next RLModule
+    `forward_train()` call).
 
     Some connectors might be stateful, for example for keeping track of observation
-    filtering stats (mean and stddev values). States of all connectors and connector
-    pipelines are frequently being synchronized between the EnvRunners (owning the
-    env-to-module and module-to-env pipelines) and the Learners (owning the Learner
-    pipelines).
+    filtering stats (mean and stddev values). Any Algorithm that uses connectors is
+    responsible for frequently synchronizing the states of all connectors and connector
+    pipelines between the EnvRunners (owning the env-to-module and module-to-env
+    pipelines) and the Learners (owning the Learner pipelines).
     """
 
     # Set these in ALL subclasses.
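+
+    # A minimal, illustrative sketch of what a custom env-to-module piece could look
+    # like under this API (the class and helper names below are made up, not part of
+    # RLlib):
+    #
+    #     class AddMostRecentObs(ConnectorV2):
+    #         def __call__(self, *, rl_module, input_, episodes, **kwargs):
+    #             # Write the latest observation of each ongoing episode into the
+    #             # batch under the OBS column.
+    #             input_[SampleBatch.OBS] = batch(
+    #                 [e.get_observation(indices=-1) for e in episodes]
+    #             )
+    #             return input_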
@@ -51,22 +60,66 @@ class ConnectorV2(abc.ABC):
     input_type = INPUT_OUTPUT_TYPES.DATA
     output_type = INPUT_OUTPUT_TYPES.DATA
 
-    def __init__(self, *, ctx: ConnectorContextV2, **kwargs):
+    @property
+    def observation_space(self):
+        return self.input_observation_space
+
+    @observation_space.setter
+    def observation_space(self, value):
+        self.input_observation_space = value
+
+    @property
+    def action_space(self):
+        return self.input_action_space
+
+    @action_space.setter
+    def action_space(self, value):
+        self.input_action_space = value
+
+    def __init__(
+        self,
+        *,
+        input_observation_space: Optional[gym.Space],
+        input_action_space: Optional[gym.Space],
+        env: Optional[gym.Env] = None,
+        # rl_module: Optional["RLModule"] = None,
+        **kwargs,
+    ):
         """Initializes a ConnectorV2 instance.
 
         Args:
-            ctx: The initial ConnectorContextV2.
+            input_observation_space: The (mandatory) input observation space. This
+                is the space coming from a previous connector piece in the
+                (env-to-module or learner) pipeline or it is directly defined within
+                the used gym.Env.
+            input_action_space: The (mandatory) input action space. This
+                is the space coming from a previous connector piece in the
+                (module-to-env) pipeline or it is directly defined within the used
+                gym.Env.
+            env: An optional env object that the connector might need to know about.
+                Note that normally, env-to-module and module-to-env connectors get this
+                information at construction time, but learner connectors won't (b/c
+                Learner objects don't carry an environment object).
+            # rl_module: An optional RLModule object that the connector might need to
+            #     know about. Note that normally, only module-to-env connectors get
+            #     this information at construction time, but env-to-module and learner
+            #     connectors won't (b/c they get constructed before the RLModule).
             **kwargs: Forward API-compatibility kwargs.
         """
-        self.ctx = ctx
+        self.input_observation_space = input_observation_space
+        self.input_action_space = input_action_space
+        self.env = env
+        # self.rl_module = rl_module
 
     @abc.abstractmethod
     def __call__(
         self,
         *,
+        rl_module: RLModule,
         input_: Any,
         episodes: List[EpisodeType],
-        ctx: ConnectorContextV2,
+        explore: Optional[bool] = None,
+        persistent_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         """Method for transforming input data into output data.
 
@@ -74,14 +127,20 @@ def __call__(
         Args:
             input_: The input data abiding to `self.input_type` to be transformed by
                 this connector. Transformations might either be done in-place or a new
-                structure may be returned. The returned data must match
-                `self.output_type`.
+                structure may be returned that matches `self.output_type`.
             episodes: The list of SingleAgentEpisode or MultiAgentEpisode objects,
-                each corresponding to one slot in a gym.vector.Env.
-            ctx: The ConnectorContextV2, containing the current Env, RLModule, and other
-                context-relevant information. It can also be used to pass along
-                information between connector pieces (even across different pipelines).
-            **kwargs: Forward API-compatibility kwargs.
+                each corresponding to one slot in the vector env. Note that episodes
+                should always be considered read-only and not be altered.
+            rl_module: The RLModule that this connector operates on, e.g. used to
+                retrieve initial states or the action distribution class for the
+                upcoming forward pass.
+            explore: Whether `explore` is currently on. 
Per convention, if True, the + RLModule's `forward_exploration` method should be called, if False, the + EnvRunner should call `forward_inference` instead. + persistent_data: Optional additional context data that needs to be exchanged + between different Connector pieces and -pipelines. + kwargs: Forward API-compatibility kwargs. Returns: The transformed connector output abiding to `self.output_type`. diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py index 9d7616011b8c7..8239b5f2c2ebd 100644 --- a/rllib/connectors/env_to_module/default_env_to_module.py +++ b/rllib/connectors/env_to_module/default_env_to_module.py @@ -1,11 +1,11 @@ -from typing import Any, List +from typing import Any, List, Optional import numpy as np import tree from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 from ray.rllib.core.models.base import STATE_IN, STATE_OUT +from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.spaces.space_utils import batch @@ -32,9 +32,12 @@ class DefaultEnvToModule(ConnectorV2): @override(ConnectorV2) def __call__( self, - input_: Any, + *, + rl_module: RLModule, + input_: Optional[Any] = None, episodes: List[EpisodeType], - ctx: ConnectorContextV2, + explore: Optional[bool] = None, + persistent_data: Optional[dict] = None, **kwargs, ) -> Any: # If observations cannot be found in `input`, add the most recent ones (from all @@ -50,10 +53,7 @@ def __call__( # If our module is stateful: # - Add the most recent STATE_OUTs to `input_`. # - Make all data in `input_` have a time rank (T=1). - if ctx.rl_module.is_stateful(): - # Make all other inputs have an additional T=1 axis. - input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) - + if rl_module.is_stateful(): # Collect all most recently computed STATE_OUT (or use initial states from # RLModule if at beginning of episode). states = [] @@ -64,12 +64,15 @@ def __call__( # TODO (sven): Generalize to MultiAgentEpisodes. # Episode just started -> Get initial state from our RLModule. if len(episode) == 0: - state = ctx.rl_module.get_initial_state() + state = rl_module.get_initial_state() # Episode is already ongoing -> Use most recent STATE_OUT. else: state = episode.extra_model_outputs[STATE_OUT][-1] states.append(state) + # Make all other inputs have an additional T=1 axis. + input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) + # Batch states (from list of individual vector sub-env states). # Note that state ins should NOT have the extra time dimension. 
             input_[STATE_IN] = batch(states)
diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py
new file mode 100644
index 0000000000000..cf11edba10298
--- /dev/null
+++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py
@@ -0,0 +1,133 @@
+from functools import partial
+from typing import Any, List, Optional
+
+import gymnasium as gym
+import numpy as np
+
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.spaces.space_utils import batch
+from ray.rllib.utils.typing import EpisodeType
+
+
+class _PrevRewardPrevActionConnector(ConnectorV2):
+    """A connector piece that adds previous rewards and actions to the input."""
+
+    def __init__(
+        self,
+        *,
+        # Base class constructor args.
+        input_observation_space: Optional[gym.Space],
+        input_action_space: Optional[gym.Space],
+        env: Optional[gym.Env] = None,
+        # Specific prev. r/a args.
+        n_prev_actions: int = 1,
+        n_prev_rewards: int = 1,
+        as_learner_connector: bool = False,
+        **kwargs,
+    ):
+        """Initializes a _PrevRewardPrevActionConnector instance.
+
+        Args:
+            n_prev_actions: The number of previous actions to include in the output
+                data. Discrete actions are one-hot'd. If > 1, will concatenate the
+                individual action tensors.
+            n_prev_rewards: The number of previous rewards to include in the output
+                data.
+            as_learner_connector: Whether this connector is part of a Learner connector
+                pipeline, as opposed to an env-to-module pipeline.
+        """
+        super().__init__(
+            input_observation_space=input_observation_space,
+            input_action_space=input_action_space,
+            env=env,
+            **kwargs,
+        )
+
+        self.n_prev_actions = n_prev_actions
+        self.n_prev_rewards = n_prev_rewards
+        self.as_learner_connector = as_learner_connector
+
+    @override(ConnectorV2)
+    def __call__(
+        self,
+        *,
+        rl_module: RLModule,
+        input_: Optional[Any],
+        episodes: List[EpisodeType],
+        explore: Optional[bool] = None,
+        persistent_data: Optional[dict] = None,
+        **kwargs,
+    ) -> Any:
+        # This is a data-in-data-out connector, so we expect `input_` to be a dict
+        # with: key=column name, e.g. "obs", and value=[data to be processed by the
+        # RLModule]. We will just extract the most recent rewards and/or most recent
+        # actions from all episodes and store them inside the `input_` data dict.
+
+        prev_a = []
+        prev_r = []
+        for episode in episodes:
+            # TODO (sven): Get rid of this distinction. With the new Episode APIs,
+            #  this should work the same, whether on finalized or non-finalized
+            #  episodes.
+            # Learner connector pipeline. Episodes have been finalized/numpy'ized.
+            if self.as_learner_connector:
+                assert episode.is_finalized
+                # Loop through each timestep in the episode and add the previous n
+                # actions and previous m rewards (based on that timestep) to the
+                # batch.
+                for ts in range(len(episode)):
+                    prev_a.append(
+                        episode.get_actions(
+                            # Extract n actions from `ts - n` to `ts` (excluding
+                            # `ts`).
+                            indices=slice(ts - self.n_prev_actions, ts),
+                            # Make sure negative indices are NOT interpreted as
+                            # "counting from the end", but as absolute indices meaning
+                            # they refer to timesteps before 0 (which is the lookback
+                            # buffer).
+                            neg_indices_left_of_zero=True,
+                            # In case we are at the very beginning of the episode, e.g.
+                            # ts==0, fill the left side with zero-actions.
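+                            # (E.g., with n_prev_actions=2 and ts=0, this yields
+                            # two all-zero one-hot action vectors.)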
+ fill=0.0, + # Return one-hot arrays for those action components that are + # discrete or multi-discrete. + one_hot_discrete=True, + ) + ) + # Do the same for rewards. + prev_r.append( + episode.get_rewards( + indices=slice(ts - self.n_prev_rewards, ts), + neg_indices_left_of_zero=True, + fill=0.0, + ) + ) + # Env-to-module pipeline. Episodes still operate on lists. + else: + assert not episode.is_finalized + prev_a.append( + batch( + episode.get_actions( + indices=slice(-self.n_prev_actions, None), + fill=0.0, + one_hot_discrete=True, + ) + ) + ) + prev_r.append( + np.array( + episode.get_rewards( + indices=slice(-self.n_prev_rewards, None), + fill=0.0, + ) + ) + ) + + input_[SampleBatch.PREV_ACTIONS] = batch(prev_a) + input_[SampleBatch.PREV_REWARDS] = np.array(prev_r) + return input_ + + +PrevRewardPrevActionEnvToModule = partial( + _PrevRewardPrevActionConnector, as_learner_connector=False +) diff --git a/rllib/connectors/learner/default_learner_connector.py b/rllib/connectors/learner/default_learner_connector.py index 9a636a0fc0d9c..4216f4790b5f3 100644 --- a/rllib/connectors/learner/default_learner_connector.py +++ b/rllib/connectors/learner/default_learner_connector.py @@ -1,18 +1,14 @@ from functools import partial -from typing import Any, List +from typing import Any, List, Optional import numpy as np import tree from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 -from ray.rllib.connectors.utils.zero_padding import ( - create_mask_and_seq_lens, - split_and_pad, - split_and_pad_single_record, -) from ray.rllib.core.models.base import STATE_IN, STATE_OUT +from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import EpisodeType @@ -42,11 +38,15 @@ class DefaultLearnerConnector(ConnectorV2): pass-through. """ + @override(ConnectorV2) def __call__( self, + *, + rl_module: RLModule, input_: Any, episodes: List[EpisodeType], - ctx: ConnectorContextV2, + explore: Optional[bool] = None, + persistent_data: Optional[dict] = None, **kwargs, ) -> Any: # If episodes are provided, extract the essential data from them, but only if @@ -58,12 +58,12 @@ def __call__( data_dicts = [episode.get_data_dict() for episode in episodes] state_in = None - T = ctx.rl_module.config.model_config_dict.get("max_seq_len") + T = rl_module.config.model_config_dict.get("max_seq_len") # RLModule is stateful and STATE_IN is not found in `input_` (user's custom # connectors have not provided this information yet) -> Perform separate # handling of STATE_OUT/STATE_IN keys: - if ctx.rl_module.is_stateful() and STATE_IN not in input_: + if rl_module.is_stateful() and STATE_IN not in input_: if T is None: raise ValueError( "You are using a stateful RLModule and are not providing custom " @@ -72,17 +72,11 @@ def __call__( "You can set this dict and/or override keys in it via " "`config.training(model={'max_seq_len': x})`." ) - - # Before adding anything to `input_`, add the time axis to existing data. - input_ = tree.map_structure( - lambda s: split_and_pad_single_record(s, episodes, T=T), - input_, - ) - + # Get model init state. + init_state = convert_to_numpy(rl_module.get_initial_state()) # Get STATE_OUTs for all episodes and only keep those (as STATE_INs) that # are located at the `max_seq_len` edges (state inputs to RNNs only have a # B-axis, no T-axis). 
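+            # (E.g., with max_seq_len=4 and an episode of length 10, keep the states
+            # entering timesteps 0, 4, and 8 - one STATE_IN per generated row.)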
- init_state = convert_to_numpy(ctx.rl_module.get_initial_state()) state_ins = [] for episode, data_dict in zip(episodes, data_dicts): # Remove state outs (should not be part of the T-axis rearrangements). @@ -101,7 +95,7 @@ def __call__( # continuation chunk) -> Use previous chunk's last STATE_OUT # as initial state. else episode.get_extra_model_outputs( - key=STATE_OUT, indices=-len(episode) - 1 + key=STATE_OUT, indices=-1, neg_indices_left_of_zero=True ) ), state_outs, @@ -110,6 +104,13 @@ def __call__( # Concatenate the individual episodes' STATE_INs. state_in = tree.map_structure(lambda *s: np.concatenate(s), *state_ins) + # Before adding anything else to the `input_`, add the time axis to existing + # data. + input_ = tree.map_structure( + lambda s: split_and_pad_single_record(s, episodes, T=T), + input_, + ) + # Set the reduce function for all the data we might still have to extract # from our list of episodes. This function takes a list of data (e.g. obs) # with each item in the list representing one episode and properly @@ -151,7 +152,7 @@ def __call__( # Now that all "normal" fields are time-dim'd and zero-padded, add # the STATE_IN column to `input_`. - if ctx.rl_module.is_stateful(): + if rl_module.is_stateful(): input_[STATE_IN] = state_in # Also, create the loss mask (b/c of our now possibly zero-padded data) as # well as the seq_lens array and add these to `input_` as well. @@ -164,3 +165,67 @@ def __call__( ) return input_ + + +def split_and_pad(episodes_data, T): + all_chunks = [] + + for data in episodes_data: + num_chunks = int(np.ceil(data.shape[0] / T)) + + for i in range(num_chunks): + start_index = i * T + end_index = start_index + T + + # Extract the chunk + chunk = data[start_index:end_index] + + # Pad the chunk if it's shorter than T + if chunk.shape[0] < T: + padding_shape = [(0, T - chunk.shape[0])] + [ + (0, 0) for _ in range(chunk.ndim - 1) + ] + chunk = np.pad(chunk, pad_width=padding_shape, mode="constant") + + all_chunks.append(chunk) + + # Combine all chunks into a single array + result = np.concatenate(all_chunks, axis=0) + + # Reshape the array to include the time dimension T. + # The new shape should be (-1, T) + original dimensions (excluding the batch + # dimension) + result = result.reshape((-1, T) + result.shape[1:]) + + return result + + +def split_and_pad_single_record(data, episodes, T): + episodes_data = [] + idx = 0 + for episode in episodes: + len_ = len(episode) + episodes_data.append(data[idx : idx + len_]) + idx += len_ + return split_and_pad(episodes_data, T) + + +def create_mask_and_seq_lens(episode_lens, T): + mask = [] + seq_lens = [] + for episode_len in episode_lens: + len_ = min(episode_len, T) + seq_lens.append(len_) + row = [1] * len_ + [0] * (T - len_) + mask.append(row) + + # Handle sequence lengths greater than T. 
+ overflow = episode_len - T + while overflow > 0: + len_ = min(overflow, T) + seq_lens.append(len_) + extra_row = [1] * len_ + [0] * (T - len_) + mask.append(extra_row) + overflow -= T + + return np.array(mask, dtype=np.bool_), np.array(seq_lens, dtype=np.int32) diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py index b3b8f8e181b1a..395225f5d6a64 100644 --- a/rllib/connectors/module_to_env/default_module_to_env.py +++ b/rllib/connectors/module_to_env/default_module_to_env.py @@ -1,11 +1,11 @@ -from typing import Any, List +from typing import Any, List, Optional import numpy as np import tree # pip install dm_tree from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 from ray.rllib.core.models.base import STATE_OUT +from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import EpisodeType @@ -40,17 +40,22 @@ class DefaultModuleToEnv(ConnectorV2): @override(ConnectorV2) def __call__( self, + *, + rl_module: RLModule, input_: Any, episodes: List[EpisodeType], - ctx: ConnectorContextV2, + explore: Optional[bool] = None, + persistent_data: Optional[dict] = None, + **kwargs, ) -> Any: + # Loop through all modules that created some output. # for mid in input_.keys(): # sa_module = ctx.rl_module.get_module(module_id=mid) # If our RLModule is stateful, remove the T=1 axis from all model outputs # (except the state outs, which never have this extra time axis). - if ctx.rl_module.is_stateful(): + if rl_module.is_stateful(): state = input_.pop(STATE_OUT, None) input_ = tree.map_structure(lambda s: np.squeeze(s, axis=1), input_) if state: @@ -60,17 +65,17 @@ def __call__( # Create a new action distribution object. action_dist = None if SampleBatch.ACTION_DIST_INPUTS in input_: - if ctx.explore: - action_dist_class = ctx.rl_module.get_exploration_action_dist_cls() + if explore: + action_dist_class = rl_module.get_exploration_action_dist_cls() else: - action_dist_class = ctx.rl_module.get_inference_action_dist_cls() + action_dist_class = rl_module.get_inference_action_dist_cls() action_dist = action_dist_class.from_logits( input_[SampleBatch.ACTION_DIST_INPUTS] ) # TODO (sven): Should this not already be taken care of by RLModule's # `get_...action_dist_cls()` methods? - if not ctx.explore: + if not explore: action_dist = action_dist.to_deterministic() # If `forward_...()` returned actions, use them here as-is. 
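For readers skimming the diff: the default module-to-env piece above turns the RLModule's ACTION_DIST_INPUTS into env-ready actions by building a distribution, switching it to deterministic mode when explore=False, sampling, and computing log-probs. Below is a minimal, framework-free sketch of that control flow; SimpleCategorical is a made-up stand-in for RLlib's action-distribution classes, not an actual RLlib API.

import numpy as np


class SimpleCategorical:
    """Stand-in distribution exposing the same small API used above."""

    def __init__(self, logits, deterministic=False):
        # Row-wise softmax over the logits.
        e = np.exp(logits - logits.max(axis=-1, keepdims=True))
        self.probs = e / e.sum(axis=-1, keepdims=True)
        self.deterministic = deterministic

    @classmethod
    def from_logits(cls, logits):
        return cls(logits)

    def to_deterministic(self):
        return SimpleCategorical(np.log(self.probs), deterministic=True)

    def sample(self):
        if self.deterministic:
            # Greedy action (argmax) when not exploring.
            return self.probs.argmax(axis=-1)
        rng = np.random.default_rng()
        return np.array([rng.choice(len(p), p=p) for p in self.probs])

    def logp(self, actions):
        return np.log(self.probs[np.arange(len(actions)), actions])


logits = np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 3.0]])
explore = False

dist = SimpleCategorical.from_logits(logits)
if not explore:
    dist = dist.to_deterministic()
actions = dist.sample()  # -> array([0, 2]) in the deterministic case
action_logp = dist.logp(actions)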
@@ -93,3 +98,12 @@ def __call__( input_[SampleBatch.ACTION_LOGP] = action_dist.logp(actions) return input_ + + # @override(Connector) + # def serialize(self): + # return ClipActions.__name__, None + + # @staticmethod + # TODO + # def from_state(ctx: ConnectorContext, params: Any): + # return ClipActions(ctx) From 1de7ebbd155fd9be415c443348f1a64288f1e87a Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 14 Dec 2023 14:14:47 +0100 Subject: [PATCH 06/15] wip Signed-off-by: sven1977 --- rllib/connectors/connector_pipeline_v2.py | 29 +++++++++---- .../env_to_module/env_to_module_pipeline.py | 42 +++++++++++++------ .../learner/learner_connector_pipeline.py | 34 +++++++++------ .../module_to_env/module_to_env_pipeline.py | 28 +++++++++---- 4 files changed, 94 insertions(+), 39 deletions(-) diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index 6e45a5792c695..7f9336fe710d2 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -11,6 +11,7 @@ DefaultLearnerConnector ) from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import EpisodeType from ray.util.annotations import PublicAPI from ray.util.timer import _Timer @@ -25,11 +26,10 @@ class ConnectorPipelineV2(ConnectorV2): def __init__( self, *, - ctx: ConnectorContextV2, connectors: Optional[List[ConnectorV2]] = None, **kwargs, ): - super().__init__(ctx=ctx, **kwargs) + super().__init__(**kwargs) self.connectors = connectors or [] self._fix_input_output_types() @@ -39,9 +39,11 @@ def __init__( @override(ConnectorV2) def __call__( self, + rl_module: RLModule, input_: Any, episodes: List[EpisodeType], - ctx: ConnectorContextV2, + explore: Optional[bool] = None, + persistent_data: Optional[dict] = None, **kwargs, ) -> Any: """In a pipeline, we simply call each of our connector pieces after each other. @@ -55,7 +57,14 @@ def __call__( for connector in self.connectors: timer = self.timers[str(connector)] with timer: - ret = connector(input_=ret, episodes=episodes, ctx=ctx) + ret = connector( + rl_module=rl_module, + input_=ret, + episodes=episodes, + explore=explore, + persistent_data=persistent_data, + **kwargs, + ) return ret def remove(self, name_or_class: Union[str, Type]): @@ -66,19 +75,21 @@ def remove(self, name_or_class: Union[str, Type]): """ idx = -1 for i, c in enumerate(self.connectors): - if c.__class__.__name__ == name: + if c.__class__.__name__ == name_or_class: idx = i break if idx >= 0: del self.connectors[idx] self._fix_input_output_types() - logger.info(f"Removed connector {name} from {self.__class__.__name__}.") + logger.info(f"Removed connector {name_or_class} from {self.__class__.__name__}.") else: - logger.warning(f"Trying to remove a non-existent connector {name}.") + logger.warning( + f"Trying to remove a non-existent connector {name_or_class}." + ) def insert_before( self, - name_or_class: Union[str, Type], + name_or_class: Union[str, type], connector: ConnectorV2, ) -> ConnectorV2: """Insert a new connector piece before an existing piece (by name or class). 
@@ -257,6 +268,8 @@ def _fix_input_output_types(self):
         if len(self.connectors) > 0:
             self.input_type = self.connectors[0].input_type
             self.output_type = self.connectors[-1].output_type
+            #self.observation_space = self.connectors[-1].observation_space
+            #self.action_space = self.connectors[-1].action_space
         else:
             self.input_type = None
             self.output_type = None
diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py
index 3b985d3944886..e5b81c254589d 100644
--- a/rllib/connectors/env_to_module/env_to_module_pipeline.py
+++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py
@@ -1,8 +1,10 @@
 from typing import Any, List, Optional
 
-from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2
+import gymnasium as gym
+
 from ray.rllib.connectors.connector_v2 import ConnectorV2
 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2
+from ray.rllib.core.rl_module.rl_module import RLModule
 from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.typing import EpisodeType
@@ -12,11 +14,21 @@ class EnvToModulePipeline(ConnectorPipelineV2):
     def __init__(
         self,
         *,
-        ctx: ConnectorContextV2,
         connectors: Optional[List[ConnectorV2]] = None,
+        input_observation_space: Optional[gym.Space],
+        input_action_space: Optional[gym.Space],
+        env: Optional[gym.Env] = None,
+        rl_module: Optional["RLModule"] = None,
         **kwargs,
     ):
-        super().__init__(ctx=ctx, connectors=connectors, **kwargs)
+        super().__init__(
+            connectors=connectors,
+            input_observation_space=input_observation_space,
+            input_action_space=input_action_space,
+            env=env,
+            rl_module=rl_module,
+            **kwargs,
+        )
         # Add the default final connector piece for env-to-module pipelines:
         # Extracting last obs from episodes and add them to input, iff this has not
         # happened in any connector piece in this pipeline before.
@@ -24,24 +36,30 @@ def __init__(
             len(self.connectors) == 0
             or type(self.connectors[-1]) is not DefaultEnvToModule
         ):
-            self.append(DefaultEnvToModule(ctx=ctx))
+            self.append(DefaultEnvToModule(
+                input_observation_space=self.observation_space,
+                input_action_space=self.action_space,
+                env=env,
+            ))
 
     @override(ConnectorPipelineV2)
     def __call__(
         self,
         *,
+        rl_module: RLModule,
        input_: Optional[Any] = None,
        episodes: List[EpisodeType],
-        ctx: ConnectorContextV2,
+        explore: bool,
+        persistent_data: Optional[dict] = None,
        **kwargs,
-    ) -> Any:
-
+    ):
+        # Make sure the user does not have to send an initial input into this
+        # pipeline; it might just be empty and will be populated from `episodes`.
         return super().__call__(
-            # Make sure user does not have to send initial `input_` into this env-to-module
-            # pipeline. This would be the expected behavior b/c after calling the env,
-            # we don't have any data dict yet, only a list of Episode objects.
- input_=input_ or {}, + rl_module=rl_module, + input_=input_ if input_ is not None else {}, episodes=episodes, - ctx=ctx, + explore=explore, + persistent_data=persistent_data, **kwargs, ) diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py index 5725f2a7a252e..78223f2c92f0e 100644 --- a/rllib/connectors/learner/learner_connector_pipeline.py +++ b/rllib/connectors/learner/learner_connector_pipeline.py @@ -1,6 +1,7 @@ -from typing import Any, List, Optional +from typing import List, Optional + +import gymnasium as gym -from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 from ray.rllib.connectors.learner.default_learner_connector import ( @@ -8,25 +9,34 @@ ) -class LearnerConnectorPipeline(ConnectorPipelineV2): - """The superclass for any module-to-env pipelines.""" - +class LearnerPipeline(ConnectorPipelineV2): def __init__( self, *, - ctx: ConnectorContextV2, connectors: Optional[List[ConnectorV2]] = None, + input_observation_space: Optional[gym.Space], + input_action_space: Optional[gym.Space], + env: Optional[gym.Env] = None, + rl_module: Optional["RLModule"] = None, **kwargs, ): - super().__init__(ctx=ctx, connectors=connectors, **kwargs) + super().__init__( + connectors=connectors, + input_observation_space=input_observation_space, + input_action_space=input_action_space, + env=env, + rl_module=rl_module, + **kwargs, + ) # Add the default final connector piece for learner pipelines: - # Makes sure observations from episodes are in the train batch as well as - # the correct state inputs in case the RLModule is stateful. In the latter case, - # also takes care of the time rank and zero padding. + # Making sure that we have - at the minimum - observations and that the data + # is time-ranked (if we have a stateful model) and properly zero-padded. if ( len(self.connectors) == 0 or type(self.connectors[-1]) is not DefaultLearnerConnector ): - # Append default learner connector piece at the end. 
- self.append(DefaultLearnerConnector(ctx=ctx)) + self.append(DefaultLearnerConnector( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + )) diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py index b1b3be1d35b48..a9621c3162c90 100644 --- a/rllib/connectors/module_to_env/module_to_env_pipeline.py +++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py @@ -1,22 +1,31 @@ -from typing import Any, List, Optional +from typing import List, Optional + +import gymnasium as gym -from ray.rllib.connectors.connector_context_v2 import ConnectorContextV2 from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv class ModuleToEnvPipeline(ConnectorPipelineV2): - """The superclass for any module-to-env pipelines.""" - def __init__( self, *, - ctx: ConnectorContextV2, connectors: Optional[List[ConnectorV2]] = None, + input_observation_space: Optional[gym.Space], + input_action_space: Optional[gym.Space], + env: Optional[gym.Env] = None, + rl_module: Optional["RLModule"] = None, **kwargs, ): - super().__init__(ctx=ctx, connectors=connectors, **kwargs) + super().__init__( + connectors=connectors, + input_observation_space=input_observation_space, + input_action_space=input_action_space, + env=env, + rl_module=rl_module, + **kwargs, + ) # Add the default final connector piece for env-to-module pipelines: # Sampling actions from action_dist_inputs and add them to input, iff this has @@ -25,4 +34,9 @@ def __init__( len(self.connectors) == 0 or type(self.connectors[-1]) is not DefaultModuleToEnv ): - self.append(DefaultModuleToEnv(ctx=ctx)) + self.append(DefaultModuleToEnv( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + env=env, + rl_module=rl_module, + )) From a9acbee7dec50d2d6cbf948f2bce48d2b8f3c5a2 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 14 Dec 2023 15:54:00 +0100 Subject: [PATCH 07/15] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 116 +++++++++++++ rllib/connectors/connector_v2.py | 18 +- .../env_to_module/frame_stacking.py | 120 +++++++++++++ .../env_to_module/prev_action_prev_reward.py | 6 +- rllib/connectors/learner/frame_stacking.py | 8 + .../learner/learner_connector_pipeline.py | 2 +- rllib/env/wrappers/atari_wrappers.py | 20 ++- ..._CONNECTOR_EXAMPLES_TO_SEPARATE_FOLDER.txt | 0 .../connectors/connector_v2_frame_stacking.py | 164 ++++++++++++++++++ 9 files changed, 436 insertions(+), 18 deletions(-) create mode 100644 rllib/connectors/env_to_module/frame_stacking.py create mode 100644 rllib/connectors/learner/frame_stacking.py create mode 100644 rllib/examples/connectors/TODO_MOVE_OLD_CONNECTOR_EXAMPLES_TO_SEPARATE_FOLDER.txt create mode 100644 rllib/examples/connectors/connector_v2_frame_stacking.py diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 1ee0761ae49fd..4d3c7d4de7d8b 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -100,6 +100,7 @@ if TYPE_CHECKING: from ray.rllib.algorithms.algorithm import Algorithm + from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.core.learner import Learner from ray.rllib.evaluation.episode import Episode as OldEpisode @@ -327,6 +328,8 @@ def __init__(self, algo_class=None): 
self.num_envs_per_worker = 1 self.create_env_on_local_worker = False self.enable_connectors = True + self._env_to_module_connector = None + self._module_to_env_connector = None # TODO (sven): Rename into `sample_timesteps` (or `sample_duration` # and `sample_duration_unit` (replacing batch_mode), like we do it # in the evaluation config). @@ -374,6 +377,7 @@ def __init__(self, algo_class=None): except AttributeError: pass + self._learner_connector = None self.optimizer = {} self.max_requests_in_flight_per_sampler_worker = 2 self._learner_class = None @@ -1137,6 +1141,95 @@ class directly. Note that this arg can also be specified via logger_creator=self.logger_creator, ) + def build_env_to_module_connector(self, env): + custom_connectors = [] + + # Create an env-to-module connector pipeline (including RLlib's default + # env->module connector piece) and return it. + if self._env_to_module_connector is not None: + val_ = self._env_to_module_connector(env) + + from ray.rllib.connectors.connector_v2 import ConnectorV2 + from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 + + if ( + isinstance(val_, ConnectorV2) + and not isinstance(val_, ConnectorPipelineV2) + ): + custom_connectors = [val_] + else: + return val_ + + from ray.rllib.connectors.env_to_module.env_to_module_pipeline import ( + EnvToModulePipeline + ) + + return EnvToModulePipeline( + connectors=custom_connectors, + input_observation_space=env.single_observation_space, + input_action_space=env.single_action_space, + env=env, + ) + + def build_module_to_env_connector(self, env): + custom_connectors = [] + + # Create a module-to-env connector pipeline (including RLlib's default + # module->env connector piece) and return it. + if self._module_to_env_connector is not None: + val_ = self._module_to_env_connector(env) + + from ray.rllib.connectors.connector_v2 import ConnectorV2 + from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 + + if ( + isinstance(val_, ConnectorV2) + and not isinstance(val_, ConnectorPipelineV2) + ): + custom_connectors = [val_] + else: + return val_ + + from ray.rllib.connectors.module_to_env.module_to_env_pipeline import ( + ModuleToEnvPipeline + ) + + return ModuleToEnvPipeline( + connectors=custom_connectors, + input_observation_space=env.single_observation_space, + input_action_space=env.single_action_space, + env=env, + ) + + def build_learner_connector(self, input_observation_space, input_action_space): + custom_connectors = [] + + # Create a learner connector pipeline (including RLlib's default + # learner connector piece) and return it. 
+ if self._learner_connector is not None: + val_ = self._learner_connector(input_observation_space, input_action_space) + + from ray.rllib.connectors.connector_v2 import ConnectorV2 + from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 + + if ( + isinstance(val_, ConnectorV2) + and not isinstance(val_, ConnectorPipelineV2) + ): + custom_connectors = [val_] + else: + return val_ + + from ray.rllib.connectors.learner.learner_connector_pipeline import ( + LearnerConnectorPipeline + ) + + return LearnerConnectorPipeline( + connectors=custom_connectors, + input_observation_space=input_observation_space, + input_action_space=input_action_space, + ) + def python_environment( self, *, @@ -1477,6 +1570,12 @@ def rollouts( create_env_on_local_worker: Optional[bool] = NotProvided, sample_collector: Optional[Type[SampleCollector]] = NotProvided, enable_connectors: Optional[bool] = NotProvided, + env_to_module_connector: Optional[ + Callable[[EnvType], "ConnectorV2"] + ] = NotProvided, + module_to_env_connector: Optional[ + Callable[[EnvType, "RLModule"], "ConnectorV2"] + ] = NotProvided, use_worker_filter_stats: Optional[bool] = NotProvided, update_worker_filter_stats: Optional[bool] = NotProvided, rollout_fragment_length: Optional[Union[int, str]] = NotProvided, @@ -1522,6 +1621,11 @@ def rollouts( enable_connectors: Use connector based environment runner, so that all preprocessing of obs and postprocessing of actions are done in agent and action connectors. + env_to_module_connector: A callable taking an Env as input arg and returning + an env-to-module ConnectorV2 (might be a pipeline) object. + module_to_env_connector: A callable taking an Env and an RLModule as input + args and returning a module-to-env ConnectorV2 (might be a pipeline) + object. use_worker_filter_stats: Whether to use the workers in the WorkerSet to update the central filters (held by the local worker). If False, stats from the workers will not be used and discarded. @@ -1609,6 +1713,10 @@ def rollouts( self.create_env_on_local_worker = create_env_on_local_worker if enable_connectors is not NotProvided: self.enable_connectors = enable_connectors + if env_to_module_connector is not NotProvided: + self._env_to_module_connector = env_to_module_connector + if module_to_env_connector is not NotProvided: + self._module_to_env_connector = module_to_env_connector if use_worker_filter_stats is not NotProvided: self.use_worker_filter_stats = use_worker_filter_stats if update_worker_filter_stats is not NotProvided: @@ -1719,6 +1827,9 @@ def training( optimizer: Optional[dict] = NotProvided, max_requests_in_flight_per_sampler_worker: Optional[int] = NotProvided, learner_class: Optional[Type["Learner"]] = NotProvided, + learner_connector: Optional[ + Callable[["RLModule"], "ConnectorV2"] + ] = NotProvided, # Deprecated arg. _enable_learner_api: Optional[bool] = NotProvided, ) -> "AlgorithmConfig": @@ -1780,6 +1891,9 @@ def training( in your experiment of timesteps. learner_class: The `Learner` class to use for (distributed) updating of the RLModule. Only used when `_enable_new_api_stack=True`. + learner_connector: A callable taking an env observation space and an env + action space as inputs and returning a learner ConnectorV2 (might be + a pipeline) object. Returns: This updated AlgorithmConfig object. 
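As a usage sketch (hedged; it relies only on the hooks introduced in this patch plus the PrevRewardPrevActionEnvToModule piece added earlier in the series): a callable passed to rollouts(env_to_module_connector=...) may return a single piece, which build_env_to_module_connector() wraps into an EnvToModulePipeline and extends with the default piece, or a ready-made ConnectorPipelineV2, which is returned as-is.

from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.connectors.env_to_module.prev_action_prev_reward import (
    PrevRewardPrevActionEnvToModule,
)


def _env_to_module(env):
    # Return a single piece; RLlib wraps it into a pipeline and appends
    # the default env-to-module connector behind it.
    return PrevRewardPrevActionEnvToModule(
        input_observation_space=env.single_observation_space,
        input_action_space=env.single_action_space,
        env=env,
        n_prev_actions=2,
        n_prev_rewards=2,
    )


config = PPOConfig().rollouts(env_to_module_connector=_env_to_module)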
@@ -1824,6 +1938,8 @@ def training( ) if learner_class is not NotProvided: self._learner_class = learner_class + if learner_connector is not NotProvided: + self._learner_connector = learner_connector return self diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index b201c804ca2d0..89e4dca793305 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -79,19 +79,14 @@ def action_space(self, value): def __init__( self, *, - input_observation_space: Optional[gym.Space], - input_action_space: Optional[gym.Space], + input_observation_space: gym.Space, + input_action_space: gym.Space, env: Optional[gym.Env] = None, - #rl_module: Optional["RLModule"] = None, **kwargs, ): """Initializes a ConnectorV2 instance. Args: - env: An optional env object that the connector might need to know about. - Note that normally, env-to-module and module-to-env connectors get this - information at construction time, but learner connectors won't (b/c - Learner objects don't carry an environment object). input_observation_space: The (mandatory) input observation space. This is the space coming from a previous connector piece in the (env-to-module or learner) pipeline or it is directly defined within @@ -100,16 +95,15 @@ def __init__( is the space coming from a previous connector piece in the (module-to-env) pipeline or it is directly defined within the used gym.Env. - #rl_module: An optional RLModule object that the connector might need to know - # about. Note that normally, only module-to-env connectors get this - # information at construction time, but env-to-module and learner - # connectors won't (b/c they get constructed before the RLModule). + env: An optional env object that the connector might need to know about. + Note that normally, env-to-module and module-to-env connectors get this + information at construction time, but learner connectors won't (b/c + Learner objects don't carry an environment object). **kwargs: Forward API-compatibility kwargs. """ self.input_observation_space = input_observation_space self.input_action_space = input_action_space self.env = env - #self.rl_module = rl_module @abc.abstractmethod def __call__( diff --git a/rllib/connectors/env_to_module/frame_stacking.py b/rllib/connectors/env_to_module/frame_stacking.py new file mode 100644 index 0000000000000..7d2f2012dc78e --- /dev/null +++ b/rllib/connectors/env_to_module/frame_stacking.py @@ -0,0 +1,120 @@ +from functools import partial +import numpy as np +from typing import Any, List, Optional + +import gymnasium as gym +import tree # pip install dm_tree + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.spaces.space_utils import batch, get_base_struct_from_space +from ray.rllib.utils.typing import EpisodeType + + +class _FrameStackingConnector(ConnectorV2): + """A connector piece that stacks the previous n observations into one.""" + + def __init__( + self, + *, + # Base class constructor args. + input_observation_space: gym.Space, + input_action_space: gym.Space, + env: Optional[gym.Env] = None, + # Specific framestacking args. + num_frames: int = 1, + as_learner_connector: bool = False, + **kwargs, + ): + """Initializes a _FrameStackingConnector instance. 
+
+        Args:
+            num_frames: The number of observation frames to stack up (into a single
+                observation) for the RLModule's forward pass.
+            as_learner_connector: Whether this connector is part of a Learner connector
+                pipeline, as opposed to an env-to-module pipeline.
+        """
+        super().__init__(
+            input_observation_space=input_observation_space,
+            input_action_space=input_action_space,
+            env=env,
+            **kwargs,
+        )
+
+        self.num_frames = num_frames
+        self.as_learner_connector = as_learner_connector
+
+        # Some assumptions: Space is box AND last dim (the stacking one) is 1.
+        assert isinstance(self.observation_space, gym.spaces.Box)
+        assert self.observation_space.shape[-1] == 1
+
+        # Change our observation space according to the given stacking settings.
+        self.observation_space = gym.spaces.Box(
+            low=np.repeat(
+                self.observation_space.low, repeats=self.num_frames, axis=-1
+            ),
+            high=np.repeat(
+                self.observation_space.high, repeats=self.num_frames, axis=-1
+            ),
+            shape=list(self.observation_space.shape)[:-1] + [self.num_frames],
+            dtype=self.observation_space.dtype,
+        )
+
+    @override(ConnectorV2)
+    def __call__(
+        self,
+        *,
+        rl_module: RLModule,
+        input_: Optional[Any],
+        episodes: List[EpisodeType],
+        explore: Optional[bool] = None,
+        persistent_data: Optional[dict] = None,
+        **kwargs,
+    ) -> Any:
+        # This is a data-in-data-out connector, so we expect `input_` to be a dict
+        # with: key=column name, e.g. "obs" and value=[data to be processed by RLModule].
+        # We will add to `input_` the last n observations.
+
+        obs = []
+        for episode in episodes:
+
+            # Learner connector pipeline. Episodes have been finalized/numpy'ized.
+            if self.as_learner_connector:
+                # Loop through each timestep in the episode and add the previous n
+                # observations (based on that timestep) to the batch.
+                for ts in range(len(episode)):
+                    obs.append(
+                        episode.get_observations(
+                            # Extract n observations from `ts` to `ts - n`
+                            # (excluding `ts - n`).
+                            indices=slice(ts - self.num_frames + 1, ts + 1),
+                            # Make sure negative indices are NOT interpreted as "counting
+                            # from the end", but as absolute indices meaning they refer
+                            # to timesteps before 0 (which is the lookback buffer).
+                            neg_indices_left_of_zero=True,
+                            # In case we are at the very beginning of the episode, e.g.
+                            # ts==0, fill the left side with zero-observations.
+                            fill=0.0,
+                        )
+                    )
+            # Env-to-module pipeline. Episodes still operate on lists.
+            else:
+                assert not episode.is_finalized
+                obs.append(
+                    batch(
+                        episode.get_observations(
+                            indices=slice(-self.num_frames + 1, None),
+                            fill=0.0,
+                        )
+                    )
+                )
+
+        input_[SampleBatch.OBS] = batch(obs)
+        return input_
+
+
+FrameStackingEnvToModule = partial(
+    _FrameStackingConnector, as_learner_connector=False
+)
diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py
index cf11edba10298..cb381b6e5e466 100644
--- a/rllib/connectors/env_to_module/prev_action_prev_reward.py
+++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py
@@ -19,8 +19,8 @@ def __init__(
         self,
         *,
         # Base class constructor args.
-        input_observation_space: Optional[gym.Space],
-        input_action_space: Optional[gym.Space],
+        input_observation_space: gym.Space,
+        input_action_space: gym.Space,
         env: Optional[gym.Env] = None,
         # Specific prev. r/a args.
         n_prev_actions: int = 1,
@@ -28,7 +28,7 @@
         as_learner_connector: bool = False,
         **kwargs,
     ):
-        """Initializes a PrevRewardPrevActionConnector instance.
+        """Initializes a _PrevRewardPrevActionConnector instance.
Args: n_prev_actions: The number of previous actions to include in the output diff --git a/rllib/connectors/learner/frame_stacking.py b/rllib/connectors/learner/frame_stacking.py new file mode 100644 index 0000000000000..4eb0c09bd6e41 --- /dev/null +++ b/rllib/connectors/learner/frame_stacking.py @@ -0,0 +1,8 @@ +from functools import partial + +from ray.rllib.connectors.env_to_module.frame_stacking import _FrameStackingConnector + + +FrameStackingLearner = partial( + _FrameStackingConnector, as_learner_connector=True +) diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py index 78223f2c92f0e..acc9a9a1946a2 100644 --- a/rllib/connectors/learner/learner_connector_pipeline.py +++ b/rllib/connectors/learner/learner_connector_pipeline.py @@ -9,7 +9,7 @@ ) -class LearnerPipeline(ConnectorPipelineV2): +class LearnerConnectorPipeline(ConnectorPipelineV2): def __init__( self, *, diff --git a/rllib/env/wrappers/atari_wrappers.py b/rllib/env/wrappers/atari_wrappers.py index 0dfd74729efae..2919685cf6bc5 100644 --- a/rllib/env/wrappers/atari_wrappers.py +++ b/rllib/env/wrappers/atari_wrappers.py @@ -240,6 +240,22 @@ def reset(self, **kwargs): return self.env.reset(**kwargs) +class NormalizedImageEnv(gym.ObservationWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.observation_space = gym.spaces.Box( + -1.0, + 1.0, + shape=self.observation_space.shape, + dtype=np.float32, + ) + + # Divide by scale and center around 0.0, such that observations are in the range + # of -1.0 and 1.0. + def observation(self, observation): + return (observation.astype(np.float32) / 128.0) - 1.0 + + @PublicAPI class WarpFrame(gym.ObservationWrapper): def __init__(self, env, dim): @@ -266,8 +282,8 @@ def __init__(self, env, k): self.frames = deque([], maxlen=k) shp = env.observation_space.shape self.observation_space = spaces.Box( - low=0, - high=255, + low=np.repeat(env.observation_space.low, repeats=k, axis=-1), + high=np.repeat(env.observation_space.high, repeats=k, axis=-1), shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype, ) diff --git a/rllib/examples/connectors/TODO_MOVE_OLD_CONNECTOR_EXAMPLES_TO_SEPARATE_FOLDER.txt b/rllib/examples/connectors/TODO_MOVE_OLD_CONNECTOR_EXAMPLES_TO_SEPARATE_FOLDER.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py new file mode 100644 index 0000000000000..e2227fbb61be6 --- /dev/null +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -0,0 +1,164 @@ +import argparse +from functools import partial +import os + +import gymnasium as gym + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule +from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner +from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner +from ray.rllib.env.wrappers.atari_wrappers import ( + EpisodicLifeEnv, + # FrameStack, # <- we do not want env-based frame stacking + MaxAndSkipEnv, + NoopResetEnv, + NormalizedImageEnv, + WarpFrame, # gray + resize +) +from ray.rllib.utils.test_utils import check_learning_achieved +from ray import tune + + +parser = argparse.ArgumentParser() +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + 
help="The DL framework specifier.", +) +parser.add_argument( + "--num-frames", + type=int, + default=4, + help="The number of observation frames to stack.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=2000, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=1000000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=400.0, help="Reward at which we stop training." +) + + +if __name__ == "__main__": + import ray + from ray import air, tune + + args = parser.parse_args() + + ray.init() + + # Define our custom connector pipelines. + def _make_env_to_module_connector(env): + # Create the env-to-module connector. We return an individual connector piece + # here, which RLlib will then automatically integrate into a pipeline (and + # add its default connector piece to the end of that pipeline). + return FrameStackingEnvToModule( + input_observation_space=env.single_observation_space, + input_action_space=env.single_action_space, + env=env, + num_frames=args.num_frames, + ) + + def _make_learner_connector(input_observation_space, input_action_space): + # Create the learner connector. + return FrameStackingLearner( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + num_frames=args.num_frames, + ) + + # Create a custom Atari setup (w/o the usual Rllib-hard-coded framestacking in it). + # We would like our frame stacking connector to do this job. + tune.register_env( + "env", + ( + lambda cfg: ( + EpisodicLifeEnv( # each life is one episode + MaxAndSkipEnv( # frameskip=4 and take max over these 4 frames + NoopResetEnv( # perform n noops after a reset + # partial(FrameStack, k=4)( # <- no env-based framestacking! + NormalizedImageEnv( + partial(WarpFrame, dim=64)( # grayscale + resize + partial(gym.wrappers.TimeLimit, max_episode_steps=108000)( + gym.make("ALE/Pong-v5", **dict( + cfg, **{"render_mode": "rgb_array"} + )) + ) + ))))) + ) + ), + ) + + config = ( + PPOConfig() + .framework(args.framework) + .environment( + "env", + env_config={ + # Make analogous to old v4 + NoFrameskip. + "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + }, + clip_rewards=True, + ) + # Use new API stack ... + .experimental(_enable_new_api_stack=True) + .rollouts( + # ... new EnvRunner and our frame stacking env-to-module connector. + env_runner_cls=SingleAgentEnvRunner, + env_to_module_connector=_make_env_to_module_connector, + ) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + .training( + # Use our frame stacking learner connector. 
+ learner_connector=_make_learner_connector, + + lambda_=0.95, + kl_coeff=0.5, + clip_param=0.1, + vf_clip_param=10.0, + entropy_coeff=0.01, + num_sgd_iter=10, + lr=0.00025, # needs to be adjusted: `lr=0.00025*num_learner_workers` + grad_clip=100.0, + grad_clip_by="global_norm", + model={ + "vf_share_layers": True, + "conv_filters": [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], + "conv_activation": "relu", + "post_fcnet_hiddens": [256], + }, + ) + ) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + tuner = tune.Tuner( + config.algo_class, + param_space=config, + run_config=air.RunConfig(stop=stop), + tune_config=tune.TuneConfig(num_samples=1), + ) + results = tuner.fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() From 5fe97e1aab7f3af1169a0527eae2129b82a311f8 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 14 Dec 2023 15:55:47 +0100 Subject: [PATCH 08/15] LINT Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 21 ++++++-------- rllib/connectors/connector_pipeline_v2.py | 10 ++++--- .../env_to_module/env_to_module_pipeline.py | 12 ++++---- .../env_to_module/frame_stacking.py | 28 ++++++++---------- .../env_to_module/prev_action_prev_reward.py | 22 +++++++------- rllib/connectors/learner/frame_stacking.py | 4 +-- .../learner/learner_connector_pipeline.py | 10 ++++--- .../module_to_env/module_to_env_pipeline.py | 14 +++++---- .../connectors/connector_v2_frame_stacking.py | 29 +++++++++++-------- 9 files changed, 77 insertions(+), 73 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 4d3c7d4de7d8b..0c18a4dfc3fd9 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1152,16 +1152,15 @@ def build_env_to_module_connector(self, env): from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 - if ( - isinstance(val_, ConnectorV2) - and not isinstance(val_, ConnectorPipelineV2) + if isinstance(val_, ConnectorV2) and not isinstance( + val_, ConnectorPipelineV2 ): custom_connectors = [val_] else: return val_ from ray.rllib.connectors.env_to_module.env_to_module_pipeline import ( - EnvToModulePipeline + EnvToModulePipeline, ) return EnvToModulePipeline( @@ -1182,16 +1181,15 @@ def build_module_to_env_connector(self, env): from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 - if ( - isinstance(val_, ConnectorV2) - and not isinstance(val_, ConnectorPipelineV2) + if isinstance(val_, ConnectorV2) and not isinstance( + val_, ConnectorPipelineV2 ): custom_connectors = [val_] else: return val_ from ray.rllib.connectors.module_to_env.module_to_env_pipeline import ( - ModuleToEnvPipeline + ModuleToEnvPipeline, ) return ModuleToEnvPipeline( @@ -1212,16 +1210,15 @@ def build_learner_connector(self, input_observation_space, input_action_space): from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 - if ( - isinstance(val_, ConnectorV2) - and not isinstance(val_, ConnectorPipelineV2) + if isinstance(val_, ConnectorV2) and not isinstance( + val_, ConnectorPipelineV2 ): custom_connectors = [val_] else: return val_ from ray.rllib.connectors.learner.learner_connector_pipeline import ( - LearnerConnectorPipeline + 
LearnerConnectorPipeline, ) return LearnerConnectorPipeline( diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index 7f9336fe710d2..331e6294e5e58 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -8,7 +8,7 @@ from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv from ray.rllib.connectors.learner.default_learner_connector import ( - DefaultLearnerConnector + DefaultLearnerConnector, ) from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.utils.annotations import override @@ -81,7 +81,9 @@ def remove(self, name_or_class: Union[str, Type]): if idx >= 0: del self.connectors[idx] self._fix_input_output_types() - logger.info(f"Removed connector {name_or_class} from {self.__class__.__name__}.") + logger.info( + f"Removed connector {name_or_class} from {self.__class__.__name__}." + ) else: logger.warning( f"Trying to remove a non-existent connector {name_or_class}." @@ -268,8 +270,8 @@ def _fix_input_output_types(self): if len(self.connectors) > 0: self.input_type = self.connectors[0].input_type self.output_type = self.connectors[-1].output_type - #self.observation_space = self.connectors[-1].observation_space - #self.action_space = self.connectors[-1].action_space + # self.observation_space = self.connectors[-1].observation_space + # self.action_space = self.connectors[-1].action_space else: self.input_type = None self.output_type = None diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py index e5b81c254589d..b0f1027799a9f 100644 --- a/rllib/connectors/env_to_module/env_to_module_pipeline.py +++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py @@ -36,11 +36,13 @@ def __init__( len(self.connectors) == 0 or type(self.connectors[-1]) is not DefaultEnvToModule ): - self.append(DefaultEnvToModule( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - env=env, - )) + self.append( + DefaultEnvToModule( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + env=env, + ) + ) @override(ConnectorPipelineV2) def __call__( diff --git a/rllib/connectors/env_to_module/frame_stacking.py b/rllib/connectors/env_to_module/frame_stacking.py index 7d2f2012dc78e..090d9fcda2205 100644 --- a/rllib/connectors/env_to_module/frame_stacking.py +++ b/rllib/connectors/env_to_module/frame_stacking.py @@ -17,16 +17,16 @@ class _FrameStackingConnector(ConnectorV2): """A connector piece that stacks the previous n observations into one.""" def __init__( - self, - *, - # Base class constructor args. - input_observation_space: gym.Space, - input_action_space: gym.Space, - env: Optional[gym.Env] = None, - # Specific framestacking args. - num_frames: int = 1, - as_learner_connector: bool = False, - **kwargs, + self, + *, + # Base class constructor args. + input_observation_space: gym.Space, + input_action_space: gym.Space, + env: Optional[gym.Env] = None, + # Specific framestacking args. + num_frames: int = 1, + as_learner_connector: bool = False, + **kwargs, ): """Initializes a _FrameStackingConnector instance. @@ -52,9 +52,7 @@ def __init__( # Change our observation space according to the given stacking settings. 
self.observation_space = gym.spaces.Box( - low=np.repeat( - self.observation_space.low, repeats=self.num_frames, axis=-1 - ), + low=np.repeat(self.observation_space.low, repeats=self.num_frames, axis=-1), high=np.repeat( self.observation_space.high, repeats=self.num_frames, axis=-1 ), @@ -115,6 +113,4 @@ def __call__( return input_ -FrameStackingEnvToModule = partial( - _FrameStackingConnector, as_learner_connector=False -) +FrameStackingEnvToModule = partial(_FrameStackingConnector, as_learner_connector=False) diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py index cb381b6e5e466..a7284dd582377 100644 --- a/rllib/connectors/env_to_module/prev_action_prev_reward.py +++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py @@ -16,17 +16,17 @@ class _PrevRewardPrevActionConnector(ConnectorV2): """A connector piece that adds previous rewards and actions to the input.""" def __init__( - self, - *, - # Base class constructor args. - input_observation_space: gym.Space, - input_action_space: gym.Space, - env: Optional[gym.Env] = None, - # Specific prev. r/a args. - n_prev_actions: int = 1, - n_prev_rewards: int = 1, - as_learner_connector: bool = False, - **kwargs, + self, + *, + # Base class constructor args. + input_observation_space: gym.Space, + input_action_space: gym.Space, + env: Optional[gym.Env] = None, + # Specific prev. r/a args. + n_prev_actions: int = 1, + n_prev_rewards: int = 1, + as_learner_connector: bool = False, + **kwargs, ): """Initializes a _PrevRewardPrevActionConnector instance. diff --git a/rllib/connectors/learner/frame_stacking.py b/rllib/connectors/learner/frame_stacking.py index 4eb0c09bd6e41..f53a62bd6a726 100644 --- a/rllib/connectors/learner/frame_stacking.py +++ b/rllib/connectors/learner/frame_stacking.py @@ -3,6 +3,4 @@ from ray.rllib.connectors.env_to_module.frame_stacking import _FrameStackingConnector -FrameStackingLearner = partial( - _FrameStackingConnector, as_learner_connector=True -) +FrameStackingLearner = partial(_FrameStackingConnector, as_learner_connector=True) diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py index acc9a9a1946a2..766654815ce4c 100644 --- a/rllib/connectors/learner/learner_connector_pipeline.py +++ b/rllib/connectors/learner/learner_connector_pipeline.py @@ -36,7 +36,9 @@ def __init__( len(self.connectors) == 0 or type(self.connectors[-1]) is not DefaultLearnerConnector ): - self.append(DefaultLearnerConnector( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - )) + self.append( + DefaultLearnerConnector( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + ) + ) diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py index a9621c3162c90..130ed813f6f78 100644 --- a/rllib/connectors/module_to_env/module_to_env_pipeline.py +++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py @@ -34,9 +34,11 @@ def __init__( len(self.connectors) == 0 or type(self.connectors[-1]) is not DefaultModuleToEnv ): - self.append(DefaultModuleToEnv( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - env=env, - rl_module=rl_module, - )) + self.append( + DefaultModuleToEnv( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + env=env, + rl_module=rl_module, + ) + ) 
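To illustrate the append logic shared by all three pipeline classes (a sketch under the constructor signatures from this patch series): even a pipeline built with no custom pieces ends up with its default piece as the final entry.

import gymnasium as gym

from ray.rllib.connectors.env_to_module.env_to_module_pipeline import (
    EnvToModulePipeline,
)

env = gym.make("CartPole-v1")
pipeline = EnvToModulePipeline(
    connectors=[],  # no custom pieces given
    input_observation_space=env.observation_space,
    input_action_space=env.action_space,
    env=env,
)
# The constructor appended RLlib's default env-to-module piece.
assert type(pipeline.connectors[-1]).__name__ == "DefaultEnvToModule"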
diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py index e2227fbb61be6..93609f22cbd4f 100644 --- a/rllib/examples/connectors/connector_v2_frame_stacking.py +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -17,7 +17,6 @@ WarpFrame, # gray + resize ) from ray.rllib.utils.test_utils import check_learning_achieved -from ray import tune parser = argparse.ArgumentParser() @@ -86,17 +85,24 @@ def _make_learner_connector(input_observation_space, input_action_space): ( lambda cfg: ( EpisodicLifeEnv( # each life is one episode - MaxAndSkipEnv( # frameskip=4 and take max over these 4 frames - NoopResetEnv( # perform n noops after a reset - # partial(FrameStack, k=4)( # <- no env-based framestacking! - NormalizedImageEnv( - partial(WarpFrame, dim=64)( # grayscale + resize - partial(gym.wrappers.TimeLimit, max_episode_steps=108000)( - gym.make("ALE/Pong-v5", **dict( - cfg, **{"render_mode": "rgb_array"} - )) + MaxAndSkipEnv( # frameskip=4 and take max over these 4 frames + NoopResetEnv( # perform n noops after a reset + # partial(FrameStack, k=4)( # <- no env-based framestacking! + NormalizedImageEnv( + partial(WarpFrame, dim=64)( # grayscale + resize + partial( + gym.wrappers.TimeLimit, max_episode_steps=108000 + )( + gym.make( + "ALE/Pong-v5", + **dict(cfg, **{"render_mode": "rgb_array"}) + ) + ) + ) + ) + ) + ) ) - ))))) ) ), ) @@ -125,7 +131,6 @@ def _make_learner_connector(input_observation_space, input_action_space): .training( # Use our frame stacking learner connector. learner_connector=_make_learner_connector, - lambda_=0.95, kl_coeff=0.5, clip_param=0.1, From 213f0d122efe16325cd9275d32bc3b05f401eed1 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 14 Dec 2023 16:18:27 +0100 Subject: [PATCH 09/15] LINT Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 1 + rllib/connectors/connector_pipeline_v2.py | 15 +-- rllib/connectors/connector_v2.py | 2 +- .../env_to_module/frame_stacking.py | 14 +-- .../env_to_module/prev_action_prev_reward.py | 13 ++- .../learner/learner_connector_pipeline.py | 3 +- .../module_to_env/module_to_env_pipeline.py | 3 +- .../tests/test_from_module_connectors.py | 106 ------------------ .../connectors/connector_v2_frame_stacking.py | 2 +- 9 files changed, 27 insertions(+), 132 deletions(-) delete mode 100644 rllib/connectors/tests/test_from_module_connectors.py diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 0c18a4dfc3fd9..4c181be809ea8 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -102,6 +102,7 @@ from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.core.learner import Learner + from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.evaluation.episode import Episode as OldEpisode logger = logging.getLogger(__name__) diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index 331e6294e5e58..893dfcb57b49e 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -1,15 +1,8 @@ from collections import defaultdict import logging -from typing import Any, List, Optional, Union - -import gymnasium as gym +from typing import Any, Dict, List, Optional, Type, Union from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.connectors.env_to_module.default_env_to_module import 
DefaultEnvToModule -from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv -from ray.rllib.connectors.learner.default_learner_connector import ( - DefaultLearnerConnector, -) from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import EpisodeType @@ -215,7 +208,7 @@ def set_state(self, state: Dict[str, Any]): raise e return ConnectorPipelineV2(ctx, connectors) - def __str__(self, indentation: int = 0): + def __repr__(self, indentation: int = 0): return "\n".join( [" " * indentation + self.__class__.__name__] + [c.__str__(indentation + 4) for c in self.connectors] @@ -270,6 +263,10 @@ def _fix_input_output_types(self): if len(self.connectors) > 0: self.input_type = self.connectors[0].input_type self.output_type = self.connectors[-1].output_type + # TODO (sven): Create some examples for pipelines, in which the spaces + # are changed several times by the individual pieces. + # self.input_observation_space = self.connectors[0].input_observation_space + # self.input_action_space = self.connectors[0].input_action_space # self.observation_space = self.connectors[-1].observation_space # self.action_space = self.connectors[-1].action_space else: diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index 89e4dca793305..2ce20dd871b6f 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -1,5 +1,5 @@ import abc -from typing import Any, List, Optional +from typing import Any, Dict, List, Optional import gymnasium as gym diff --git a/rllib/connectors/env_to_module/frame_stacking.py b/rllib/connectors/env_to_module/frame_stacking.py index 090d9fcda2205..c6ac262da0ae7 100644 --- a/rllib/connectors/env_to_module/frame_stacking.py +++ b/rllib/connectors/env_to_module/frame_stacking.py @@ -3,13 +3,12 @@ from typing import Any, List, Optional import gymnasium as gym -import tree # pip install dm_tree from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override -from ray.rllib.utils.spaces.space_utils import batch, get_base_struct_from_space +from ray.rllib.utils.spaces.space_utils import batch from ray.rllib.utils.typing import EpisodeType @@ -72,8 +71,8 @@ def __call__( **kwargs, ) -> Any: # This is a data-in-data-out connector, so we expect `input_` to be a dict - # with: key=column name, e.g. "obs" and value=[data to be processed by RLModule]. - # We will add to `input_` the last n observations. + # with: key=column name, e.g. "obs" and value=[data to be processed by + # RLModule]. We will add to `input_` the last n observations. obs = [] for episode in episodes: @@ -88,9 +87,10 @@ def __call__( # Extract n observations from `ts` to `ts - n` # (excluding `ts - n`). indices=slice(ts - self.num_frames + 1, ts + 1), - # Make sure negative indices are NOT interpreted as "counting - # from the end", but as absolute indices meaning they refer - # to timesteps before 0 (which is the lookback buffer). + # Make sure negative indices are NOT interpreted as + # "counting from the end", but as absolute indices meaning + # they refer to timesteps before 0 (which is the lookback + # buffer). neg_indices_left_of_zero=True, # In case we are at the very beginning of the episode, e.g. # ts==0, fill the left side with zero-observations. 
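The neg_indices_left_of_zero / fill semantics that the comments above describe can be pictured with plain numpy; lookback_get below is a hypothetical stand-in for episode.get_observations(...), not an actual episode method.

import numpy as np


def lookback_get(seq, start, stop, fill=0.0):
    # Indices < 0 address timesteps *before* the episode start (the lookback
    # buffer) and are zero-filled here; they do NOT count from the end.
    return np.array([seq[i] if i >= 0 else fill for i in range(start, stop)])


obs = [10.0, 11.0, 12.0, 13.0]
num_frames, ts = 4, 1
# At ts=1, two of the four requested frames lie before the episode start:
stacked = lookback_get(obs, ts - num_frames + 1, ts + 1)
print(stacked)  # -> [ 0.  0. 10. 11.]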
diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py index a7284dd582377..7f0caea909e29 100644 --- a/rllib/connectors/env_to_module/prev_action_prev_reward.py +++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py @@ -62,9 +62,9 @@ def __call__( **kwargs, ) -> Any: # This is a data-in-data-out connector, so we expect `input_` to be a dict - # with: key=column name, e.g. "obs" and value=[data to be processed by RLModule]. - # We will just extract the most recent rewards and/or most recent actions from - # all episodes and store them inside the `input_` data dict. + # with: key=column name, e.g. "obs" and value=[data to be processed by + # RLModule]. We will just extract the most recent rewards and/or most recent + # actions from all episodes and store them inside the `input_` data dict. prev_a = [] prev_r = [] @@ -82,9 +82,10 @@ def __call__( episode.get_actions( # Extract n actions from `ts - n` to `ts` (excluding `ts`). indices=slice(ts - self.n_prev_actions, ts), - # Make sure negative indices are NOT interpreted as "counting - # from the end", but as absolute indices meaning they refer - # to timesteps before 0 (which is the lookback buffer). + # Make sure negative indices are NOT interpreted as + # "counting from the end", but as absolute indices meaning + # they refer to timesteps before 0 (which is the lookback + # buffer). neg_indices_left_of_zero=True, # In case we are at the very beginning of the episode, e.g. # ts==0, fill the left side with zero-actions. diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py index 766654815ce4c..88a1ad49c02d1 100644 --- a/rllib/connectors/learner/learner_connector_pipeline.py +++ b/rllib/connectors/learner/learner_connector_pipeline.py @@ -7,6 +7,7 @@ from ray.rllib.connectors.learner.default_learner_connector import ( DefaultLearnerConnector, ) +from ray.rllib.core.rl_module.rl_module import RLModule class LearnerConnectorPipeline(ConnectorPipelineV2): @@ -17,7 +18,7 @@ def __init__( input_observation_space: Optional[gym.Space], input_action_space: Optional[gym.Space], env: Optional[gym.Env] = None, - rl_module: Optional["RLModule"] = None, + rl_module: Optional[RLModule] = None, **kwargs, ): super().__init__( diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py index 130ed813f6f78..2abcecf439d57 100644 --- a/rllib/connectors/module_to_env/module_to_env_pipeline.py +++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py @@ -4,6 +4,7 @@ from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 +from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv @@ -15,7 +16,7 @@ def __init__( input_observation_space: Optional[gym.Space], input_action_space: Optional[gym.Space], env: Optional[gym.Env] = None, - rl_module: Optional["RLModule"] = None, + rl_module: Optional[RLModule] = None, **kwargs, ): super().__init__( diff --git a/rllib/connectors/tests/test_from_module_connectors.py b/rllib/connectors/tests/test_from_module_connectors.py deleted file mode 100644 index ac0844ff46f0f..0000000000000 --- a/rllib/connectors/tests/test_from_module_connectors.py +++ /dev/null @@ -1,106 +0,0 @@ -import unittest - -import gymnasium as gym -import numpy as np - -from 
ray.rllib.connectors.into_env.clip_actions import ClipActions -from ray.rllib.connectors.into_env.unsquash_actions import UnsquashActions -from ray.rllib.connectors.connector import ConnectorContextV2 -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.test_utils import check - - -class TestFromModuleConnectors(unittest.TestCase): - def test_connector_pipeline(self): - ctx = ConnectorContext() - connectors = [ConvertToNumpyConnector(ctx)] - pipeline = ActionConnectorPipeline(ctx, connectors) - name, params = pipeline.serialize() - restored = get_connector(name, ctx, params) - self.assertTrue(isinstance(restored, ActionConnectorPipeline)) - self.assertTrue(isinstance(restored.connectors[0], ConvertToNumpyConnector)) - # There should not be any timer yet - self.assertFalse(bool(pipeline.timers.values())) - pipeline(ActionConnectorDataType(0, 0, {}, ([1], [], None))) - # After a first input, there should be one timer - self.assertEquals(len(pipeline.timers.values()), 1) - - def test_clip_actions_connector(self): - ctx = ConnectorContextV2() - - connector = ClipActions( - action_space=gym.spaces.Box(low=0.0, high=6.0, shape=(1,)) - ) - - # name, params = connector.serialize() - # self.assertEqual(name, "ClipActions") - - # restored = get_connector(name, ctx, params) - # self.assertTrue(isinstance(restored, ClipActionsConnector)) - - for action in [8.8, 6.0, -0.2, 0.0, 5.9999, 3.2, 6.1]: - output = connector( - {SampleBatch.ACTIONS: np.array([action])}, - ctx, - ) - check(output[SampleBatch.ACTIONS], np.clip(action, 0.0, 6.0)) - - connector = ClipActions( - action_space=gym.spaces.Dict( - { - "a": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,)), - "b": gym.spaces.Discrete(3), - } - ) - ) - for action in [ - {"a": np.array([8.8, 8.9]), "b": 1}, - {"a": np.array([9.0, -1.0]), "b": 0}, - {"a": np.array([100.0, 200.0]), "b": 2}, - {"a": np.array([-1000, 0.0001]), "b": 2}, - {"a": np.array([0.4, 1.2]), "b": 0}, - {"a": np.array([1.0, -1.0]), "b": 1}, - ]: - output = connector({SampleBatch.ACTIONS: action}, ctx) - check( - output[SampleBatch.ACTIONS], - {"a": np.clip(action["a"], -1.0, 1.0), "b": action["b"]}, - ) - - def test_unsquash_actions_connector(self): - ctx = ConnectorContextV2() - - connector = UnsquashActions( - action_space=gym.spaces.Box(low=-2.0, high=6.0, shape=(2,)) - ) - - # name, params = connector.serialize() - # self.assertEqual(name, "UnsquashActions") - - # restored = get_connector(name, ctx, params) - # self.assertTrue(isinstance(restored, NormalizeActionsConnector)) - - for action in [ - [1.8, 1.8], - [1.0, -1.0], - [-1.0, 1.1], - [0.0, 0.0], - [10.0, 0.5], - [0.5, -0.5], - ]: - action = np.array(action) - output = connector( - {SampleBatch.ACTIONS: action}, - ctx, - ) - check( - output[SampleBatch.ACTIONS], - np.clip((action + 1.0) * 4.0 - 2.0, -2.0, 6.0), - ) - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py index 93609f22cbd4f..ab45623a5562a 100644 --- a/rllib/examples/connectors/connector_v2_frame_stacking.py +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -87,7 +87,7 @@ def _make_learner_connector(input_observation_space, input_action_space): EpisodicLifeEnv( # each life is one episode MaxAndSkipEnv( # frameskip=4 and take max over these 4 frames NoopResetEnv( # perform n noops after a reset - # partial(FrameStack, k=4)( # <- no env-based 
framestacking! + # partial(FrameStack, k=4)( # <- no env-based framestacking NormalizedImageEnv( partial(WarpFrame, dim=64)( # grayscale + resize partial( From 49585974b9061691eda5c1ca2eefe7c2e5ada582 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 18 Dec 2023 18:36:46 +0100 Subject: [PATCH 10/15] wip Signed-off-by: sven1977 --- rllib/connectors/connector_pipeline_v2.py | 12 +- rllib/connectors/connector_v2.py | 130 ++++++++++++++++------ 2 files changed, 101 insertions(+), 41 deletions(-) diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index 893dfcb57b49e..ce04db32a71b7 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -197,7 +197,7 @@ def get_state(self): return ConnectorPipelineV2.__name__, children @override(ConnectorV2) - def set_state(self, state: Dict[str, Any]): + def set_state(self, state: Dict[str, Any]) -> None: connectors = [] for state in params: try: @@ -265,10 +265,12 @@ def _fix_input_output_types(self): self.output_type = self.connectors[-1].output_type # TODO (sven): Create some examples for pipelines, in which the spaces # are changed several times by the individual pieces. - # self.input_observation_space = self.connectors[0].input_observation_space - # self.input_action_space = self.connectors[0].input_action_space - # self.observation_space = self.connectors[-1].observation_space - # self.action_space = self.connectors[-1].action_space + self.input_observation_space = self.connectors[0].input_observation_space + self.input_action_space = self.connectors[0].input_action_space + self._observation_space = self.connectors[-1].observation_space + self._action_space = self.connectors[-1].action_space else: self.input_type = None self.output_type = None + self._observation_space = None + self._action_space = None diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index 2ce20dd871b6f..c0796d8b94a0f 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -31,24 +31,28 @@ class ConnectorV2(abc.ABC): connectors used in Learner pipelines). From this input data, a ConnectorV2 then performs a transformation step. - There are 3 types of pipelines a ConnectorV2 can belong to: - 1) env-to-module: The connector transforms envrionment data before it gets to the - RLModule. This type of pipeline is used by an EnvRunner for transforming - env output data to RLModule readable data (for the next RLModule forward pass). - 2) module-to-env: The connector transforms RLModule outputs before they are sent - back to the environment (as actions). This type of pipeline is used by an EnvRunner - to transform RLModule output data to env readable actions (for the next - `env.step()` call). - 3) learner pipeline: The connector transforms data coming directly from an - environment sampling step or a replay buffer and will be sent into the RLModule's - `forward_train()` method afterwards to compute the loss inputs. This type of - pipeline is used by a Learner to transform raw training data (a batch or a list of - episodes) to RLModule readable training data (for the next RLModule - `forward_train()` call). + There are 3 types of pipelines any ConnectorV2 piece can belong to: + 1) EnvToModulePipeline: The connector transforms environment data before it gets to + the RLModule. This type of pipeline is used by an EnvRunner for transforming + env output data into RLModule readable data (for the next RLModule forward pass). 
+       For example, such a pipeline would include observation postprocessors and filters,
+       or any RNN preparation code related to time-sequences and zero-padding.
+    2) ModuleToEnvPipeline: This type of pipeline is used by an
+       EnvRunner to transform RLModule output data to env readable actions (for the next
+       `env.step()` call). For example, in case the RLModule only outputs action
+       distribution parameters (but not actual actions), the ModuleToEnvPipeline would
+       take care of sampling the actions to be sent back to the env from the
+       resulting distribution (made deterministic if exploration is off).
+    3) LearnerConnectorPipeline: This connector pipeline type transforms data coming
+       from an `EnvRunner.sample()` call or a replay buffer and will then be sent into the
+       RLModule's `forward_train()` method in order to compute loss function inputs.
+       This type of pipeline is used by a Learner worker to transform raw training data
+       (a batch or a list of episodes) to RLModule readable training data (for the next
+       RLModule `forward_train()` call).

    Some connectors might be stateful, for example for keeping track of observation
    filtering stats (mean and stddev values). Any Algorithm that uses connectors is
-    responsible for frequenly synchronizing the states of all connectors and connector
+    responsible for frequently synchronizing the states of all connectors and connector
    pipelines between the EnvRunners (owning the env-to-module and module-to-env
    pipelines) and the Learners (owning the Learner pipelines).
    """
@@ -62,49 +66,83 @@ class ConnectorV2(abc.ABC):

    @property
    def observation_space(self):
-        return self.input_observation_space
+        """Getter for our (output) observation space.
+
+        Logic: Use the user-provided space (if set via the `observation_space`
+        setter); otherwise, use the input space, assuming this connector piece
+        does not alter the space.
+        """
+        return self._observation_space or self.input_observation_space

    @observation_space.setter
    def observation_space(self, value):
-        self.observation_space = value
+        """Setter for our (output) observation space."""
+        self._observation_space = value

    @property
    def action_space(self):
-        return self.input_action_space
+        """Getter for our (output) action space.
+
+        Logic: Use the user-provided space (if set via the `action_space`
+        setter); otherwise, use the input space, assuming this connector piece
+        does not alter the space.
+        """
+        return self._action_space or self.input_action_space

    @action_space.setter
    def action_space(self, value):
-        self.action_space = value
+        """Setter for our (output) action space."""
+        self._action_space = value

    def __init__(
        self,
        *,
-        input_observation_space: gym.Space,
-        input_action_space: gym.Space,
+        input_observation_space: Optional[gym.Space] = None,
+        input_action_space: Optional[gym.Space] = None,
        env: Optional[gym.Env] = None,
        **kwargs,
    ):
        """Initializes a ConnectorV2 instance.

        Args:
-            input_observation_space: The (mandatory) input observation space. This
+            input_observation_space: An optional input observation space. This
                is the space coming from a previous connector piece in the
                (env-to-module or learner) pipeline or it is directly defined within
-                the used gym.Env.
+                the used gym.Env. If None, `env` must be provided.
-            input_action_space: The (mandatory) input action space. This
+            input_action_space: An optional input action space. This
                is the space coming from a previous connector piece in the
                (module-to-env) pipeline or it is directly defined within the used
-                gym.Env.
+                gym.Env.
If None, `env` must be provided. env: An optional env object that the connector might need to know about. Note that normally, env-to-module and module-to-env connectors get this information at construction time, but learner connectors won't (b/c Learner objects don't carry an environment object). **kwargs: Forward API-compatibility kwargs. """ + # Infer spaces from `env` argument if spaces are not explicitly provided. + if input_observation_space is None or input_action_space is None: + if env is None: + raise ValueError( + "`env` argument must be provided if `input_observation_space` or " + "`input_action_space` are None!" + ) + if input_observation_space is None: + input_observation_space = ( + env.single_observation_space if isinstance(env, gym.vector.Env) + else env.observation_space + ) + if input_action_space is None: + input_action_space = ( + env.single_action_space if isinstance(env, gym.vector.Env) + else env.action_space + ) self.input_observation_space = input_observation_space self.input_action_space = input_action_space self.env = env + self._observation_space = None + self._action_space = None + @abc.abstractmethod def __call__( self, @@ -140,25 +178,45 @@ def __call__( The transformed connector output abiding to `self.output_type`. """ - def __str__(self, indentation: int = 0): - return " " * indentation + self.__class__.__name__ - def get_state(self) -> Dict[str, Any]: - """Returns the current state of this ConnectorV2. - - Used for checkpointing (connectors may be stateful) as well as synchronization - between connectors that are run on the (distributed) EnvRunners vs those that - run on the (distributed) Learners. + """Returns the current state of this ConnectorV2 as a state dict. Returns: - A dict mapping str keys to state information. + A state dict mapping any string keys to their (state-defining) values. """ return {} def set_state(self, state: Dict[str, Any]) -> None: - """Sets the state of this connector to the provided one. + """Sets the state of this ConnectorV2 to the given value. Args: - state: The new state to set this connector to. + state: The state dict to define this ConnectorV2's new state. + """ + pass + + def reset_state(self) -> None: + """Resets the state of this ConnectorV2 to some initial value. + + Note that this may NOT be the exact state that this ConnectorV2 was originally + constructed with. """ pass + + @staticmethod + def merge_states(states: List[Dict[str, Any]]) -> Dict[str, Any]: + """Computes a resulting state given a list of other state dicts. + + Algorithms should use this method for synchronizing states between connectors + running on workers (of the same type, e.g. EnvRunner workers). + + Args: + states: The list of n other ConnectorV2 states to merge into a single + resulting state. + + Returns: + The resulting state dict. 
+ """ + return {} + + def __str__(self, indentation: int = 0): + return " " * indentation + self.__class__.__name__ From bdf803d47b7ff547753754d71786e603734e6db1 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 19 Dec 2023 21:20:21 +0100 Subject: [PATCH 11/15] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm.py | 3 +- rllib/algorithms/algorithm_config.py | 77 ++++++---- .../connectors/{utils => common}/__init__.py | 0 rllib/connectors/common/frame_stacking.py | 136 ++++++++++++++++++ rllib/connectors/connector_v2.py | 45 ++---- rllib/connectors/env_to_module/__init__.py | 4 + .../env_to_module/default_env_to_module.py | 2 +- .../env_to_module/env_to_module_pipeline.py | 39 +---- .../env_to_module/frame_stacking.py | 112 +-------------- .../env_to_module/prev_action_prev_reward.py | 2 - rllib/connectors/learner/__init__.py | 11 ++ rllib/connectors/learner/frame_stacking.py | 2 +- .../learner/learner_connector_pipeline.py | 42 +----- rllib/connectors/module_to_env/__init__.py | 9 ++ .../module_to_env/default_module_to_env.py | 68 +++++++-- .../module_to_env/module_to_env_pipeline.py | 42 +----- .../connectors/connector_v2_frame_stacking.py | 21 ++- 17 files changed, 306 insertions(+), 309 deletions(-) rename rllib/connectors/{utils => common}/__init__.py (100%) create mode 100644 rllib/connectors/common/frame_stacking.py diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 170675e0e3956..2d68b89e53ba6 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -751,7 +751,8 @@ def setup(self, config: AlgorithmConfig) -> None: ) # Only when using RolloutWorkers: Update also the worker set's - # `should_module_be_updated_fn` (analogous to is_policy_to_train). + # `is_policy_to_train` (analogous to LearnerGroup's + # `should_module_be_updated_fn`). # Note that with the new EnvRunner API in combination with the new stack, # this information only needs to be kept in the LearnerGroup and not on the # EnvRunners anymore. diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 88764c82204df..8b2620241198f 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1158,90 +1158,119 @@ class directly. Note that this arg can also be specified via ) def build_env_to_module_connector(self, env): - custom_connectors = [] + from ray.rllib.connectors.env_to_module import ( + EnvToModulePipeline, + DefaultEnvToModule, + ) + custom_connectors = [] # Create an env-to-module connector pipeline (including RLlib's default # env->module connector piece) and return it. 
if self._env_to_module_connector is not None: val_ = self._env_to_module_connector(env) from ray.rllib.connectors.connector_v2 import ConnectorV2 - from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 if isinstance(val_, ConnectorV2) and not isinstance( - val_, ConnectorPipelineV2 + val_, EnvToModulePipeline ): custom_connectors = [val_] + elif isinstance(val_, (list, tuple)): + custom_connectors = list(val_) else: return val_ - from ray.rllib.connectors.env_to_module.env_to_module_pipeline import ( - EnvToModulePipeline, - ) - - return EnvToModulePipeline( + pipeline = EnvToModulePipeline( connectors=custom_connectors, input_observation_space=env.single_observation_space, input_action_space=env.single_action_space, env=env, ) + pipeline.append( + DefaultEnvToModule( + input_observation_space=pipeline.observation_space, + input_action_space=pipeline.action_space, + env=env, + ) + ) + return pipeline def build_module_to_env_connector(self, env): - custom_connectors = [] + from ray.rllib.connectors.module_to_env import ( + DefaultModuleToEnv, + ModuleToEnvPipeline, + ) + + custom_connectors = [] # Create a module-to-env connector pipeline (including RLlib's default # module->env connector piece) and return it. if self._module_to_env_connector is not None: val_ = self._module_to_env_connector(env) from ray.rllib.connectors.connector_v2 import ConnectorV2 - from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 if isinstance(val_, ConnectorV2) and not isinstance( - val_, ConnectorPipelineV2 + val_, ModuleToEnvPipeline ): custom_connectors = [val_] + elif isinstance(val_, (list, tuple)): + custom_connectors = list(val_) else: return val_ - from ray.rllib.connectors.module_to_env.module_to_env_pipeline import ( - ModuleToEnvPipeline, - ) - - return ModuleToEnvPipeline( + pipeline = ModuleToEnvPipeline( connectors=custom_connectors, input_observation_space=env.single_observation_space, input_action_space=env.single_action_space, env=env, ) + pipeline.append( + DefaultModuleToEnv( + input_observation_space=pipeline.observation_space, + input_action_space=pipeline.action_space, + env=env, + normalize_actions=self.normalize_actions, + clip_actions=self.clip_actions, + ) + ) + return pipeline def build_learner_connector(self, input_observation_space, input_action_space): - custom_connectors = [] + from ray.rllib.connectors.learner import ( + DefaultLearnerConnector, + LearnerConnectorPipeline, + ) + custom_connectors = [] # Create a learner connector pipeline (including RLlib's default # learner connector piece) and return it. 
if self._learner_connector is not None: val_ = self._learner_connector(input_observation_space, input_action_space) from ray.rllib.connectors.connector_v2 import ConnectorV2 - from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 if isinstance(val_, ConnectorV2) and not isinstance( - val_, ConnectorPipelineV2 + val_, LearnerConnectorPipeline ): custom_connectors = [val_] + elif isinstance(val_, (list, tuple)): + custom_connectors = list(val_) else: return val_ - from ray.rllib.connectors.learner.learner_connector_pipeline import ( - LearnerConnectorPipeline, - ) - - return LearnerConnectorPipeline( + pipeline = LearnerConnectorPipeline( connectors=custom_connectors, input_observation_space=input_observation_space, input_action_space=input_action_space, ) + pipeline.append( + DefaultLearnerConnector( + input_observation_space=pipeline.observation_space, + input_action_space=pipeline.action_space, + ) + ) + return pipeline def build_learner_group( self, diff --git a/rllib/connectors/utils/__init__.py b/rllib/connectors/common/__init__.py similarity index 100% rename from rllib/connectors/utils/__init__.py rename to rllib/connectors/common/__init__.py diff --git a/rllib/connectors/common/frame_stacking.py b/rllib/connectors/common/frame_stacking.py new file mode 100644 index 0000000000000..2f587ee083f1e --- /dev/null +++ b/rllib/connectors/common/frame_stacking.py @@ -0,0 +1,136 @@ +import numpy as np +from typing import Any, List, Optional + +import gymnasium as gym +import tree # pip install dm_tree + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.spaces.space_utils import batch +from ray.rllib.utils.typing import EpisodeType + + +class _FrameStackingConnector(ConnectorV2): + """A connector piece that stacks the previous n observations into one.""" + + def __init__( + self, + *, + # Base class constructor args. + input_observation_space: gym.Space, + input_action_space: gym.Space, + # Specific framestacking args. + num_frames: int = 1, + as_learner_connector: bool = False, + **kwargs, + ): + """Initializes a _FrameStackingConnector instance. + + Args: + num_frames: The number of observation frames to stack up (into a single + observation) for the RLModule's forward pass. + as_preprocessor: Whether this connector should simply postprocess the + received observations from the env and store these directly in the + episode object. In this mode, the connector can only be used in + an `EnvToModulePipeline` and it will act as a classic + RLlib framestacking postprocessor. + as_learner_connector: Whether this connector is part of a Learner connector + pipeline, as opposed to an env-to-module pipeline. + """ + super().__init__( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + **kwargs, + ) + + self.num_frames = num_frames + self.as_learner_connector = as_learner_connector + + # Some assumptions: Space is box AND last dim (the stacking one) is 1. + assert isinstance(self.observation_space, gym.spaces.Box) + assert self.observation_space.shape[-1] == 1 + + # Change our observation space according to the given stacking settings. 
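+        # E.g., stacking with num_frames=4 turns a Box(0, 255, (84, 84, 1))
+        # into a Box(0, 255, (84, 84, 4)). (Illustrative Atari-style dims; the
+        # actual shape comes from the input space.)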
+ self.observation_space = gym.spaces.Box( + low=np.repeat(self.observation_space.low, repeats=self.num_frames, axis=-1), + high=np.repeat( + self.observation_space.high, repeats=self.num_frames, axis=-1 + ), + shape=list(self.observation_space.shape)[:-1] + [self.num_frames], + dtype=self.observation_space.dtype, + ) + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + input_: Optional[Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + persistent_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # This is a data-in-data-out connector, so we expect `input_` to be a dict + # with: key=column name, e.g. "obs" and value=[data to be processed by + # RLModule]. We will add to `input_` the last n observations. + observations = [] + + # Learner connector pipeline. Episodes have been finalized/numpy'ized. + if self.as_learner_connector: + for episode in episodes: + + def _map_fn(s): + # Squeeze out last dim. + s = np.squeeze(s, axis=-1) + # Calculate new shape and strides + new_shape = (len(episode), self.num_frames) + s.shape[1:] + new_strides = (s.strides[0],) + s.strides + # Create a strided view of the array. + return np.lib.stride_tricks.as_strided( + s, shape=new_shape, strides=new_strides + ) + + # Get all observations from the episode in one np array (except for + # the very last one, which is the final observation not needed for + # learning). + observations.append( + tree.map_structure( + _map_fn, + episode.get_observations( + indices=slice(-self.num_frames + 1, len(episode)), + neg_indices_left_of_zero=True, + fill=0.0, + ), + ) + ) + + # Move stack-dimension to the end and concatenate along batch axis. + input_[SampleBatch.OBS] = tree.map_structure( + lambda *s: np.transpose(np.concatenate(s, axis=0), axes=[0, 2, 3, 1]), + *observations, + ) + + # Env-to-module pipeline. Episodes still operate on lists. + else: + for episode in episodes: + assert not episode.is_finalized + # Get the list of observations to stack. + obs_stack = episode.get_observations( + indices=slice(-self.num_frames, None), + fill=0.0, + ) + # Observation components are (w, h, 1) + # -> stack to (w, h, [num_frames], 1), then squeeze out last dim to get + # (w, h, [num_frames]). + stacked_obs = tree.map_structure( + lambda *s: np.squeeze(np.stack(s, axis=2), axis=-1), + *obs_stack, + ) + observations.append(stacked_obs) + + input_[SampleBatch.OBS] = batch(observations) + + return input_ diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index c0796d8b94a0f..fae1223a83609 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -97,48 +97,25 @@ def action_space(self, value): def __init__( self, *, - input_observation_space: Optional[gym.Space] = None, - input_action_space: Optional[gym.Space] = None, - env: Optional[gym.Env] = None, + input_observation_space: gym.Space, + input_action_space: gym.Space, **kwargs, ): """Initializes a ConnectorV2 instance. Args: - input_observation_space: An optional input observation space. This - is the space coming from a previous connector piece in the + input_observation_space: The input observation space for this connector + piece. This is the space coming from a previous connector piece in the (env-to-module or learner) pipeline or it is directly defined within - the used gym.Env. If None, `env` must be provided. - input_action_space: An optional input action space. This + the used gym.Env. + input_action_space: The input action space for this connector piece. 
This is the space coming from a previous connector piece in the (module-to-env) pipeline or it is directly defined within the used - gym.Env. If None, `env` must be provided. - env: An optional env object that the connector might need to know about. - Note that normally, env-to-module and module-to-env connectors get this - information at construction time, but learner connectors won't (b/c - Learner objects don't carry an environment object). + gym.Env. **kwargs: Forward API-compatibility kwargs. """ - # Infer spaces from `env` argument if spaces are not explicitly provided. - if input_observation_space is None or input_action_space is None: - if env is None: - raise ValueError( - "`env` argument must be provided if `input_observation_space` or " - "`input_action_space` are None!" - ) - if input_observation_space is None: - input_observation_space = ( - env.single_observation_space if isinstance(env, gym.vector.Env) - else env.observation_space - ) - if input_action_space is None: - input_action_space = ( - env.single_action_space if isinstance(env, gym.vector.Env) - else env.action_space - ) self.input_observation_space = input_observation_space self.input_action_space = input_action_space - self.env = env self._observation_space = None self._action_space = None @@ -157,16 +134,16 @@ def __call__( """Method for transforming input data into output data. Args: + rl_module: An optional RLModule object that the connector might need to know + about. Note that normally, only module-to-env connectors get this + information at construction time, but env-to-module and learner + connectors won't (b/c they get constructed before the RLModule). input_: The input data abiding to `self.input_type` to be transformed by this connector. Transformations might either be done in-place or a new structure may be returned that matches `self.output_type`. episodes: The list of SingleAgentEpisode or MultiAgentEpisode objects, each corresponding to one slot in the vector env. Note that episodes should always be considered read-only and not be altered. - rl_module: An optional RLModule object that the connector might need to know - about. Note that normally, only module-to-env connectors get this - information at construction time, but env-to-module and learner - connectors won't (b/c they get constructed before the RLModule). explore: Whether `explore` is currently on. Per convention, if True, the RLModule's `forward_exploration` method should be called, if False, the EnvRunner should call `forward_inference` instead. diff --git a/rllib/connectors/env_to_module/__init__.py b/rllib/connectors/env_to_module/__init__.py index b86c2f9cb002f..c156044aa9213 100644 --- a/rllib/connectors/env_to_module/__init__.py +++ b/rllib/connectors/env_to_module/__init__.py @@ -1,5 +1,9 @@ from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule +from ray.rllib.connectors.env_to_module.env_to_module_pipeline import ( + EnvToModulePipeline, +) __all__ = [ "DefaultEnvToModule", + "EnvToModulePipeline", ] diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py index 8239b5f2c2ebd..f4be1c57c1412 100644 --- a/rllib/connectors/env_to_module/default_env_to_module.py +++ b/rllib/connectors/env_to_module/default_env_to_module.py @@ -46,7 +46,7 @@ def __call__( # Collect all most-recent observations from given episodes. 
observations = [] for episode in episodes: - observations.append(episode.get_observation(indices=-1)) + observations.append(episode.get_observations(indices=-1)) # Batch all collected observations together. input_[SampleBatch.OBS] = batch(observations) diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py index b0f1027799a9f..a3694492f89ba 100644 --- a/rllib/connectors/env_to_module/env_to_module_pipeline.py +++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py @@ -1,49 +1,14 @@ from typing import Any, List, Optional -import gymnasium as gym - -from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.connectors.env_to_module.default_env_to_module import DefaultEnvToModule from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI +@PublicAPI(stability="alpha") class EnvToModulePipeline(ConnectorPipelineV2): - def __init__( - self, - *, - connectors: Optional[List[ConnectorV2]] = None, - input_observation_space: Optional[gym.Space], - input_action_space: Optional[gym.Space], - env: Optional[gym.Env] = None, - rl_module: Optional["RLModule"] = None, - **kwargs, - ): - super().__init__( - connectors=connectors, - input_observation_space=input_observation_space, - input_action_space=input_action_space, - env=env, - rl_module=rl_module, - **kwargs, - ) - # Add the default final connector piece for env-to-module pipelines: - # Extracting last obs from episodes and add them to input, iff this has not - # happened in any connector piece in this pipeline before. - if ( - len(self.connectors) == 0 - or type(self.connectors[-1]) is not DefaultEnvToModule - ): - self.append( - DefaultEnvToModule( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - env=env, - ) - ) - @override(ConnectorPipelineV2) def __call__( self, diff --git a/rllib/connectors/env_to_module/frame_stacking.py b/rllib/connectors/env_to_module/frame_stacking.py index c6ac262da0ae7..b05385b6c10e2 100644 --- a/rllib/connectors/env_to_module/frame_stacking.py +++ b/rllib/connectors/env_to_module/frame_stacking.py @@ -1,116 +1,6 @@ from functools import partial -import numpy as np -from typing import Any, List, Optional -import gymnasium as gym - -from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.annotations import override -from ray.rllib.utils.spaces.space_utils import batch -from ray.rllib.utils.typing import EpisodeType - - -class _FrameStackingConnector(ConnectorV2): - """A connector piece that stacks the previous n observations into one.""" - - def __init__( - self, - *, - # Base class constructor args. - input_observation_space: gym.Space, - input_action_space: gym.Space, - env: Optional[gym.Env] = None, - # Specific framestacking args. - num_frames: int = 1, - as_learner_connector: bool = False, - **kwargs, - ): - """Initializes a _FrameStackingConnector instance. - - Args: - num_frames: The number of observation frames to stack up (into a single - observation) for the RLModule's forward pass. - as_learner_connector: Whether this connector is part of a Learner connector - pipeline, as opposed to a env-to-module pipeline. 
- """ - super().__init__( - input_observation_space=input_observation_space, - input_action_space=input_action_space, - env=env, - **kwargs, - ) - - self.num_frames = num_frames - self.as_learner_connector = as_learner_connector - - # Some assumptions: Space is box AND last dim (the stacking one) is 1. - assert isinstance(self.observation_space, gym.spaces.Box) - assert self.observation_space.shape[-1] == 1 - - # Change our observation space according to the given stacking settings. - self.observation_space = gym.spaces.Box( - low=np.repeat(self.observation_space.low, repeats=self.num_frames, axis=-1), - high=np.repeat( - self.observation_space.high, repeats=self.num_frames, axis=-1 - ), - shape=list(self.observation_space.shape)[:-1] + [self.num_frames], - dtype=self.observation_space.dtype, - ) - - @override(ConnectorV2) - def __call__( - self, - *, - rl_module: RLModule, - input_: Optional[Any], - episodes: List[EpisodeType], - explore: Optional[bool] = None, - persistent_data: Optional[dict] = None, - **kwargs, - ) -> Any: - # This is a data-in-data-out connector, so we expect `input_` to be a dict - # with: key=column name, e.g. "obs" and value=[data to be processed by - # RLModule]. We will add to `input_` the last n observations. - - obs = [] - for episode in episodes: - - # Learner connector pipeline. Episodes have been finalized/numpy'ized. - if self.as_learner_connector: - # Loop through each timestep in the episode and add the previous n - # observations (based on that timestep) to the batch. - for ts in range(len(episode)): - obs.append( - episode.get_observations( - # Extract n observations from `ts` to `ts - n` - # (excluding `ts - n`). - indices=slice(ts - self.num_frames + 1, ts + 1), - # Make sure negative indices are NOT interpreted as - # "counting from the end", but as absolute indices meaning - # they refer to timesteps before 0 (which is the lookback - # buffer). - neg_indices_left_of_zero=True, - # In case we are at the very beginning of the episode, e.g. - # ts==0, fill the left side with zero-observations. - fill=0.0, - ) - ) - # Env-to-module pipeline. Episodes still operate on lists. - else: - assert not episode.is_finalized - obs.append( - batch( - episode.get_observations( - indices=slice(-self.num_frames + 1, None), - fill=0.0, - ) - ) - ) - - input_[SampleBatch.OBS] = batch(obs) - return input_ +from ray.rllib.connectors.common.frame_stacking import _FrameStackingConnector FrameStackingEnvToModule = partial(_FrameStackingConnector, as_learner_connector=False) diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py index 7f0caea909e29..0f66d2c8ade50 100644 --- a/rllib/connectors/env_to_module/prev_action_prev_reward.py +++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py @@ -21,7 +21,6 @@ def __init__( # Base class constructor args. input_observation_space: gym.Space, input_action_space: gym.Space, - env: Optional[gym.Env] = None, # Specific prev. r/a args. 
n_prev_actions: int = 1, n_prev_rewards: int = 1, @@ -42,7 +41,6 @@ def __init__( super().__init__( input_observation_space=input_observation_space, input_action_space=input_action_space, - env=env, **kwargs, ) diff --git a/rllib/connectors/learner/__init__.py b/rllib/connectors/learner/__init__.py index e69de29bb2d1d..dda5851866ebc 100644 --- a/rllib/connectors/learner/__init__.py +++ b/rllib/connectors/learner/__init__.py @@ -0,0 +1,11 @@ +from ray.rllib.connectors.learner.default_learner_connector import ( + DefaultLearnerConnector, +) +from ray.rllib.connectors.learner.learner_connector_pipeline import ( + LearnerConnectorPipeline, +) + +__all__ = [ + "DefaultLearnerConnector", + "LearnerConnectorPipeline", +] diff --git a/rllib/connectors/learner/frame_stacking.py b/rllib/connectors/learner/frame_stacking.py index f53a62bd6a726..9b4a9f53ad613 100644 --- a/rllib/connectors/learner/frame_stacking.py +++ b/rllib/connectors/learner/frame_stacking.py @@ -1,6 +1,6 @@ from functools import partial -from ray.rllib.connectors.env_to_module.frame_stacking import _FrameStackingConnector +from ray.rllib.connectors.common.frame_stacking import _FrameStackingConnector FrameStackingLearner = partial(_FrameStackingConnector, as_learner_connector=True) diff --git a/rllib/connectors/learner/learner_connector_pipeline.py b/rllib/connectors/learner/learner_connector_pipeline.py index 88a1ad49c02d1..225b5a4436e06 100644 --- a/rllib/connectors/learner/learner_connector_pipeline.py +++ b/rllib/connectors/learner/learner_connector_pipeline.py @@ -1,45 +1,5 @@ -from typing import List, Optional - -import gymnasium as gym - -from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 -from ray.rllib.connectors.learner.default_learner_connector import ( - DefaultLearnerConnector, -) -from ray.rllib.core.rl_module.rl_module import RLModule class LearnerConnectorPipeline(ConnectorPipelineV2): - def __init__( - self, - *, - connectors: Optional[List[ConnectorV2]] = None, - input_observation_space: Optional[gym.Space], - input_action_space: Optional[gym.Space], - env: Optional[gym.Env] = None, - rl_module: Optional[RLModule] = None, - **kwargs, - ): - super().__init__( - connectors=connectors, - input_observation_space=input_observation_space, - input_action_space=input_action_space, - env=env, - rl_module=rl_module, - **kwargs, - ) - - # Add the default final connector piece for learner pipelines: - # Making sure that we have - at the minimum - observations and that the data - # is time-ranked (if we have a stateful model) and properly zero-padded. 
- if ( - len(self.connectors) == 0 - or type(self.connectors[-1]) is not DefaultLearnerConnector - ): - self.append( - DefaultLearnerConnector( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - ) - ) + pass diff --git a/rllib/connectors/module_to_env/__init__.py b/rllib/connectors/module_to_env/__init__.py index e69de29bb2d1d..b7ada36aebdbf 100644 --- a/rllib/connectors/module_to_env/__init__.py +++ b/rllib/connectors/module_to_env/__init__.py @@ -0,0 +1,9 @@ +from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv +from ray.rllib.connectors.module_to_env.module_to_env_pipeline import ( + ModuleToEnvPipeline, +) + +__all__ = [ + "DefaultModuleToEnv", + "ModuleToEnvPipeline", +] diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py index 395225f5d6a64..f27aba4999434 100644 --- a/rllib/connectors/module_to_env/default_module_to_env.py +++ b/rllib/connectors/module_to_env/default_module_to_env.py @@ -8,6 +8,12 @@ from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.spaces.space_utils import ( + clip_action, + get_base_struct_from_space, + unsquash_action, +) from ray.rllib.utils.typing import EpisodeType from ray.util.annotations import PublicAPI @@ -37,6 +43,41 @@ class DefaultModuleToEnv(ConnectorV2): in the input). """ + def __init__( + self, + *, + normalize_actions: bool, + clip_actions: bool, + **kwargs, + ): + """Initializes a DefaultModuleToEnv (connector piece) instance. + + Args: + normalize_actions: If True, actions coming from the RLModule's distribution + (or are directly computed by the RLModule w/o sampling) will + be assumed 0.0 centered with a small stddev (only affecting Box + components) and thus be unsquashed (and clipped, just in case) to the + bounds of the env's action space. For example, if the action space of + the environment is `Box(-2.0, -0.5, (1,))`, the model outputs + mean and stddev as 0.1 and exp(0.2), and we sample an action of 0.9 + from the resulting distribution, then this 0.9 will be unsquashed into + the [-2.0 -0.5] interval. If - after unsquashing - the action still + breaches the action space, it will simply be clipped. + clip_actions: If True, actions coming from the RLModule's distribution + (or are directly computed by the RLModule w/o sampling) will be clipped + such that they fit into the env's action space's bounds. + For example, if the action space of the environment is + `Box(-0.5, 0.5, (1,))`, the model outputs + mean and stddev as 0.1 and exp(0.2), and we sample an action of 0.9 + from the resulting distribution, then this 0.9 will be clipped to 0.5 + to fit into the [-0.5 0.5] interval. + """ + super().__init__(**kwargs) + + self._action_space_struct = get_base_struct_from_space(self.action_space) + self.normalize_actions = normalize_actions + self.clip_actions = clip_actions + @override(ConnectorV2) def __call__( self, @@ -90,20 +131,27 @@ def __call__( f"the '{SampleBatch.ACTION_DIST_INPUTS}' key in it (or both)!" ) actions = action_dist.sample() - input_[SampleBatch.ACTIONS] = actions # For convenience and if possible, compute action logp from distribution # and add to output. 
if action_dist is not None and SampleBatch.ACTION_LOGP not in input_: - input_[SampleBatch.ACTION_LOGP] = action_dist.logp(actions) + input_[SampleBatch.ACTION_LOGP] = convert_to_numpy( + action_dist.logp(actions) + ) - return input_ + actions = convert_to_numpy(actions) + + # Process actions according to Env's action space bounds, if necessary. + # Normalize actions. + if self.normalize_actions: + actions = unsquash_action(actions, self._action_space_struct) + # Clip actions. + elif self.clip_actions: + actions = clip_action(actions, self._action_space_struct) - # @override(Connector) - # def serialize(self): - # return ClipActions.__name__, None + input_[SampleBatch.ACTIONS] = actions - # @staticmethod - # TODO - # def from_state(ctx: ConnectorContext, params: Any): - # return ClipActions(ctx) + # Convert everything into numpy. + input_ = convert_to_numpy(input_) + + return input_ diff --git a/rllib/connectors/module_to_env/module_to_env_pipeline.py b/rllib/connectors/module_to_env/module_to_env_pipeline.py index 2abcecf439d57..e0a11fdac4a63 100644 --- a/rllib/connectors/module_to_env/module_to_env_pipeline.py +++ b/rllib/connectors/module_to_env/module_to_env_pipeline.py @@ -1,45 +1,5 @@ -from typing import List, Optional - -import gymnasium as gym - -from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.connectors.connector_pipeline_v2 import ConnectorPipelineV2 -from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.connectors.module_to_env.default_module_to_env import DefaultModuleToEnv class ModuleToEnvPipeline(ConnectorPipelineV2): - def __init__( - self, - *, - connectors: Optional[List[ConnectorV2]] = None, - input_observation_space: Optional[gym.Space], - input_action_space: Optional[gym.Space], - env: Optional[gym.Env] = None, - rl_module: Optional[RLModule] = None, - **kwargs, - ): - super().__init__( - connectors=connectors, - input_observation_space=input_observation_space, - input_action_space=input_action_space, - env=env, - rl_module=rl_module, - **kwargs, - ) - - # Add the default final connector piece for env-to-module pipelines: - # Sampling actions from action_dist_inputs and add them to input, iff this has - # not happened in any connector piece in this pipeline before. - if ( - len(self.connectors) == 0 - or type(self.connectors[-1]) is not DefaultModuleToEnv - ): - self.append( - DefaultModuleToEnv( - input_observation_space=self.observation_space, - input_action_space=self.action_space, - env=env, - rl_module=rl_module, - ) - ) + pass diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py index ab45623a5562a..9f5f2fb395fb0 100644 --- a/rllib/examples/connectors/connector_v2_frame_stacking.py +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -1,6 +1,5 @@ import argparse from functools import partial -import os import gymnasium as gym @@ -27,6 +26,12 @@ default="torch", help="The DL framework specifier.", ) +parser.add_argument( + "--num-gpus", + type=int, + default=0, + help="The number of GPUs (Learner workers) to use.", +) parser.add_argument( "--num-frames", type=int, @@ -43,10 +48,10 @@ "--stop-iters", type=int, default=2000, help="Number of iterations to train." ) parser.add_argument( - "--stop-timesteps", type=int, default=1000000, help="Number of timesteps to train." + "--stop-timesteps", type=int, default=2000000, help="Number of timesteps to train." 
) parser.add_argument( - "--stop-reward", type=float, default=400.0, help="Reward at which we stop training." + "--stop-reward", type=float, default=20.0, help="Reward at which we stop training." ) @@ -66,7 +71,6 @@ def _make_env_to_module_connector(env): return FrameStackingEnvToModule( input_observation_space=env.single_observation_space, input_action_space=env.single_action_space, - env=env, num_frames=args.num_frames, ) @@ -127,7 +131,11 @@ def _make_learner_connector(input_observation_space, input_action_space): env_runner_cls=SingleAgentEnvRunner, env_to_module_connector=_make_env_to_module_connector, ) - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + .resources( + num_learner_workers=args.num_gpus, + num_gpus_per_learner_worker=1 if args.num_gpus else 0, + num_cpus_for_local_worker=1, + ) .training( # Use our frame stacking learner connector. learner_connector=_make_learner_connector, @@ -137,7 +145,8 @@ def _make_learner_connector(input_observation_space, input_action_space): vf_clip_param=10.0, entropy_coeff=0.01, num_sgd_iter=10, - lr=0.00025, # needs to be adjusted: `lr=0.00025*num_learner_workers` + # Linearly adjust learning rate based on number of GPUs. + lr=0.00015 * (args.num_gpus or 1), grad_clip=100.0, grad_clip_by="global_norm", model={ From 2649e70043c53dba847b46977a848fa075fdd097 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 21 Dec 2023 09:00:41 +0100 Subject: [PATCH 12/15] wip Signed-off-by: sven1977 --- rllib/connectors/common/frame_stacking.py | 12 ++--- rllib/connectors/connector_pipeline_v2.py | 6 +-- rllib/connectors/connector_v2.py | 4 +- .../env_to_module/default_env_to_module.py | 16 +++---- .../env_to_module/env_to_module_pipeline.py | 4 +- .../env_to_module/prev_action_prev_reward.py | 12 ++--- .../learner/default_learner_connector.py | 45 +++++++++---------- .../module_to_env/default_module_to_env.py | 32 +++++++------ 8 files changed, 63 insertions(+), 68 deletions(-) diff --git a/rllib/connectors/common/frame_stacking.py b/rllib/connectors/common/frame_stacking.py index 2f587ee083f1e..b139ee21593fa 100644 --- a/rllib/connectors/common/frame_stacking.py +++ b/rllib/connectors/common/frame_stacking.py @@ -67,15 +67,15 @@ def __call__( self, *, rl_module: RLModule, - input_: Optional[Any], + data: Optional[Any], episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, **kwargs, ) -> Any: - # This is a data-in-data-out connector, so we expect `input_` to be a dict + # This is a data-in-data-out connector, so we expect `data` to be a dict # with: key=column name, e.g. "obs" and value=[data to be processed by - # RLModule]. We will add to `input_` the last n observations. + # RLModule]. We will add to `data` the last n observations. observations = [] # Learner connector pipeline. Episodes have been finalized/numpy'ized. @@ -108,7 +108,7 @@ def _map_fn(s): ) # Move stack-dimension to the end and concatenate along batch axis. 
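            # Per-episode arrays are (T_i, num_frames, w, h); concatenating
            # along axis 0 and transposing with axes=[0, 2, 3, 1] yields a
            # (B, w, h, num_frames) batch.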
- input_[SampleBatch.OBS] = tree.map_structure( + data[SampleBatch.OBS] = tree.map_structure( lambda *s: np.transpose(np.concatenate(s, axis=0), axes=[0, 2, 3, 1]), *observations, ) @@ -131,6 +131,6 @@ def _map_fn(s): ) observations.append(stacked_obs) - input_[SampleBatch.OBS] = batch(observations) + data[SampleBatch.OBS] = batch(observations) - return input_ + return data diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index ce04db32a71b7..75f993efba050 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -33,7 +33,7 @@ def __init__( def __call__( self, rl_module: RLModule, - input_: Any, + data: Any, episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, @@ -46,13 +46,13 @@ def __call__( """ # Loop through connector pieces and call each one with the output of the # previous one. Thereby, time each connector piece's call. - ret = input_ + ret = data for connector in self.connectors: timer = self.timers[str(connector)] with timer: ret = connector( rl_module=rl_module, - input_=ret, + data=ret, episodes=episodes, explore=explore, persistent_data=persistent_data, diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index fae1223a83609..4e6c08d958f59 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -125,7 +125,7 @@ def __call__( self, *, rl_module: RLModule, - input_: Any, + data: Any, episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, @@ -138,7 +138,7 @@ def __call__( about. Note that normally, only module-to-env connectors get this information at construction time, but env-to-module and learner connectors won't (b/c they get constructed before the RLModule). - input_: The input data abiding to `self.input_type` to be transformed by + data: The input data abiding to `self.input_type` to be transformed by this connector. Transformations might either be done in-place or a new structure may be returned that matches `self.output_type`. episodes: The list of SingleAgentEpisode or MultiAgentEpisode objects, diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py index f4be1c57c1412..1052ffab4d41b 100644 --- a/rllib/connectors/env_to_module/default_env_to_module.py +++ b/rllib/connectors/env_to_module/default_env_to_module.py @@ -34,7 +34,7 @@ def __call__( self, *, rl_module: RLModule, - input_: Optional[Any] = None, + data: Optional[Any] = None, episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, @@ -42,17 +42,17 @@ def __call__( ) -> Any: # If observations cannot be found in `input`, add the most recent ones (from all # episodes). - if SampleBatch.OBS not in input_: + if SampleBatch.OBS not in data: # Collect all most-recent observations from given episodes. observations = [] for episode in episodes: observations.append(episode.get_observations(indices=-1)) # Batch all collected observations together. - input_[SampleBatch.OBS] = batch(observations) + data[SampleBatch.OBS] = batch(observations) # If our module is stateful: - # - Add the most recent STATE_OUTs to `input_`. - # - Make all data in `input_` have a time rank (T=1). + # - Add the most recent STATE_OUTs to `data`. + # - Make all data in `data` have a time rank (T=1). 
if rl_module.is_stateful(): # Collect all most recently computed STATE_OUT (or use initial states from # RLModule if at beginning of episode). @@ -71,10 +71,10 @@ def __call__( states.append(state) # Make all other inputs have an additional T=1 axis. - input_ = tree.map_structure(lambda s: np.expand_dims(s, axis=1), input_) + data = tree.map_structure(lambda s: np.expand_dims(s, axis=1), data) # Batch states (from list of individual vector sub-env states). # Note that state ins should NOT have the extra time dimension. - input_[STATE_IN] = batch(states) + data[STATE_IN] = batch(states) - return input_ + return data diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py index a3694492f89ba..b2a39b8ecfc25 100644 --- a/rllib/connectors/env_to_module/env_to_module_pipeline.py +++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py @@ -14,7 +14,7 @@ def __call__( self, *, rl_module: RLModule, - input_: Optional[Any] = None, + data: Optional[Any] = None, episodes: List[EpisodeType], explore: bool, persistent_data: Optional[dict] = None, @@ -24,7 +24,7 @@ def __call__( # Might just be empty and to be populated from `episodes`. return super().__call__( rl_module=rl_module, - input_=input_ if input_ is not None else {}, + data=data if data is not None else {}, episodes=episodes, explore=explore, persistent_data=persistent_data, diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py index 0f66d2c8ade50..4c890cd3f5133 100644 --- a/rllib/connectors/env_to_module/prev_action_prev_reward.py +++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py @@ -53,16 +53,16 @@ def __call__( self, *, rl_module: RLModule, - input_: Optional[Any], + data: Optional[Any], episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, **kwargs, ) -> Any: - # This is a data-in-data-out connector, so we expect `input_` to be a dict + # This is a data-in-data-out connector, so we expect `data` to be a dict # with: key=column name, e.g. "obs" and value=[data to be processed by # RLModule]. We will just extract the most recent rewards and/or most recent - # actions from all episodes and store them inside the `input_` data dict. + # actions from all episodes and store them inside the `data` data dict. prev_a = [] prev_r = [] @@ -122,9 +122,9 @@ def __call__( ) ) - input_[SampleBatch.PREV_ACTIONS] = batch(prev_a) - input_[SampleBatch.PREV_REWARDS] = np.array(prev_r) - return input_ + data[SampleBatch.PREV_ACTIONS] = batch(prev_a) + data[SampleBatch.PREV_REWARDS] = np.array(prev_r) + return data PrevRewardPrevActionEnvToModule = partial( diff --git a/rllib/connectors/learner/default_learner_connector.py b/rllib/connectors/learner/default_learner_connector.py index 4216f4790b5f3..3d8dd6dd9415d 100644 --- a/rllib/connectors/learner/default_learner_connector.py +++ b/rllib/connectors/learner/default_learner_connector.py @@ -32,8 +32,8 @@ class DefaultLearnerConnector(ConnectorV2): will be zero-padded, if necessary. If the user wants to customize their own data under the given keys (e.g. obs, - actions, ...), they can extract from the episodes or recompute from `input_` - their own data and store it in `input_` under those keys. In this case, the default + actions, ...), they can extract from the episodes or recompute from `data` + their own data and store it in `data` under those keys. 
In this case, the default connector will not change the data under these keys and simply act as a pass-through. """ @@ -43,16 +43,16 @@ def __call__( self, *, rl_module: RLModule, - input_: Any, + data: Any, episodes: List[EpisodeType], explore: Optional[bool] = None, persistent_data: Optional[dict] = None, **kwargs, ) -> Any: # If episodes are provided, extract the essential data from them, but only if - # respective keys are not present yet in `input_`. + # respective keys are not present yet in `data`. if not episodes: - return input_ + return data # Get the data dicts for all episodes. data_dicts = [episode.get_data_dict() for episode in episodes] @@ -60,10 +60,10 @@ def __call__( state_in = None T = rl_module.config.model_config_dict.get("max_seq_len") - # RLModule is stateful and STATE_IN is not found in `input_` (user's custom + # RLModule is stateful and STATE_IN is not found in `data` (user's custom # connectors have not provided this information yet) -> Perform separate # handling of STATE_OUT/STATE_IN keys: - if rl_module.is_stateful() and STATE_IN not in input_: + if rl_module.is_stateful() and STATE_IN not in data: if T is None: raise ValueError( "You are using a stateful RLModule and are not providing custom " @@ -104,11 +104,11 @@ def __call__( # Concatenate the individual episodes' STATE_INs. state_in = tree.map_structure(lambda *s: np.concatenate(s), *state_ins) - # Before adding anything else to the `input_`, add the time axis to existing + # Before adding anything else to the `data`, add the time axis to existing # data. - input_ = tree.map_structure( + data = tree.map_structure( lambda s: split_and_pad_single_record(s, episodes, T=T), - input_, + data, ) # Set the reduce function for all the data we might still have to extract @@ -125,8 +125,8 @@ def __call__( # episodes along the batch axis (axis=0). reduce_fn = np.concatenate - # Extract all data from the episodes and add to `input_`, if not already in - # `input_`. + # Extract all data from the episodes and add to `data`, if not already in + # `data`. for key in [ SampleBatch.OBS, SampleBatch.ACTIONS, @@ -136,35 +136,32 @@ def __call__( SampleBatch.T, # TODO: remove (normally not needed in train batch) *episodes[0].extra_model_outputs.keys(), ]: - if key not in input_ and key != STATE_OUT: + if key not in data and key != STATE_OUT: # Concatenate everything together (along B-axis=0). - input_[key] = tree.map_structure( + data[key] = tree.map_structure( lambda *s: reduce_fn(s), *[d[key] for d in data_dicts], ) # Handle infos (always lists, not numpy arrays). - if SampleBatch.INFOS not in input_: - input_[SampleBatch.INFOS] = sum( + if SampleBatch.INFOS not in data: + data[SampleBatch.INFOS] = sum( [d[SampleBatch.INFOS] for d in data_dicts], [], ) # Now that all "normal" fields are time-dim'd and zero-padded, add - # the STATE_IN column to `input_`. + # the STATE_IN column to `data`. if rl_module.is_stateful(): - input_[STATE_IN] = state_in + data[STATE_IN] = state_in # Also, create the loss mask (b/c of our now possibly zero-padded data) as - # well as the seq_lens array and add these to `input_` as well. - ( - input_["loss_mask"], - input_[SampleBatch.SEQ_LENS], - ) = create_mask_and_seq_lens( + # well as the seq_lens array and add these to `data` as well. 
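        # E.g., with T=4 and episode_lens=[3, 5], row-wise chunking presumably
        # yields seq_lens=[3, 4, 1] and a loss mask that is True for exactly
        # 3 + 5 = 8 of the 3 * 4 = 12 padded timesteps.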
+        (data["loss_mask"], data[SampleBatch.SEQ_LENS]) = create_mask_and_seq_lens(
             episode_lens=[len(episode) for episode in episodes],
             T=T,
         )

-        return input_
+        return data


 def split_and_pad(episodes_data, T):
diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py
index f27aba4999434..449316fb6b967 100644
--- a/rllib/connectors/module_to_env/default_module_to_env.py
+++ b/rllib/connectors/module_to_env/default_module_to_env.py
@@ -25,7 +25,7 @@ class DefaultModuleToEnv(ConnectorV2):

     If necessary, this connector samples actions, given action dist. inputs and a
     dist. class. The connector will only sample from the action distribution if the
-    SampleBatch.ACTIONS key cannot be found in `input_`. Otherwise, it'll behave
+    SampleBatch.ACTIONS key cannot be found in `data`. Otherwise, it will behave
     as a pass-through (no-op). If SampleBatch.ACTIONS is not present, but
     SampleBatch.ACTION_DIST_INPUTS are, the connector will create a new action
     distribution using the RLModule in the connector context and sample from this
@@ -83,7 +83,7 @@ def __call__(
         self,
         *,
         rl_module: RLModule,
-        input_: Any,
+        data: Any,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
         persistent_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:

         # Loop through all modules that created some output.
-        # for mid in input_.keys():
+        # for mid in data.keys():
         #     sa_module = ctx.rl_module.get_module(module_id=mid)

         # If our RLModule is stateful, remove the T=1 axis from all model outputs
         # (except the state outs, which never have this extra time axis).
         if rl_module.is_stateful():
-            state = input_.pop(STATE_OUT, None)
-            input_ = tree.map_structure(lambda s: np.squeeze(s, axis=1), input_)
+            state = data.pop(STATE_OUT, None)
+            data = tree.map_structure(lambda s: np.squeeze(s, axis=1), data)
             if state:
-                input_[STATE_OUT] = state
+                data[STATE_OUT] = state

         # ACTION_DIST_INPUTS field returned by `forward_exploration|inference()` ->
         # Create a new action distribution object.
         action_dist = None
-        if SampleBatch.ACTION_DIST_INPUTS in input_:
+        if SampleBatch.ACTION_DIST_INPUTS in data:
             if explore:
                 action_dist_class = rl_module.get_exploration_action_dist_cls()
             else:
                 action_dist_class = rl_module.get_inference_action_dist_cls()
             action_dist = action_dist_class.from_logits(
-                input_[SampleBatch.ACTION_DIST_INPUTS]
+                data[SampleBatch.ACTION_DIST_INPUTS]
             )

         # TODO (sven): Should this not already be taken care of by RLModule's
@@ -120,8 +120,8 @@ def __call__(
             action_dist = action_dist.to_deterministic()

         # If `forward_...()` returned actions, use them here as-is.
-        if SampleBatch.ACTIONS in input_:
-            actions = input_[SampleBatch.ACTIONS]
+        if SampleBatch.ACTIONS in data:
+            actions = data[SampleBatch.ACTIONS]
         # Otherwise, sample actions from the distribution.
         else:
             if action_dist is None:
@@ -134,10 +134,8 @@ def __call__(

         # For convenience and if possible, compute action logp from distribution
         # and add to output.
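         # (Only possible if a distribution object exists here; if the module
         # returned ACTIONS directly and no ACTION_DIST_INPUTS, the logp
         # computation below is skipped.)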
-        if action_dist is not None and SampleBatch.ACTION_LOGP not in input_:
-            input_[SampleBatch.ACTION_LOGP] = convert_to_numpy(
-                action_dist.logp(actions)
-            )
+        if action_dist is not None and SampleBatch.ACTION_LOGP not in data:
+            data[SampleBatch.ACTION_LOGP] = convert_to_numpy(action_dist.logp(actions))

         actions = convert_to_numpy(actions)

@@ -149,9 +147,9 @@ def __call__(
         elif self.clip_actions:
             actions = clip_action(actions, self._action_space_struct)

-        input_[SampleBatch.ACTIONS] = actions
+        data[SampleBatch.ACTIONS] = actions

         # Convert everything into numpy.
-        input_ = convert_to_numpy(input_)
+        data = convert_to_numpy(data)

-        return input_
+        return data

From b58ad312faa1843dd13ca9490024d2fc055752f3 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 21 Dec 2023 09:03:35 +0100
Subject: [PATCH 13/15] wip

Signed-off-by: sven1977
---
 rllib/connectors/common/frame_stacking.py                 | 2 +-
 rllib/connectors/connector_pipeline_v2.py                 | 4 ++--
 rllib/connectors/connector_v2.py                          | 4 ++--
 rllib/connectors/env_to_module/default_env_to_module.py   | 2 +-
 rllib/connectors/env_to_module/env_to_module_pipeline.py  | 4 ++--
 rllib/connectors/env_to_module/prev_action_prev_reward.py | 2 +-
 rllib/connectors/learner/default_learner_connector.py     | 2 +-
 rllib/connectors/module_to_env/default_module_to_env.py   | 2 +-
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/rllib/connectors/common/frame_stacking.py b/rllib/connectors/common/frame_stacking.py
index b139ee21593fa..3b7592b852a35 100644
--- a/rllib/connectors/common/frame_stacking.py
+++ b/rllib/connectors/common/frame_stacking.py
@@ -70,7 +70,7 @@ def __call__(
         data: Optional[Any],
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         # This is a data-in-data-out connector, so we expect `data` to be a dict
diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py
index 75f993efba050..76e6f952b91f2 100644
--- a/rllib/connectors/connector_pipeline_v2.py
+++ b/rllib/connectors/connector_pipeline_v2.py
@@ -36,7 +36,7 @@ def __call__(
         data: Any,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         """In a pipeline, we call each of our connector pieces one after the other.
@@ -55,7 +55,7 @@ def __call__(
                 data=ret,
                 episodes=episodes,
                 explore=explore,
-                persistent_data=persistent_data,
+                shared_data=shared_data,
                 **kwargs,
             )
         return ret
diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py
index 4e6c08d958f59..a4bad77b39da5 100644
--- a/rllib/connectors/connector_v2.py
+++ b/rllib/connectors/connector_v2.py
@@ -128,7 +128,7 @@ def __call__(
         data: Any,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         """Method for transforming input data into output data.
@@ -147,7 +147,7 @@ def __call__(
             explore: Whether `explore` is currently on. Per convention, if True, the
                 RLModule's `forward_exploration` method should be called; if False,
                 the EnvRunner should call `forward_inference` instead.
-            persistent_data: Optional additional context data that needs to be exchanged
+            shared_data: Optional additional context data that needs to be exchanged
                 between different connector pieces and pipelines.
             kwargs: Forward API-compatibility kwargs.
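
A minimal sketch of a custom connector piece written against this `__call__`
signature (the class name, the "obs" key handling, and the `get_observations(-1)`
episode accessor are assumptions for illustration, not confirmed API):

    from typing import Any, List, Optional

    from ray.rllib.connectors.connector_v2 import ConnectorV2
    from ray.rllib.core.rl_module.rl_module import RLModule
    from ray.rllib.utils.typing import EpisodeType


    class AddMostRecentObs(ConnectorV2):
        """Hypothetical env-to-module piece: puts each episode's latest obs
        into `data`."""

        def __call__(
            self,
            *,
            rl_module: RLModule,
            data: Any,
            episodes: List[EpisodeType],
            explore: Optional[bool] = None,
            shared_data: Optional[dict] = None,
            **kwargs,
        ) -> Any:
            # Act as a pass-through if an upstream piece already wrote the key.
            if "obs" not in data:
                # `get_observations(-1)` (most recent obs) is an assumed accessor.
                data["obs"] = [episode.get_observations(-1) for episode in episodes]
            # `shared_data` can hand information to downstream pieces within the
            # same pipeline call.
            if shared_data is not None:
                shared_data["num_episodes"] = len(episodes)
            return data
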
diff --git a/rllib/connectors/env_to_module/default_env_to_module.py b/rllib/connectors/env_to_module/default_env_to_module.py
index 1052ffab4d41b..9a5813403036f 100644
--- a/rllib/connectors/env_to_module/default_env_to_module.py
+++ b/rllib/connectors/env_to_module/default_env_to_module.py
@@ -37,7 +37,7 @@ def __call__(
         data: Optional[Any] = None,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         # If observations cannot be found in `data`, add the most recent ones (from all
diff --git a/rllib/connectors/env_to_module/env_to_module_pipeline.py b/rllib/connectors/env_to_module/env_to_module_pipeline.py
index b2a39b8ecfc25..5f790ec84a769 100644
--- a/rllib/connectors/env_to_module/env_to_module_pipeline.py
+++ b/rllib/connectors/env_to_module/env_to_module_pipeline.py
@@ -17,7 +17,7 @@ def __call__(
         data: Optional[Any] = None,
         episodes: List[EpisodeType],
         explore: bool,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ):
         # The user does not have to provide an initial `data` dict to this pipeline.
@@ -27,6 +27,6 @@ def __call__(
             data=data if data is not None else {},
             episodes=episodes,
             explore=explore,
-            persistent_data=persistent_data,
+            shared_data=shared_data,
             **kwargs,
         )
diff --git a/rllib/connectors/env_to_module/prev_action_prev_reward.py b/rllib/connectors/env_to_module/prev_action_prev_reward.py
index 4c890cd3f5133..cae717beee0b1 100644
--- a/rllib/connectors/env_to_module/prev_action_prev_reward.py
+++ b/rllib/connectors/env_to_module/prev_action_prev_reward.py
@@ -56,7 +56,7 @@ def __call__(
         data: Optional[Any],
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         # This is a data-in-data-out connector, so we expect `data` to be a dict
diff --git a/rllib/connectors/learner/default_learner_connector.py b/rllib/connectors/learner/default_learner_connector.py
index 3d8dd6dd9415d..6e17beb82f52a 100644
--- a/rllib/connectors/learner/default_learner_connector.py
+++ b/rllib/connectors/learner/default_learner_connector.py
@@ -46,7 +46,7 @@ def __call__(
         data: Any,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
         # If episodes are provided, extract the essential data from them, but only if
diff --git a/rllib/connectors/module_to_env/default_module_to_env.py b/rllib/connectors/module_to_env/default_module_to_env.py
index 449316fb6b967..e36b4c4c4771b 100644
--- a/rllib/connectors/module_to_env/default_module_to_env.py
+++ b/rllib/connectors/module_to_env/default_module_to_env.py
@@ -86,7 +86,7 @@ def __call__(
         data: Any,
         episodes: List[EpisodeType],
         explore: Optional[bool] = None,
-        persistent_data: Optional[dict] = None,
+        shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:

From 7bc0ac63b3086b00f853409157ceb52bce9cafd6 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 21 Dec 2023 12:03:10 +0100
Subject: [PATCH 14/15] wip

Signed-off-by: sven1977
---
 rllib/BUILD                                    | 13 +++++++------
 rllib/env/wrappers/atari_wrappers.py           |  1 +
 .../connectors/connector_v2_frame_stacking.py  |  2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/rllib/BUILD b/rllib/BUILD
index 6bbf72f3cc69a..0623e5455815e 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -781,12 +781,13 @@ py_test(

 # Tag: connector_v2
 # 
-------------------------------------------------------------------- -py_test( - name = "connectors/tests/test_connector_v2", - tags = ["team:rllib", "connector_v2"], - size = "small", - srcs = ["connectors/tests/test_connector_v2.py"] -) +# TODO (sven): Add these tests in a separate PR. +# py_test( +# name = "connectors/tests/test_connector_v2", +# tags = ["team:rllib", "connector_v2"], +# size = "small", +# srcs = ["connectors/tests/test_connector_v2.py"] +# ) # -------------------------------------------------------------------- # Env tests diff --git a/rllib/env/wrappers/atari_wrappers.py b/rllib/env/wrappers/atari_wrappers.py index 2919685cf6bc5..fb4fa762c819a 100644 --- a/rllib/env/wrappers/atari_wrappers.py +++ b/rllib/env/wrappers/atari_wrappers.py @@ -240,6 +240,7 @@ def reset(self, **kwargs): return self.env.reset(**kwargs) +@PublicAPI class NormalizedImageEnv(gym.ObservationWrapper): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py index 9f5f2fb395fb0..1119c2539bdd3 100644 --- a/rllib/examples/connectors/connector_v2_frame_stacking.py +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -82,7 +82,7 @@ def _make_learner_connector(input_observation_space, input_action_space): num_frames=args.num_frames, ) - # Create a custom Atari setup (w/o the usual Rllib-hard-coded framestacking in it). + # Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it). # We would like our frame stacking connector to do this job. tune.register_env( "env", From f7dde731627098199d1b21699dd941c3a314368c Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 21 Dec 2023 12:22:07 +0100 Subject: [PATCH 15/15] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm.py | 15 ++++----- rllib/algorithms/impala/impala.py | 7 ++-- rllib/algorithms/pg/pg.py | 5 ++- rllib/algorithms/ppo/ppo.py | 20 ++++++------ rllib/algorithms/ppo/tf/ppo_tf_rl_module.py | 20 ++++++------ .../ppo/torch/ppo_torch_rl_module.py | 20 +++++++----- rllib/connectors/connector_pipeline_v2.py | 32 +++++++------------ rllib/core/learner/torch/torch_learner.py | 1 + rllib/core/models/catalog.py | 11 ++++--- rllib/core/models/torch/encoder.py | 2 +- rllib/utils/filter_manager.py | 2 +- rllib/utils/numpy.py | 14 ++------ rllib/utils/tests/test_minibatch_utils.py | 8 ++--- rllib/utils/torch_utils.py | 4 +-- 14 files changed, 74 insertions(+), 87 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 2d68b89e53ba6..11ba31c794da3 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -564,6 +564,11 @@ def setup(self, config: AlgorithmConfig) -> None: config_obj.env = self._env_id self.config = config_obj + self._uses_new_env_runners = ( + self.config.env_runner_cls is not None + and not issubclass(self.config.env_runner_cls, RolloutWorker) + ) + # Set Algorithm's seed after we have - if necessary - enabled # tf eager-execution. update_global_seed_if_necessary(self.config.framework_str, self.config.seed) @@ -756,9 +761,7 @@ def setup(self, config: AlgorithmConfig) -> None: # Note that with the new EnvRunner API in combination with the new stack, # this information only needs to be kept in the LearnerGroup and not on the # EnvRunners anymore. 
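         # Old API stack only: push the LearnerGroup's "should this module be
         # updated?"-function down to all RolloutWorkers, so their local copies
         # stay in sync with the Learner side.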
- if self.config.env_runner_cls is None or issubclass( - self.config.env_runner_cls, RolloutWorker - ): + if not self._uses_new_env_runners: update_fn = self.learner_group.should_module_be_updated_fn self.workers.foreach_worker( lambda w: w.set_is_policy_to_train(update_fn), @@ -3031,11 +3034,7 @@ def _run_one_evaluation( """ eval_func_to_use = ( self._evaluate_async_with_env_runner - if ( - self.config.enable_async_evaluation - and self.config.env_runner_cls is not None - and not issubclass(self.config.env_runner_cls, RolloutWorker) - ) + if (self.config.enable_async_evaluation and self._uses_new_env_runners) else self._evaluate_async if self.config.enable_async_evaluation else self.evaluate diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index fabde3ee8eb4e..0f29ba3939d79 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -86,18 +86,17 @@ class ImpalaConfig(AlgorithmConfig): # Update the config object. config = config.training( - lr=tune.grid_search([0.0001, ]), grad_clip=20.0 + lr=tune.grid_search([0.0001, 0.0002]), grad_clip=20.0 ) config = config.resources(num_gpus=0) config = config.rollouts(num_rollout_workers=1) # Set the config object's env. config = config.environment(env="CartPole-v1") - # Use to_dict() to get the old-style python config dict - # when running with tune. + # Run with tune. tune.Tuner( "IMPALA", + param_space=config, run_config=air.RunConfig(stop={"training_iteration": 1}), - param_space=config.to_dict(), ).fit() .. testoutput:: diff --git a/rllib/algorithms/pg/pg.py b/rllib/algorithms/pg/pg.py index 390943f8fe143..b5cfa38044053 100644 --- a/rllib/algorithms/pg/pg.py +++ b/rllib/algorithms/pg/pg.py @@ -30,12 +30,11 @@ class PGConfig(AlgorithmConfig): >>> config = config.training(lr=tune.grid_search([0.001, 0.0001])) >>> # Set the config object's env. >>> config = config.environment(env="CartPole-v1") - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. + >>> # Run with tune. >>> tune.Tuner( # doctest: +SKIP ... "PG", ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), + ... param_space=config, ... ).fit() """ diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index c394b96914d83..9f9605312e2e3 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -253,13 +253,10 @@ def training( # Pass kwargs onto super's `training()` method. super().training(**kwargs) - # TODO (sven): Move to generic AlgorithmConfig. - if lr_schedule is not NotProvided: - self.lr_schedule = lr_schedule if use_critic is not NotProvided: self.use_critic = use_critic - # TODO (Kourosh) This is experimental. Set learner_hps parameters as - # well. Don't forget to remove .use_critic from algorithm config. + # TODO (Kourosh) This is experimental. + # Don't forget to remove .use_critic from algorithm config. if use_gae is not NotProvided: self.use_gae = use_gae if lambda_ is not NotProvided: @@ -280,8 +277,6 @@ def training( self.vf_loss_coeff = vf_loss_coeff if entropy_coeff is not NotProvided: self.entropy_coeff = entropy_coeff - if entropy_coeff_schedule is not NotProvided: - self.entropy_coeff_schedule = entropy_coeff_schedule if clip_param is not NotProvided: self.clip_param = clip_param if vf_clip_param is not NotProvided: @@ -289,6 +284,12 @@ def training( if grad_clip is not NotProvided: self.grad_clip = grad_clip + # TODO (sven): Remove these once new API stack is only option for PPO. 
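+        # (These two settings only take effect on the old API stack; keep
+        # accepting them so existing configs continue to work.)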
+        if lr_schedule is not NotProvided:
+            self.lr_schedule = lr_schedule
+        if entropy_coeff_schedule is not NotProvided:
+            self.entropy_coeff_schedule = entropy_coeff_schedule
+
         return self

     @override(AlgorithmConfig)
@@ -312,8 +313,8 @@ def validate(self) -> None:
             raise ValueError(
                 f"`sgd_minibatch_size` ({self.sgd_minibatch_size}) must be <= "
                 f"`train_batch_size` ({self.train_batch_size}). In PPO, the train batch"
-                f" is be split into {self.sgd_minibatch_size} chunks, each of which is "
-                f"iterated over (used for updating the policy) {self.num_sgd_iter} "
+                f" will be split into chunks of {self.sgd_minibatch_size} env steps "
+                f"each, and the entire batch is iterated over {self.num_sgd_iter} "
                 "times."
             )

@@ -476,7 +477,6 @@ def training_step(self) -> ResultDict:
             self.workers.local_worker().set_weights(weights)

         if self.config._enable_new_api_stack:
-
             kl_dict = {}
             if self.config.use_kl_loss:
                 for pid in policies_to_update:
diff --git a/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py b/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py
index 12856f9d0d8c0..2b30c810568da 100644
--- a/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py
+++ b/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py
@@ -20,13 +20,15 @@ class PPOTfRLModule(TfRLModule, PPORLModule):
     def _forward_inference(self, batch: NestedDict) -> Dict[str, Any]:
         output = {}

+        # Encoder forward pass.
         encoder_outs = self.encoder(batch)
         if STATE_OUT in encoder_outs:
             output[STATE_OUT] = encoder_outs[STATE_OUT]

-        # Actions
-        action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR])
-        output[SampleBatch.ACTION_DIST_INPUTS] = action_logits
+        # Pi head.
+        output[SampleBatch.ACTION_DIST_INPUTS] = self.pi(
+            encoder_outs[ENCODER_OUT][ACTOR]
+        )

         return output

@@ -34,8 +36,8 @@ def _forward_inference(self, batch: NestedDict) -> Dict[str, Any]:
     def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]:
         """PPO forward pass during exploration.

-        Besides the action distribution, this method also returns the parameters of the
-        policy distribution to be used for computing KL divergence between the old
+        Besides the action distribution, this method also returns the parameters of
+        the policy distribution to be used for computing KL divergence between the old
         policy and the new policy during training.
         """
         output = {}
@@ -51,7 +53,6 @@ def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]:

         # Policy head
         action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR])
-
         output[SampleBatch.ACTION_DIST_INPUTS] = action_logits

         return output
@@ -60,16 +61,17 @@ def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]:
     def _forward_train(self, batch: NestedDict):
         output = {}

-        # Shared encoder
+        # Shared encoder.
         encoder_outs = self.encoder(batch)
         if STATE_OUT in encoder_outs:
             output[STATE_OUT] = encoder_outs[STATE_OUT]

-        # Value head
+        # Value head.
         vf_out = self.vf(encoder_outs[ENCODER_OUT][CRITIC])
+        # Squeeze out last dim (value function node).
         output[SampleBatch.VF_PREDS] = tf.squeeze(vf_out, axis=-1)

-        # Policy head
+        # Policy head.
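+        # `pi` maps the actor branch of the encoder output to action dist inputs
+        # (e.g. logits for a categorical action distribution).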
action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) output[SampleBatch.ACTION_DIST_INPUTS] = action_logits diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py index 09010c872c896..745f45bb603f6 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py @@ -20,21 +20,24 @@ class PPOTorchRLModule(TorchRLModule, PPORLModule): def _forward_inference(self, batch: NestedDict) -> Dict[str, Any]: output = {} + # Encoder forward pass. encoder_outs = self.encoder(batch) if STATE_OUT in encoder_outs: output[STATE_OUT] = encoder_outs[STATE_OUT] - # Actions - action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) - output[SampleBatch.ACTION_DIST_INPUTS] = action_logits + # Pi head. + output[SampleBatch.ACTION_DIST_INPUTS] = self.pi( + encoder_outs[ENCODER_OUT][ACTOR] + ) return output @override(RLModule) def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]: """PPO forward pass during exploration. - Besides the action distribution, this method also returns the parameters of the - policy distribution to be used for computing KL divergence between the old + + Besides the action distribution, this method also returns the parameters of + the policy distribution to be used for computing KL divergence between the old policy and the new policy during training. """ output = {} @@ -58,16 +61,17 @@ def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]: def _forward_train(self, batch: NestedDict) -> Dict[str, Any]: output = {} - # Shared encoder + # Shared encoder. encoder_outs = self.encoder(batch) if STATE_OUT in encoder_outs: output[STATE_OUT] = encoder_outs[STATE_OUT] - # Value head + # Value head. vf_out = self.vf(encoder_outs[ENCODER_OUT][CRITIC]) + # Squeeze out last dim (value function node). output[SampleBatch.VF_PREDS] = vf_out.squeeze(-1) - # Policy head + # Policy head. action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) output[SampleBatch.ACTION_DIST_INPUTS] = action_logits diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index 76e6f952b91f2..86f0649d66a39 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -184,29 +184,21 @@ def append(self, connector: ConnectorV2) -> None: ) @override(ConnectorV2) - def get_state(self): - children = [] - for c in self.connectors: - state = c.serialize() - assert isinstance(state, tuple) and len(state) == 2, ( - "Serialized connector state must be in the format of " - f"Tuple[name: str, params: Any]. Instead we got {state}" - f"for connector {c.__name__}." 
- ) - children.append(state) - return ConnectorPipelineV2.__name__, children + def get_state(self) -> Dict[str, Any]: + states = {} + for i, connector in enumerate(self.connectors): + key = f"{i:03d}_{type(connector).__name__}" + state = connector.get_state() + states[key] = state + return states @override(ConnectorV2) def set_state(self, state: Dict[str, Any]) -> None: - connectors = [] - for state in params: - try: - name, subparams = state - connectors.append(get_connector(name, ctx, subparams)) - except Exception as e: - logger.error(f"Failed to de-serialize connector state: {state}") - raise e - return ConnectorPipelineV2(ctx, connectors) + for i, connector in enumerate(self.connectors): + key = f"{i:03d}_{type(connector).__name__}" + if key not in state: + raise KeyError(f"No state found in `state` for connector piece: {key}!") + connector.set_state(state[key]) def __repr__(self, indentation: int = 0): return "\n".join( diff --git a/rllib/core/learner/torch/torch_learner.py b/rllib/core/learner/torch/torch_learner.py index 6e229b5f299a8..c022909120794 100644 --- a/rllib/core/learner/torch/torch_learner.py +++ b/rllib/core/learner/torch/torch_learner.py @@ -215,6 +215,7 @@ def get_parameters(self, module: RLModule) -> Sequence[Param]: @override(Learner) def _convert_batch_type(self, batch: MultiAgentBatch) -> MultiAgentBatch: batch = convert_to_torch_tensor(batch.policy_batches, device=self._device) + # TODO (sven): This computation of `env_steps` is not accurate! length = max(len(b) for b in batch.values()) batch = MultiAgentBatch(batch, env_steps=length) return batch diff --git a/rllib/core/models/catalog.py b/rllib/core/models/catalog.py index b956343babae7..aaf0f1d9fb83d 100644 --- a/rllib/core/models/catalog.py +++ b/rllib/core/models/catalog.py @@ -55,7 +55,6 @@ class Catalog: from ray.rllib.core.models.configs import MLPHeadConfig from ray.rllib.core.models.catalog import Catalog - class MyCatalog(Catalog): def __init__( self, @@ -64,17 +63,19 @@ def __init__( model_config_dict: dict, ): super().__init__(observation_space, action_space, model_config_dict) - self.my_model_config_dict = MLPHeadConfig( + self.my_model_config = MLPHeadConfig( hidden_layer_dims=[64, 32], input_dims=[self.observation_space.shape[0]], ) def build_my_head(self, framework: str): - return self.my_model_config_dict.build(framework=framework) + return self.my_model_config.build(framework=framework) # With that, RLlib can build and use models from this catalog like this: catalog = MyCatalog(gym.spaces.Box(0, 1), gym.spaces.Box(0, 1), {}) - my_head = catalog.build_my_head("torch") + my_head = catalog.build_my_head(framework="torch") + + # Make a call to the built model. out = my_head(torch.Tensor([[1]])) """ @@ -348,7 +349,7 @@ def get_tokenizer_config( ) -> ModelConfig: """Returns a tokenizer config for the given space. - This is useful for recurrent / tranformer models that need to tokenize their + This is useful for recurrent / transformer models that need to tokenize their inputs. By default, RLlib uses the models supported by Catalog out of the box to tokenize. diff --git a/rllib/core/models/torch/encoder.py b/rllib/core/models/torch/encoder.py index dd90c5af02a35..5d5ee38ed8d5b 100644 --- a/rllib/core/models/torch/encoder.py +++ b/rllib/core/models/torch/encoder.py @@ -175,7 +175,7 @@ def __init__(self, config: RecurrentEncoderConfig) -> None: assert len(gru_input_dims) == 1 gru_input_dim = gru_input_dims[0] - # Create the torch LSTM layer. + # Create the torch GRU layer. 
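+        # (Input size is the single, flattened tokenizer output dim asserted
+        # above; the hidden size comes from the encoder config.)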
         self.gru = nn.GRU(
             gru_input_dim,
             config.hidden_dim,
diff --git a/rllib/utils/filter_manager.py b/rllib/utils/filter_manager.py
index e4b71af66d09e..8bcba09793421 100644
--- a/rllib/utils/filter_manager.py
+++ b/rllib/utils/filter_manager.py
@@ -29,7 +29,7 @@ def synchronize(

         Args:
             local_filters: Filters to be synchronized.
-            remotes: Remote evaluators with filters.
+            worker_set: The WorkerSet, whose remote EnvRunners hold the filters.
             update_remote: Whether to push updates from the local filters to the remote
                 workers' filters.
             timeout_seconds: How long to wait for filter to get or set filters
diff --git a/rllib/utils/numpy.py b/rllib/utils/numpy.py
index 9f040c8a0c286..944d4b758c8c4 100644
--- a/rllib/utils/numpy.py
+++ b/rllib/utils/numpy.py
@@ -7,11 +7,7 @@

 from ray.rllib.utils.annotations import PublicAPI
-from ray.rllib.utils.deprecation import (
-    DEPRECATED_VALUE,
-    deprecation_warning,
-    Deprecated,
-)
+from ray.rllib.utils.deprecation import Deprecated
 from ray.rllib.utils.framework import try_import_tf, try_import_torch
 from ray.rllib.utils.typing import SpaceStruct, TensorType, TensorStructType, Union

@@ -122,9 +118,7 @@ def concat_aligned(


 @PublicAPI
-def convert_to_numpy(
-    x: TensorStructType, reduce_type: bool = True, reduce_floats=DEPRECATED_VALUE
-):
+def convert_to_numpy(x: TensorStructType, reduce_type: bool = True) -> TensorStructType:
     """Converts values in `x` to non-Tensor numpy or python types.

     Args:
@@ -139,10 +133,6 @@ def convert_to_numpy(
         values converted to numpy arrays (on CPU).
     """

-    if reduce_floats != DEPRECATED_VALUE:
-        deprecation_warning(old="reduce_floats", new="reduce_types", error=True)
-        reduce_type = reduce_floats
-
     # The mapping function used to numpyize torch/tf Tensors (and move them
     # to the CPU beforehand).
     def mapping(item):
diff --git a/rllib/utils/tests/test_minibatch_utils.py b/rllib/utils/tests/test_minibatch_utils.py
index a8d8180d05129..0256e9ffab311 100644
--- a/rllib/utils/tests/test_minibatch_utils.py
+++ b/rllib/utils/tests/test_minibatch_utils.py
@@ -93,8 +93,8 @@ def test_minibatch_cyclic_iterator(self):
                     check(policy_batch.count, mini_batch_size)
                 iteration_counter += 1

-            # for each policy check that the last item in batch matches the expected
-            # values, i.e. iteration_counter * mini_batch_size % agent_steps - 1
+            # For each policy check that the last item in batch matches the expected
+            # values, i.e. iteration_counter * mini_batch_size % agent_steps - 1.
             total_steps = iteration_counter * mini_batch_size
             for policy_idx, policy_batch in enumerate(
                 batch.policy_batches.values()
@@ -104,8 +104,8 @@ def test_minibatch_cyclic_iterator(self):
                     expected_last_item = 0.0
                 check(policy_batch["obs"][-1], expected_last_item)

-            # check iteration counter (should be
-            # ceil(num_gsd_iter * max(agent_steps) / mini_batch_size))
+            # Check iteration counter (should be
+            # ceil(num_sgd_iter * max(agent_steps) / mini_batch_size)).
             expected_iteration_counter = np.ceil(
                 num_sgd_iter * max(agent_steps) / mini_batch_size
             )
diff --git a/rllib/utils/torch_utils.py b/rllib/utils/torch_utils.py
index 0a56abf83a502..68c8ebda458e3 100644
--- a/rllib/utils/torch_utils.py
+++ b/rllib/utils/torch_utils.py
@@ -217,8 +217,8 @@ def convert_to_torch_tensor(x: TensorStructType, device: Optional[str] = None):

     Returns:
         Any: A new struct with the same structure as `x`, but with all
-        values converted to torch Tensor types. This does not convert possibly
-        nested elements that are None because torch has no representation for that.
+        values converted to torch Tensor types. This does not convert
+        possibly-nested None elements, because torch has no representation for None.
     """

     def mapping(item):
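        # Maps one leaf of the input struct to a torch tensor on the requested
        # device (None leaves are returned unchanged, per the docstring above).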