diff --git a/rllib/BUILD b/rllib/BUILD index 16af480689f2b..5cd99351b97c0 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2086,6 +2086,27 @@ py_test( # tagged by @OldAPIStack and/or @HybridAPIStack # ---------------------- +# subdirectory: actions/ + +# Nested action spaces (flattening obs and learning w/ multi-action distribution). +py_test( + name = "examples/actions/nested_action_spaces_ppo", + main = "examples/actions/nested_action_spaces.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "large", + srcs = ["examples/actions/nested_action_spaces.py"], + args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--stop-reward=-500.0", "--algo=PPO"] +) + +py_test( + name = "examples/actions/nested_action_spaces_multi_agent_ppo", + main = "examples/actions/nested_action_spaces.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "large", + srcs = ["examples/actions/nested_action_spaces.py"], + args = ["--enable-new-api-stack", "--as-test", "--num-agents=2", "--framework=torch", "--stop-reward=-1000.0", "--algo=PPO"] +) + # subdirectory: algorithms/ #@OldAPIStack @@ -2213,41 +2234,22 @@ py_test( args = ["--enable-new-api-stack", "--num-agents=2", "--stop-iter=2", "--framework=torch", "--algo=PPO", "--num-env-runners=4", "--num-cpus=6"] ) -# Nested action spaces (flattening obs and learning w/ multi-action distribution). -py_test( - name = "examples/connectors/nested_action_spaces_ppo", - main = "examples/connectors/nested_action_spaces.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/connectors/nested_action_spaces.py"], - args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--stop-reward=-500.0", "--algo=PPO"] -) - -py_test( - name = "examples/connectors/nested_action_spaces_multi_agent_ppo", - main = "examples/connectors/nested_action_spaces.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/connectors/nested_action_spaces.py"], - args = ["--enable-new-api-stack", "--as-test", "--num-agents=2", "--framework=torch", "--stop-reward=-1000.0", "--algo=PPO"] -) - # Nested observation spaces (flattening). 
py_test( - name = "examples/connectors/nested_observation_spaces_ppo", - main = "examples/connectors/nested_observation_spaces.py", + name = "examples/connectors/flatten_observations_dict_space_ppo", + main = "examples/connectors/flatten_observations_dict_space.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/nested_observation_spaces.py"], + srcs = ["examples/connectors/flatten_observations_dict_space.py"], args = ["--enable-new-api-stack", "--as-test", "--stop-reward=400.0", "--framework=torch", "--algo=PPO"] ) py_test( - name = "examples/connectors/nested_observation_spaces_multi_agent_ppo", - main = "examples/connectors/nested_observation_spaces.py", + name = "examples/connectors/flatten_observations_dict_space_multi_agent_ppo", + main = "examples/connectors/flatten_observations_dict_space.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/nested_observation_spaces.py"], + srcs = ["examples/connectors/flatten_observations_dict_space.py"], args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=800.0", "--framework=torch", "--algo=PPO"] ) diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index f04376dcd9937..3156c66a7a69d 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -312,7 +312,7 @@ def _fix_spaces(self): obs_space = self.input_observation_space act_space = self.input_action_space for con in self.connectors: - con.input_observation_space = obs_space con.input_action_space = act_space + con.input_observation_space = obs_space obs_space = con.observation_space act_space = con.action_space diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index ad7bd9eed4bb1..e43f7515faeab 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -84,8 +84,9 @@ def __init__( self._action_space = None self._input_observation_space = None self._input_action_space = None - self.input_observation_space = input_observation_space + self.input_action_space = input_action_space + self.input_observation_space = input_observation_space @OverrideToImplementCustomLogic def recompute_observation_space_from_input_spaces(self) -> gym.Space: diff --git a/rllib/connectors/env_to_module/__init__.py b/rllib/connectors/env_to_module/__init__.py index 8f2750c9a8075..98b73bd9962bb 100644 --- a/rllib/connectors/env_to_module/__init__.py +++ b/rllib/connectors/env_to_module/__init__.py @@ -14,7 +14,7 @@ FlattenObservations, ) from ray.rllib.connectors.env_to_module.prev_actions_prev_rewards import ( - PrevActionsPrevRewardsConnector, + PrevActionsPrevRewards, ) from ray.rllib.connectors.env_to_module.write_observations_to_episodes import ( WriteObservationsToEpisodes, @@ -29,6 +29,6 @@ "EnvToModulePipeline", "FlattenObservations", "NumpyToTensor", - "PrevActionsPrevRewardsConnector", + "PrevActionsPrevRewards", "WriteObservationsToEpisodes", ] diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index 1958f9e871d17..6a2e60173b65c 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -6,7 +6,6 @@ import tree # pip install dm_tree from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.core.columns import Columns from ray.rllib.core.rl_module.rl_module import RLModule from 
ray.rllib.utils.annotations import override from ray.rllib.utils.numpy import flatten_inputs_to_1d_tensor @@ -19,18 +18,12 @@ class FlattenObservations(ConnectorV2): """A connector piece that flattens all observation components into a 1D array. - - Only works on data that has already been added to the batch. - - This connector makes the assumption that under the Columns.OBS key in batch, - there is either a list of individual env observations to be flattened (single-agent - case) or a dict mapping agent- and module IDs to lists of data items to be - flattened (multi-agent case). - - Does NOT work in a Learner pipeline as it operates on individual observation - items (as opposed to batched/time-ranked data). - - Therefore, assumes that the altered (flattened) observations will be written - back into the episode by a later connector piece in the env-to-module pipeline - (which this piece is part of as well). - - Does NOT read any information from the given list of Episode objects. - - Does NOT write any observations (or other data) to the given Episode objects. + - Works directly on the incoming episodes list and changes the last observation + in-place (write the flattened observation back into the episode). + - This connector does NOT alter the incoming batch (`data`) when called. + - This connector does NOT work in a `LearnerConnectorPipeline` because it requires + the incoming episodes to still be ongoing (in progress) as it only alters the + latest observation, not all observations in an episode. .. testcode:: @@ -38,6 +31,7 @@ class FlattenObservations(ConnectorV2): import numpy as np from ray.rllib.connectors.env_to_module import FlattenObservations + from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.utils.test_utils import check # Some arbitrarily nested, complex observation space. @@ -51,24 +45,26 @@ class FlattenObservations(ConnectorV2): }) act_space = gym.spaces.Discrete(2) - # A batch of two example items, both coming from the above defined observation - # space. - batch = { - "obs": [ - # 1st example item. + # Two example episodes, both with initial (reset) observations coming from the + # above defined observation space. + episode_1 = SingleAgentEpisode( + observations=[ { "a": np.array(-10.0, np.float32), "b": (1, np.array([[-1.0], [-1.0]], np.float32)), "c": np.array([0, 2]), }, - # 2nd example item. + ], + ) + episode_2 = SingleAgentEpisode( + observations=[ { "a": np.array(10.0, np.float32), "b": (0, np.array([[1.0], [1.0]], np.float32)), "c": np.array([1, 1]), }, ], - } + ) # Construct our connector piece. connector = FlattenObservations(obs_space, act_space) @@ -76,23 +72,23 @@ class FlattenObservations(ConnectorV2): # Call our connector piece with the example data. output_data = connector( rl_module=None, # This connector works without an RLModule. - data=batch, - episodes=[], # This connector does not need the `episodes` input. + data={}, # This connector does not alter any data. + episodes=[episode_1, episode_2], explore=True, shared_data={}, ) - # The connector does not change the number of items in the data (still 2 items). - check(len(output_data["obs"]), 2) + # The connector does not alter the data and acts as pure pass-through. + check(output_data, {}) - # The connector has flattened each item in the data to a 1D tensor. + # The connector has flattened each item in the episodes to a 1D tensor. check( - output_data["obs"][0], + episode_1.get_observations(0), # box() disc(2). box(2, 1). multidisc(2, 3)........ 
np.array([-10.0, 0.0, 1.0, -1.0, -1.0, 1.0, 0.0, 0.0, 0.0, 1.0]), ) check( - output_data["obs"][1], + episode_2.get_observations(0), # box() disc(2). box(2, 1). multidisc(2, 3)........ np.array([10.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]), ) @@ -169,40 +165,42 @@ def __call__( shared_data: Optional[dict] = None, **kwargs, ) -> Any: - observations = data.get(Columns.OBS) - - if observations is None: - raise ValueError( - f"`batch` must already have a column named {Columns.OBS} in it " - f"for this connector to work!" - ) - - # Process each item under the Columns.OBS key individually and flatten - # it. We are using the `ConnectorV2.foreach_batch_item_change_in_place` API, - # allowing us to not worry about multi- or single-agent setups and returning - # the new version of each item we are iterating over. - self.foreach_batch_item_change_in_place( - batch=data, - column=Columns.OBS, - func=( - lambda item, eps_id, agent_id, module_id: ( - # Multi-agent AND skip this AgentID. - item - if self._agent_ids and agent_id not in self._agent_ids - # Single-agent or flatten this AgentIDs observation. - else flatten_inputs_to_1d_tensor( - item, + for sa_episode in self.single_agent_episode_iterator( + episodes, agents_that_stepped_only=True + ): + # Episode is not finalized yet and thus still operates on lists of items. + assert not sa_episode.is_finalized + + last_obs = sa_episode.get_observations(-1) + + if self._multi_agent: + if ( + self._agent_ids is not None + and sa_episode.agent_id not in self._agent_ids + ): + flattened_obs = last_obs + else: + flattened_obs = flatten_inputs_to_1d_tensor( + inputs=last_obs, # In the multi-agent case, we need to use the specific agent's # space struct, not the multi-agent observation space dict. - ( - self._input_obs_base_struct - if not agent_id - else self._input_obs_base_struct[agent_id] - ), - # Our items are bare observations (no batch axis present). + spaces_struct=self._input_obs_base_struct[sa_episode.agent_id], + # Our items are individual observations (no batch axis present). batch_axis=False, ) + else: + flattened_obs = flatten_inputs_to_1d_tensor( + inputs=last_obs, + spaces_struct=self._input_obs_base_struct, + # Our items are individual observations (no batch axis present). + batch_axis=False, ) - ), - ) + + # Write new observation directly back into the episode. + sa_episode.set_observations(at_indices=-1, new_data=flattened_obs) + # We set the Episode's observation space to ours so that we can safely + # set the last obs to the new value (without causing a space mismatch + # error). + sa_episode.observation_space = self.observation_space + return data diff --git a/rllib/connectors/env_to_module/mean_std_filter.py b/rllib/connectors/env_to_module/mean_std_filter.py index e4709aff5b44e..c0bdf8bc65447 100644 --- a/rllib/connectors/env_to_module/mean_std_filter.py +++ b/rllib/connectors/env_to_module/mean_std_filter.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Optional -from gymnasium.spaces import Discrete, MultiDiscrete import gymnasium as gym +from gymnasium.spaces import Discrete, MultiDiscrete import numpy as np import tree @@ -121,13 +121,10 @@ def __call__( sa_obs, update=self._update_stats ) sa_episode.set_observations(at_indices=-1, new_data=normalized_sa_obs) - - if len(sa_episode) == 0: - # TODO (sven): This is kind of a hack. - # We set the Episode's observation space to ours so that we can safely - # set the last obs to the new value (without causing a space mismatch - # error). 
- sa_episode.observation_space = self.observation_space + # We set the Episode's observation space to ours so that we can safely + # set the last obs to the new value (without causing a space mismatch + # error). + sa_episode.observation_space = self.observation_space # Leave `data` as is. RLlib's default connector will automatically # populate the OBS column therein from the episodes' now transformed diff --git a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py index 5a0222fceb0cd..5b26cd1f8b872 100644 --- a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py +++ b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py @@ -5,14 +5,13 @@ import numpy as np from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.core.columns import Columns from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.utils.annotations import override from ray.rllib.utils.spaces.space_utils import batch, flatten_to_single_ndarray from ray.rllib.utils.typing import EpisodeType -class PrevActionsPrevRewardsConnector(ConnectorV2): +class PrevActionsPrevRewards(ConnectorV2): """A connector piece that adds previous rewards and actions to the input obs. - Requires Columns.OBS to be already a part of the batch. @@ -36,13 +35,11 @@ class PrevActionsPrevRewardsConnector(ConnectorV2): """ ORIG_OBS_KEY = "_orig_obs" - PREV_ACTIONS_KEY = "prev_actions" - PREV_REWARDS_KEY = "prev_rewards" + PREV_ACTIONS_KEY = "prev_n_actions" + PREV_REWARDS_KEY = "prev_n_rewards" @override(ConnectorV2) def recompute_observation_space_from_input_spaces(self): - if self.input_action_space is None: - return None if self._multi_agent: ret = {} for agent_id, obs_space in self.input_observation_space.spaces.items(): @@ -64,7 +61,7 @@ def __init__( n_prev_rewards: int = 1, **kwargs, ): - """Initializes a PrevActionsPrevRewardsConnector instance. + """Initializes a PrevActionsPrevRewards instance. Args: multi_agent: Whether this is a connector operating on a multi-agent @@ -108,23 +105,16 @@ def __call__( shared_data: Optional[dict] = None, **kwargs, ) -> Any: - observations = data.get(Columns.OBS) - - if observations is None: - raise ValueError( - f"`batch` must already have a column named {Columns.OBS} in it " - f"for this connector to work!" - ) - - new_obs = [] - for sa_episode, orig_obs in self.single_agent_episode_iterator( - episodes, zip_with_batch_column=observations + for sa_episode in self.single_agent_episode_iterator( + episodes, agents_that_stepped_only=True ): # Episode is not finalized yet and thus still operates on lists of items. assert not sa_episode.is_finalized + augmented_obs = {self.ORIG_OBS_KEY: sa_episode.get_observations(-1)} + if self.n_prev_actions: - prev_n_actions = flatten_to_single_ndarray( + augmented_obs[self.PREV_ACTIONS_KEY] = flatten_to_single_ndarray( batch( sa_episode.get_actions( indices=slice(-self.n_prev_actions, None), @@ -135,28 +125,19 @@ def __call__( ) if self.n_prev_rewards: - prev_n_rewards = np.array( + augmented_obs[self.PREV_REWARDS_KEY] = np.array( sa_episode.get_rewards( indices=slice(-self.n_prev_rewards, None), fill=0.0, ) ) - new_obs.append( - { - self.ORIG_OBS_KEY: orig_obs, - self.PREV_ACTIONS_KEY: prev_n_actions, - self.PREV_REWARDS_KEY: prev_n_rewards, - } - ) - - # Convert the observations in the batch into a dict with the keys: - # "_obs", "_prev_rewards", and "_prev_actions". 
- self.foreach_batch_item_change_in_place( - batch=data, - column=Columns.OBS, - func=lambda orig_obs, eps_id, agent_id, module_id: new_obs.pop(0), - ) + # Write new observation directly back into the episode. + sa_episode.set_observations(at_indices=-1, new_data=augmented_obs) + # We set the Episode's observation space to ours so that we can safely + # set the last obs to the new value (without causing a space mismatch + # error). + sa_episode.observation_space = self.observation_space return data diff --git a/rllib/examples/actions/__init__.py b/rllib/examples/actions/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/rllib/examples/connectors/nested_action_spaces.py b/rllib/examples/actions/nested_action_spaces.py similarity index 88% rename from rllib/examples/connectors/nested_action_spaces.py rename to rllib/examples/actions/nested_action_spaces.py index 830b87fb25fb0..db7ad434c6743 100644 --- a/rllib/examples/connectors/nested_action_spaces.py +++ b/rllib/examples/actions/nested_action_spaces.py @@ -1,11 +1,7 @@ from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete from ray.tune.registry import register_env -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.examples.envs.classes.multi_agent import ( MultiAgentNestedSpaceRepeatAfterMeEnv, ) @@ -26,13 +22,13 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define env-to-module-connector pipeline for the new stack. def _env_to_module_pipeline(env): - return [ - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=args.num_agents > 0), - WriteObservationsToEpisodes(), - ] + return FlattenObservations(multi_agent=args.num_agents > 0) # Register our environment with tune. if args.num_agents > 0: diff --git a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py index 0419a8ae1512e..33204e52d5e94 100644 --- a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py +++ b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py @@ -1,16 +1,16 @@ """Example extracting a checkpoint from n trials using one or more custom criteria. This example: -- runs a CartPole experiment with three different learning rates (three tune -"trials"). During the experiment, for each trial, we create a checkpoint at each -iteration. -- at the end of the experiment, we compare the trials and pick the one that performed -best, based on the criterion: Lowest episode count per single iteration (for CartPole, -a low episode count means the episodes are very long and thus the reward is also very -high). -- from that best trial (with the lowest episode count), we then pick those checkpoints -that a) have the lowest policy loss (good) and b) have the highest value function loss -(bad). + - runs a CartPole experiment with three different learning rates (three tune + "trials"). During the experiment, for each trial, we create a checkpoint at each + iteration. + - at the end of the experiment, we compare the trials and pick the one that + performed best, based on the criterion: Lowest episode count per single iteration + (for CartPole, a low episode count means the episodes are very long and thus the + reward is also very high). 
+ - from that best trial (with the lowest episode count), we then pick those + checkpoints that a) have the lowest policy loss (good) and b) have the highest value + function loss (bad). How to run this script diff --git a/rllib/examples/checkpoints/continue_training_from_checkpoint.py b/rllib/examples/checkpoints/continue_training_from_checkpoint.py index a8400659d9604..c52a7868b4e8e 100644 --- a/rllib/examples/checkpoints/continue_training_from_checkpoint.py +++ b/rllib/examples/checkpoints/continue_training_from_checkpoint.py @@ -4,15 +4,16 @@ and you would therefore like to make your setup more robust and fault-tolerant. This example: -- runs a single- or multi-agent CartPole experiment (for multi-agent, we use different -learning rates) thereby checkpointing the state of the Algorithm every n iterations. -- stops the experiment due to an expected crash in the algorithm's main process after -a certain number of iterations. -- just for testing purposes, restores the entire algorithm from the latest checkpoint -and checks, whether the state of the restored algo exactly match the state of the -crashed one. -- then continues training with the restored algorithm until the desired final episode -return is reached. + - runs a single- or multi-agent CartPole experiment (for multi-agent, we use + different learning rates) thereby checkpointing the state of the Algorithm every n + iterations. + - stops the experiment due to an expected crash in the algorithm's main process + after a certain number of iterations. + - just for testing purposes, restores the entire algorithm from the latest + checkpoint and checks, whether the state of the restored algo exactly match the + state of the crashed one. + - then continues training with the restored algorithm until the desired final + episode return is reached. How to run this script diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index fb53e2cb876f1..bf6889113fed3 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -4,12 +4,13 @@ This example: - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` - iterations. - - Stops the experiments after the agents reach a combined return of `-800`. + iterations. + - Stops the experiments after the agents reach a combined return of -800. - Picks the best checkpoint by combined return and restores policy 0 from it. - Runs a second experiment with the restored `RLModule` for policy 0 and a fresh `RLModule` for the other policies. - - Stops the second experiment after the agents reach a combined return of `-800`. + - Stops the second experiment after the agents reach a combined return of -800. 
+ + How to run this script ---------------------- @@ -34,6 +35,7 @@ `--wandb-key=[your WandB API key] --wandb-project=[some project name] --wandb-run-name=[optional: WandB run name (within the defined project)]` + Results to expect ----------------- You should expect a reward of -400.0 eventually being achieved by a simple diff --git a/rllib/examples/connectors/flatten_observations_dict_space.py b/rllib/examples/connectors/flatten_observations_dict_space.py new file mode 100644 index 0000000000000..bed31ce5ac284 --- /dev/null +++ b/rllib/examples/connectors/flatten_observations_dict_space.py @@ -0,0 +1,157 @@ +"""Example using a ConnectorV2 to flatten arbitrarily nested dict or tuple observations. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FlattenObservations` ConnectorV2 piece can be added to the + env-to-module pipeline. + - demonstrates that by using this connector, any arbitrarily nested dict or tuple + observation is properly flattened into a simple 1D tensor, for easier RLModule + processing. + - shows how - in a multi-agent setup - individual agents can be specified, whose + observations should be flattened (while other agents' observations will always + be left as-is). + - uses a variant of the CartPole-v1 environment, in which the 4 observation items + (x-pos, x-veloc, angle, and angle-veloc) are taken apart and put into a nested dict + with the structure: + { + "x-pos": [x-pos], + "angular-pos": { + "value": [angle], + "some_random_stuff": [random Discrete(3)], # <- should be ignored by algo + }, + "velocs": Tuple([x-veloc], [angle-veloc]), + } + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging.
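The following standalone sketch (not part of this script) illustrates what this flattening does, using the same `flatten_inputs_to_1d_tensor` utility that the `FlattenObservations` connector relies on internally. The concrete spaces below are only illustrative stand-ins for the ones defined in `CartPoleWithDictObservationSpace`:

import gymnasium as gym
import numpy as np

from ray.rllib.utils.numpy import flatten_inputs_to_1d_tensor
from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space

# Illustrative nested observation space (the real one lives in
# CartPoleWithDictObservationSpace and may use different bounds).
obs_space = gym.spaces.Dict({
    "x-pos": gym.spaces.Box(-4.8, 4.8, (1,), np.float32),
    "angular-pos": gym.spaces.Dict({
        "value": gym.spaces.Box(-0.42, 0.42, (1,), np.float32),
        "some_random_stuff": gym.spaces.Discrete(3),
    }),
    "velocs": gym.spaces.Tuple((
        gym.spaces.Box(-10.0, 10.0, (1,), np.float32),
        gym.spaces.Box(-10.0, 10.0, (1,), np.float32),
    )),
})

obs = obs_space.sample()
flat = flatten_inputs_to_1d_tensor(
    inputs=obs,
    spaces_struct=get_base_struct_from_space(obs_space),
    # A single observation item, no batch axis present.
    batch_axis=False,
)
# `flat` is a single 1D np.ndarray; Discrete components get one-hot encoded
# (compare the doctest in rllib/connectors/env_to_module/flatten_observations.py),
# so here: 1 + 1 + 3 + 1 + 1 = 7 values.
print(flat.shape)  # -> (7,)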
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + ++---------------------+------------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+----------------+--------+------------------+ +| PPO_env_a2fd6_00000 | TERMINATED | 127.0.0.1:7409 | 25 | 24.1426 | ++---------------------+------------+----------------+--------+------------------+ +------------------------+------------------------+------------------------+ + num_env_steps_sample | num_env_steps_traine | episode_return_mean | + d_lifetime | d_lifetime | | +------------------------+------------------------+------------------------| + 100000 | 100000 | 421.42 | +------------------------+------------------------+------------------------+ +""" +from ray.tune.registry import register_env +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import ( + CartPoleWithDictObservationSpace, +) +from ray.rllib.examples.envs.classes.multi_agent import ( + MultiAgentCartPoleWithDictObservationSpace, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=400.0) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Define env-to-module-connector pipeline for the new stack. + def _env_to_module_pipeline(env): + return FlattenObservations(multi_agent=args.num_agents > 0) + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda _: MultiAgentCartPoleWithDictObservationSpace( + config={"num_agents": args.num_agents} + ), + ) + else: + register_env("env", lambda _: CartPoleWithDictObservationSpace()) + + # Define the AlgorithmConfig used. + config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .env_runners(env_to_module_connector=_env_to_module_pipeline) + .training( + gamma=0.99, + lr=0.0003, + ) + ) + if args.enable_new_api_stack: + config = config.rl_module( + model_config_dict={ + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, + "uses_new_env_runners": True, + }, + ) + else: + config = config.training( + model=dict( + fcnet_hiddens=[32], fcnet_activation="linear", vf_share_layers=True + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + config = config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Fix some PPO-specific settings. + if args.algo == "PPO": + config = config.training( + num_sgd_iter=6, + vf_loss_coeff=0.01, + ) + + # Run everything as configured. 
+ run_rllib_example_script_experiment(config, args) diff --git a/rllib/examples/connectors/frame_stacking.py b/rllib/examples/connectors/frame_stacking.py index 6abce5582b0b8..e26918796ff43 100644 --- a/rllib/examples/connectors/frame_stacking.py +++ b/rllib/examples/connectors/frame_stacking.py @@ -1,15 +1,81 @@ -""" Example using connectors (V2) for frame-stacking in Atari environments. +"""Example using 2 ConnectorV2 for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + How to run this script ---------------------- -`python [script file name].py --enable-new-api-stack` +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + For debugging, use the following additional command line options `--no-tune --num-env-runners=0` which should allow you to set breakpoints anywhere in the RLlib code and have the execution stop there for inspection and debugging. + For logging to your WandB account, use: `--wandb-key=[your WandB API key] --wandb-project=[some project name] --wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like this using: +`--env ALE/Pong-v5 --num-gpus=4 --num-env-runners=95` ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... 
+| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 335.837 | ++---------------------------+------------+--------+------------------+... + +Note that the time to run these 200 iterations is about 5% shorter than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), because the wrapper-based setup incurs +additional network traffic (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option (all other options being equal), +the output looks like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 351.505 | ++---------------------------+------------+--------+------------------+... """ import gymnasium as gym @@ -27,12 +93,8 @@ parser = add_rllib_example_script_args( default_timesteps=5000000, default_reward=20.0, default_iters=200 ) -parser.add_argument( - "--atari-env", - type=str, - default="ALE/Pong-v5", - help="The name of the Atari env to run, e.g. `ALE/Breakout-v5`.", -) +# Use Pong by default. +parser.set_defaults(env="ALE/Pong-v5") parser.add_argument( "--num-frames", type=int, @@ -52,12 +114,16 @@ args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define our custom connector pipelines. def _make_env_to_module_connector(env): # Create the env-to-module connector. We return an individual connector piece - # here, which RLlib will then automatically integrate into a pipeline (and + # here, which RLlib automatically integrates into a pipeline (and # add its default connector piece to the end of that pipeline). - # This pipeline also automatically fixes the input- and output spaces of the + # The default pipeline automatically fixes the input- and output spaces of the # individual connector pieces in it. # Note that since the frame stacking connector does NOT write information # back to the episode (in order to save memory and network traffic), we @@ -79,29 +145,29 @@ def _make_learner_connector(input_observation_space, input_action_space): # We would like our frame stacking connector to do this job. def _env_creator(cfg): return wrap_atari_for_new_api_stack( - gym.make(args.atari_env, **cfg, **{"render_mode": "rgb_array"}), + gym.make(args.env, **cfg, **{"render_mode": "rgb_array"}), # Perform framestacking either through ConnectorV2 or right here through # the observation wrapper. framestack=( - args.num_framestack if args.use_gym_wrapper_framestacking else None + args.num_frames if args.use_gym_wrapper_framestacking else None ), ) if args.num_agents > 0: tune.register_env( - "env", + "atari-env", lambda cfg: make_multi_agent(_env_creator)( dict(cfg, **{"num_agents": args.num_agents}) ), ) else: - tune.register_env("env", _env_creator) + tune.register_env("atari-env", _env_creator) base_config = ( get_trainable_cls(args.algo) .get_default_config() .environment( - "env", + "atari-env", env_config={ # Make analogous to old v4 + NoFrameskip.
"frameskip": 1, @@ -135,9 +201,7 @@ def _env_creator(cfg): grad_clip=100.0, grad_clip_by="global_norm", ) - ) - if args.enable_new_api_stack: - base_config.rl_module( + .rl_module( model_config_dict=dict( { "vf_share_layers": True, @@ -148,16 +212,7 @@ def _env_creator(cfg): }, ) ) - else: - base_config.training( - model={ - "vf_share_layers": True, - "conv_filters": [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], - "conv_activation": "relu", - "post_fcnet_hiddens": [256], - "uses_new_env_runners": False, - } - ) + ) # Add a simple multi-agent setup. if args.num_agents > 0: diff --git a/rllib/examples/connectors/mean_std_filtering.py b/rllib/examples/connectors/mean_std_filtering.py index a30d6e399c00c..470812585138b 100644 --- a/rllib/examples/connectors/mean_std_filtering.py +++ b/rllib/examples/connectors/mean_std_filtering.py @@ -1,13 +1,81 @@ -from ray.air.constants import TRAINING_ITERATION +"""Example using a ConnectorV2 for processing observations with a mean/std filter. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `MeanStdFilter` ConnectorV2 piece can be added to the env-to-module + pipeline. + - demonstrates that using such a filter enhances learning behavior (or even makes + it possible to learn at all) in some environments, especially those with lopsided + observation spaces, for example `Box(-3000, -1000, ...)`. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging.
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +Running this example with the mean-std filter results in the normally expected Pendulum +learning behavior: ++-------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-------------------------------+------------+-----------------+--------+ +| PPO_lopsided-pend_f9c96_00000 | TERMINATED | 127.0.0.1:43612 | 77 | ++-------------------------------+------------+-----------------+--------+ ++------------------+------------------------+-----------------------+ +| total time (s) | num_env_steps_sample | episode_return_mean | +| | d_lifetime | | +|------------------+------------------------+-----------------------| +| 30.7466 | 40040 | -276.3 | ++------------------+------------------------+-----------------------+ + +If you try using the `--disable-mean-std-filter` (all other things being equal), you +will either see no learning progress at all (or a very slow one), but more likely some +numerical instability related error will be thrown: + +ValueError: Expected parameter loc (Tensor of shape (64, 1)) of distribution + Normal(loc: torch.Size([64, 1]), scale: torch.Size([64, 1])) to satisfy the + constraint Real(), but found invalid values: +tensor([[nan], + [nan], + [nan], + ... +""" +import gymnasium as gym +import numpy as np + from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import ( add_rllib_example_script_args, run_rllib_example_script_experiment, @@ -21,22 +89,43 @@ default_timesteps=500000, default_reward=-300.0, ) +parser.add_argument( + "--disable-mean-std-filter", + action="store_true", + help="Run w/o a mean/std env-to-module connector piece (filter).", +) + + +class LopsidedObs(gym.ObservationWrapper): + def __init__(self, env): + super().__init__(env) + self.observation_space = gym.spaces.Box(-4000.0, -1456.0, (3,), np.float32) + + def observation(self, observation): + # Lopside [-1.0, 1.0] Pendulum observations + return ((observation + 1.0) / 2.0) * (4000.0 - 1456.0) - 4000.0 if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Register our environment with tune. if args.num_agents > 0: register_env( - "env", + "lopsided-pend", lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), ) + else: + register_env("lopsided-pend", lambda _: LopsidedObs(gym.make("Pendulum-v1"))) config = ( get_trainable_cls(args.algo) .get_default_config() - .environment("env" if args.num_agents > 0 else "Pendulum-v1") + .environment("lopsided-pend") .env_runners( # TODO (sven): MAEnvRunner does not support vectorized envs yet # due to gym's env checkers and non-compatability with RLlib's @@ -48,7 +137,9 @@ # included in an automatically generated EnvToModulePipeline or return a # EnvToModulePipeline directly. 
env_to_module_connector=( - lambda env: MeanStdFilter(multi_agent=args.num_agents > 0) + None + if args.disable_mean_std_filter + else lambda env: MeanStdFilter(multi_agent=args.num_agents > 0) ), ) .training( @@ -61,25 +152,7 @@ vf_clip_param=10.0, vf_loss_coeff=0.01, ) - .evaluation( - evaluation_num_env_runners=1, - evaluation_parallel_to_training=True, - evaluation_interval=1, - evaluation_duration=10, - evaluation_duration_unit="episodes", - evaluation_config={ - "explore": False, - # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before - # each round of evaluation, broadcast the latest training - # EnvRunnerGroup's ConnectorV2 states (merged from all training remote - # EnvRunners) to the eval EnvRunnerGroup (and discard the eval - # EnvRunners' stats). - "use_worker_filter_stats": False, - }, - ) - ) - if args.enable_new_api_stack: - config = config.rl_module( + .rl_module( model_config_dict={ "fcnet_activation": "relu", "fcnet_weights_initializer": torch.nn.init.xavier_uniform_, @@ -88,17 +161,27 @@ "uses_new_env_runners": True, } ) - else: - config = config.training( - model=dict( - { - "fcnet_activation": "relu", - "fcnet_weights_initializer": torch.nn.init.xavier_uniform_, - "fcnet_bias_initializer": torch.nn.init.constant_, - "fcnet_bias_initializer_config": {"val": 0.0}, - } - ) - ) + # In case you would like to run with evaluation EnvRunners, make sure your + # `evaluation_config` key contains the `use_worker_filter_stats=False` setting + # (see below). This setting makes sure that the mean/std stats collected by the + # evaluation EnvRunners are NOT used for the training EnvRunners (unless you + # really want to mix these stats). It's normally a good idea to keep the stats + # collected during evaluation completely out of the training data (if only for + # better reproducibility). + # .evaluation( + # evaluation_num_env_runners=1, + # evaluation_interval=1, + # evaluation_config={ + # "explore": False, + # # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before + # # each round of evaluation, broadcast the latest training + # # EnvRunnerGroup's ConnectorV2 states (merged from all training remote + # # EnvRunners) to the eval EnvRunnerGroup (and discard the eval + # # EnvRunners' stats). + # "use_worker_filter_stats": False, + # }, + # ) + ) # Add a simple multi-agent setup.
if args.num_agents > 0: @@ -107,12 +190,4 @@ policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) - stop = { - TRAINING_ITERATION: args.stop_iters, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( - args.stop_reward - ), - NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, - } - - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/examples/connectors/nested_observation_spaces.py b/rllib/examples/connectors/nested_observation_spaces.py deleted file mode 100644 index 39a4bac1c585e..0000000000000 --- a/rllib/examples/connectors/nested_observation_spaces.py +++ /dev/null @@ -1,88 +0,0 @@ -from ray.tune.registry import register_env -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) -from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import ( - CartPoleWithDictObservationSpace, -) -from ray.rllib.examples.envs.classes.multi_agent import ( - MultiAgentCartPoleWithDictObservationSpace, -) -from ray.rllib.utils.test_utils import ( - add_rllib_example_script_args, - run_rllib_example_script_experiment, -) -from ray.tune.registry import get_trainable_cls - - -# Read in common example script command line arguments. -parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=400.0) - - -if __name__ == "__main__": - args = parser.parse_args() - - # Define env-to-module-connector pipeline for the new stack. - def _env_to_module_pipeline(env): - return [ - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=args.num_agents > 0), - WriteObservationsToEpisodes(), - ] - - # Register our environment with tune. - if args.num_agents > 0: - register_env( - "env", - lambda _: MultiAgentCartPoleWithDictObservationSpace( - config={"num_agents": args.num_agents} - ), - ) - else: - register_env("env", lambda _: CartPoleWithDictObservationSpace()) - - # Define the AlgorithmConfig used. - config = ( - get_trainable_cls(args.algo) - .get_default_config() - .environment("env") - .env_runners(env_to_module_connector=_env_to_module_pipeline) - .training( - gamma=0.99, - lr=0.0003, - ) - ) - if args.enable_new_api_stack: - config = config.rl_module( - model_config_dict={ - "fcnet_hiddens": [32], - "fcnet_activation": "linear", - "vf_share_layers": True, - "uses_new_env_runners": True, - }, - ) - else: - config = config.training( - model=dict( - fcnet_hiddens=[32], fcnet_activation="linear", vf_share_layers=True - ) - ) - - # Add a simple multi-agent setup. - if args.num_agents > 0: - config = config.multi_agent( - policies={f"p{i}" for i in range(args.num_agents)}, - policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", - ) - - # Fix some PPO-specific settings. - if args.algo == "PPO": - config = config.training( - num_sgd_iter=6, - vf_loss_coeff=0.01, - ) - - # Run everything as configured. - run_rllib_example_script_experiment(config, args) diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index 0c3a2693cca27..dcee6ac5689eb 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -1,11 +1,89 @@ +"""Example using a ConnectorV2 to add previous rewards/actions to an RLModule's input. 
+ +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `PrevActionsPrevRewards` ConnectorV2 piece can be added to the + env-to-module pipeline to extract previous rewards and/or actions from the ongoing + episodes. + - shows how this connector creates and wraps this new information (rewards and + actions) together with the original observations into the RLModule's input dict + under a new `gym.spaces.Dict` structure (for example, if your observation space + is `O=Box(shape=(3,))` and you add the most recent 1 reward, the new observation + space will be `Dict({"_orig_obs": O, "prev_n_rewards": Box(shape=())})`). + - demonstrates how to use RLlib's `FlattenObservations` right after the + `PrevActionsPrevRewards` to flatten that new dict observation structure again into + a single 1D tensor. + - uses the StatelessCartPole environment, a CartPole-v1 derivative that's missing + both x-veloc and angle-veloc observation components and is therefore non-Markovian + (only partially observable). An LSTM default model is used for training. Adding + the additional context to the observations (for example, prev. actions) helps the + LSTM to more quickly learn in this environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--n-prev-actions` and `--n-prev-rewards` options to define how many of the +most recent actions and rewards should be added to each observation. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging.
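To make the wrapping concrete, here is a rough, illustrative sketch (values made up; key names taken from the `PrevActionsPrevRewards` connector) of a single augmented observation as it is written back into an ongoing episode, assuming a StatelessCartPole observation, one previous action, and two previous rewards:

import numpy as np

# Illustrative only: a StatelessCartPole observation (x-pos and angle) augmented
# with the most recent action and the 2 most recent rewards. The connector writes
# a dict like this back into the episode as the new last observation.
augmented_obs = {
    "_orig_obs": np.array([0.03, -0.12], np.float32),    # original env observation
    "prev_n_actions": np.array([1]),                      # most recent (Discrete) action
    "prev_n_rewards": np.array([1.0, 1.0], np.float32),   # 2 most recent rewards
}
# The FlattenObservations piece that follows in the same env-to-module pipeline
# then flattens this dict into a single 1D tensor before it reaches the RLModule.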
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +You should see something similar to this in your terminal output when running +ths script as described above: + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_0edd2_00000 | TERMINATED | 127.0.0.1:12632 | 17 | 42.6898 | ++---------------------+------------+-----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | episode_return_mean | +| d_lifetime | d_lifetime | | +|------------------------+------------------------+------------------------| +| 68000 | 68000 | 205.22 | ++------------------------+------------------------+------------------------+ +""" import functools from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, FlattenObservations, - PrevActionsPrevRewardsConnector, - WriteObservationsToEpisodes, + PrevActionsPrevRewards, ) from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole @@ -29,18 +107,22 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define our custom connector pipelines. def _env_to_module(env): # Create the env-to-module connector pipeline. return [ - AddObservationsFromEpisodesToBatch(), - PrevActionsPrevRewardsConnector( + # AddObservationsFromEpisodesToBatch(), + PrevActionsPrevRewards( multi_agent=args.num_agents > 0, n_prev_rewards=args.n_prev_rewards, n_prev_actions=args.n_prev_actions, ), FlattenObservations(multi_agent=args.num_agents > 0), - WriteObservationsToEpisodes(), + # WriteObservationsToEpisodes(), ] # Register our environment with tune. @@ -64,10 +146,7 @@ def _env_to_module(env): train_batch_size=4000, vf_loss_coeff=0.01, ) - ) - - if args.enable_new_api_stack: - config = config.rl_module( + .rl_module( model_config_dict={ "use_lstm": True, "max_seq_len": 50, @@ -79,20 +158,7 @@ def _env_to_module(env): "uses_new_env_runners": True, } ) - else: - config = config.training( - model=dict( - { - "use_lstm": True, - "max_seq_len": 50, - "fcnet_hiddens": [32], - "fcnet_activation": "linear", - "vf_share_layers": True, - "fcnet_weights_initializer": nn.init.xavier_uniform_, - "fcnet_bias_initializer": functools.partial(nn.init.constant_, 0.0), - } - ) - ) + ) # Add a simple multi-agent setup. 
if args.num_agents > 0: diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 1e7ba0250ae08..f6b4e1ab7cf2b 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -59,11 +59,7 @@ from ray.air.constants import TRAINING_ITERATION from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, @@ -222,11 +218,7 @@ def on_train_result( ) .env_runners( num_envs_per_env_runner=5, - env_to_module_connector=lambda env: [ - AddObservationsFromEpisodesToBatch(), - FlattenObservations(), - WriteObservationsToEpisodes(), - ], + env_to_module_connector=lambda env: FlattenObservations(), ) ) diff --git a/rllib/examples/inference/policy_inference_after_training.py b/rllib/examples/inference/policy_inference_after_training.py index 0f61f4519cd7f..2525d5ca29354 100644 --- a/rllib/examples/inference/policy_inference_after_training.py +++ b/rllib/examples/inference/policy_inference_after_training.py @@ -4,13 +4,13 @@ from a checkpoint and a manual env-loop (CartPole-v1). No ConnectorV2s or EnvRunners are used in this example. -This example shows .. - - .. how to use an already existing checkpoint to extract a single-agent RLModule - from (our policy network). - - .. how to setup this recovered policy net for action computations (with or without - using exploration). - - .. have the policy run through a very simple gymnasium based env-loop, w/o using - RLlib's ConnectorV2s or EnvRunners. +This example: + - shows how to use an already existing checkpoint to extract a single-agent RLModule + (our policy network) from it. + - shows how to set up this recovered policy net for action computations (with or + without using exploration). + - shows how to run the policy through a very simple gymnasium based env-loop, w/o + using RLlib's ConnectorV2s or EnvRunners. How to run this script diff --git a/rllib/examples/inference/policy_inference_after_training_w_connector.py b/rllib/examples/inference/policy_inference_after_training_w_connector.py index 6d97ef61f8657..e4a66ec332660 100644 --- a/rllib/examples/inference/policy_inference_after_training_w_connector.py +++ b/rllib/examples/inference/policy_inference_after_training_w_connector.py @@ -6,14 +6,14 @@ The RLModule contains an LSTM that requires its own previous STATE_OUT as new input at every episode step to compute a new action. -This example shows .. - - .. how to use an already existing checkpoint to extract a single-agent RLModule - from (our policy network). - - .. how to setup this recovered policy net for action computations (with or without - using exploration). - - .. how to create a more complex env-loop in which the action-computing RLModule - requires its own previous state outputs as new input and how to use RLlib's Episode - APIs to achieve this. +This example: + - shows how to use an already existing checkpoint to extract a single-agent RLModule + (our policy network) from it. + - shows how to set up this recovered policy net for action computations (with or + without using exploration).
+ - shows how to create a more complex env-loop in which the action-computing RLModule + requires its own previous state outputs as new input and how to use RLlib's Episode + APIs to achieve this. How to run this script diff --git a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py index d503e7f23ad3d..1f7ad8dc238c5 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py @@ -33,11 +33,7 @@ from pettingzoo.classic import rps_v2 from ray.air.constants import TRAINING_ITERATION -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv @@ -89,10 +85,8 @@ .environment("RockPaperScissors") .env_runners( env_to_module_connector=lambda env: ( - AddObservationsFromEpisodesToBatch(), - # Only flatten obs for the learning RLModul + # `agent_ids=...`: Only flatten obs for the learning RLModule. FlattenObservations(multi_agent=True, agent_ids={"player_0"}), - WriteObservationsToEpisodes(), ), ) .multi_agent( diff --git a/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py index 507c018babc8f..e3e75c9906924 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py @@ -15,11 +15,7 @@ from pettingzoo.classic import rps_v2 -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv @@ -62,11 +58,7 @@ .get_default_config() .environment("RockPaperScissors") .env_runners( - env_to_module_connector=lambda env: ( - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=True), - WriteObservationsToEpisodes(), - ), + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), ) .multi_agent( policies={"p0", "p1"}, diff --git a/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py b/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py index afabd3fe90036..2c94358222905 100644 --- a/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py +++ b/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py @@ -40,11 +40,7 @@ +------------------+-------+-------------------+-------------+ """ -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.examples.envs.classes.two_step_game import TwoStepGameWithGroupedAgents @@ -76,11 +72,7 @@ 
.get_default_config() .environment("grouped_twostep") .env_runners( - env_to_module_connector=lambda env: ( - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=True), - WriteObservationsToEpisodes(), - ), + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), ) .multi_agent( policies={"p0"},