From cb0554025cffe646003b70e3661d6de60d0c07cd Mon Sep 17 00:00:00 2001
From: Sven Mika
Date: Tue, 2 Apr 2024 20:33:55 +0200
Subject: [PATCH] [RLlib] Cleanup examples folder #01. (#44067)

---
 .buildkite/rllib.rayci.yml | 11 -
 doc/source/ray-overview/getting-started.md | 2 +-
 .../source/rllib/doc_code}/custom_gym_env.py | 0
 .../rllib/doc_code}/replay_buffer_demo.py | 0
 .../rllib/doc_code}/rllib_on_ray_readme.py | 0
 .../saving_and_loading_algos_and_policies.py | 0
 doc/source/rllib/package_ref/env.rst | 6 +-
 doc/source/rllib/rllib-connector.rst | 4 +-
 doc/source/rllib/rllib-examples.rst | 56 +-
 doc/source/rllib/rllib-replay-buffers.rst | 10 +-
 ...-saving-and-loading-algos-and-policies.rst | 26 +-
 rllib/BUILD | 1174 ++++++++--------
 rllib/algorithms/algorithm.py | 11 +-
 rllib/algorithms/algorithm_config.py | 21 +-
 .../add_states_from_episodes_to_batch.py | 31 +-
 .../common/agent_to_module_mapping.py | 49 +-
 rllib/connectors/connector_v2.py | 115 +-
 .../env_to_module/flatten_observations.py | 61 +-
 .../normalize_and_clip_actions.py | 2 +-
 rllib/core/rl_module/marl_module.py | 27 +-
 rllib/core/rl_module/rl_module.py | 31 +-
 .../rl_module/tests/test_rl_module_specs.py | 4 +-
 rllib/env/multi_agent_env.py | 3 +-
 rllib/env/wrappers/pettingzoo_env.py | 20 +-
 .../env/wrappers/tests/test_kaggle_wrapper.py | 68 -
 .../env/wrappers/tests/test_recsim_wrapper.py | 46 -
 .../{env/tests => _old_api_stack}/__init__.py | 0
 .../attention_net_supervised.py | 76 ++
 .../_old_api_stack/complex_struct_space.py | 57 +
 .../connectors}/adapt_connector_policy.py | 2 +-
 .../connectors}/prepare_checkpoint.py | 0
 .../connectors}/run_connector_policy.py | 2 +-
 .../self_play_with_policy_checkpoint.py | 2 +-
 .../_old_api_stack/custom_keras_model.py | 153 +++
 .../parametric_actions_cartpole.py | 110 ++
 ...ons_cartpole_embeddings_learnt_by_model.py | 98 ++
 .../remote_base_env_with_custom_api.py | 147 +++
 ...e_envs_with_inference_done_on_main_node.py | 179 +++
 .../_old_api_stack/sb2rllib_rllib_example.py | 50 +
 .../_old_api_stack/sb2rllib_sb_example.py | 40 +
 .../_old_api_stack/two_trainer_workflow.py | 219 +++
 rllib/examples/attention_net_supervised.py | 80 +-
 rllib/examples/centralized_critic.py | 7 +-
 rllib/examples/centralized_critic_2.py | 6 +
 rllib/examples/complex_struct_space.py | 59 +-
 .../connectors/connector_v2_frame_stacking.py | 27 +-
 .../connector_v2_mean_std_filtering.py | 25 +-
 .../connector_v2_nested_action_spaces.py | 94 ++
 .../connector_v2_nested_observation_spaces.py | 33 +-
 .../connector_v2_prev_actions_prev_rewards.py | 30 +-
 rllib/examples/custom_eval.py | 207 +--
 rllib/examples/custom_keras_model.py | 157 +--
 rllib/examples/env/simple_corridor.py | 29 +-
 .../env/tests/test_cliff_walking_wall_env.py | 61 -
 .../test_coin_game_non_vectorized_env.py | 882 -------------
 .../tests/test_coin_game_vectorized_env.py | 906 -------------
 .../test_matrix_sequential_social_dilemma.py | 322 -----
 rllib/examples/env/tests/test_wrappers.py | 58 -
 rllib/examples/env/two_step_game.py | 9 +-
 .../examples/evaluation/custom_evaluation.py | 144 ++
 .../evaluation_parallel_to_training.py | 2 +-
 rllib/examples/gpu_training/__init__.py | 0
 .../learner/multi_agent_cartpole_ppo.py | 120 --
 .../multi_agent_and_self_play/__init__.py | 22 -
 .../custom_heuristic_policy.py | 100 ++
 .../different_spaces_for_agents.py | 116 ++
 .../multi_agent_cartpole.py | 66 +
 .../pettingzoo_independent_learning.py | 108 ++
 .../pettingzoo_parameter_sharing.py | 105 ++
 .../pettingzoo_shared_value_function.py | 7 +
...rock_paper_scissors_heuristic_vs_learnt.py | 147 +++ .../rock_paper_scissors_learnt_vs_learnt.py | 96 ++ .../self_play_league_based_with_open_spiel.py | 288 ++++ .../self_play_with_open_spiel.py | 241 ++++ .../two_step_game_with_grouped_agents.py | 98 ++ .../multi_agent_and_self_play/utils.py | 23 - .../utils/__init__.py | 45 + .../{ => utils}/self_play_callback.py | 0 .../self_play_callback_old_api_stack.py | 0 .../self_play_league_based_callback.py | 0 ...lay_league_based_callback_old_api_stack.py | 0 rllib/examples/multi_agent_cartpole.py | 130 +- rllib/examples/multi_agent_custom_policy.py | 119 +- ...multi_agent_different_spaces_for_agents.py | 167 +-- .../multi_agent_independent_learning.py | 64 +- .../examples/multi_agent_parameter_sharing.py | 59 +- rllib/examples/nested_action_spaces.py | 118 +- rllib/examples/parametric_actions_cartpole.py | 112 +- ...ons_cartpole_embeddings_learnt_by_model.py | 101 +- .../remote_base_env_with_custom_api.py | 149 +-- ...e_envs_with_inference_done_on_main_node.py | 181 +-- rllib/examples/rl_module/classes/__init__.py | 10 + .../rock_paper_scissors_heuristic_rlm.py | 108 ++ .../rock_paper_scissors_multiagent.py | 220 +-- rllib/examples/sb2rllib_rllib_example.py | 52 +- rllib/examples/sb2rllib_sb_example.py | 42 +- .../self_play_league_based_with_open_spiel.py | 290 +--- rllib/examples/self_play_with_open_spiel.py | 243 +--- rllib/examples/two_step_game.py | 128 +- rllib/examples/two_trainer_workflow.py | 221 +--- rllib/tests/test_dnc.py | 83 -- rllib/tests/test_nested_action_spaces.py | 142 -- rllib/tests/test_perf.py | 46 - rllib/utils/test_utils.py | 71 +- 104 files changed, 3973 insertions(+), 6516 deletions(-) rename {rllib/examples/documentation => doc/source/rllib/doc_code}/custom_gym_env.py (100%) rename {rllib/examples/documentation => doc/source/rllib/doc_code}/replay_buffer_demo.py (100%) rename {rllib/examples/documentation => doc/source/rllib/doc_code}/rllib_on_ray_readme.py (100%) rename {rllib/examples/documentation => doc/source/rllib/doc_code}/saving_and_loading_algos_and_policies.py (100%) delete mode 100644 rllib/env/wrappers/tests/test_kaggle_wrapper.py delete mode 100644 rllib/env/wrappers/tests/test_recsim_wrapper.py rename rllib/examples/{env/tests => _old_api_stack}/__init__.py (100%) create mode 100644 rllib/examples/_old_api_stack/attention_net_supervised.py create mode 100644 rllib/examples/_old_api_stack/complex_struct_space.py rename rllib/examples/{connectors/v1 => _old_api_stack/connectors}/adapt_connector_policy.py (98%) rename rllib/examples/{connectors/v1 => _old_api_stack/connectors}/prepare_checkpoint.py (100%) rename rllib/examples/{connectors/v1 => _old_api_stack/connectors}/run_connector_policy.py (95%) rename rllib/examples/{connectors/v1 => _old_api_stack/connectors}/self_play_with_policy_checkpoint.py (98%) create mode 100644 rllib/examples/_old_api_stack/custom_keras_model.py create mode 100644 rllib/examples/_old_api_stack/parametric_actions_cartpole.py create mode 100644 rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py create mode 100644 rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py create mode 100644 rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py create mode 100644 rllib/examples/_old_api_stack/sb2rllib_rllib_example.py create mode 100644 rllib/examples/_old_api_stack/sb2rllib_sb_example.py create mode 100644 rllib/examples/_old_api_stack/two_trainer_workflow.py create mode 100644 
rllib/examples/connectors/connector_v2_nested_action_spaces.py delete mode 100644 rllib/examples/env/tests/test_cliff_walking_wall_env.py delete mode 100644 rllib/examples/env/tests/test_coin_game_non_vectorized_env.py delete mode 100644 rllib/examples/env/tests/test_coin_game_vectorized_env.py delete mode 100644 rllib/examples/env/tests/test_matrix_sequential_social_dilemma.py delete mode 100644 rllib/examples/env/tests/test_wrappers.py create mode 100644 rllib/examples/evaluation/custom_evaluation.py create mode 100644 rllib/examples/gpu_training/__init__.py delete mode 100644 rllib/examples/learner/multi_agent_cartpole_ppo.py create mode 100644 rllib/examples/multi_agent_and_self_play/custom_heuristic_policy.py create mode 100644 rllib/examples/multi_agent_and_self_play/different_spaces_for_agents.py create mode 100644 rllib/examples/multi_agent_and_self_play/multi_agent_cartpole.py create mode 100644 rllib/examples/multi_agent_and_self_play/pettingzoo_independent_learning.py create mode 100644 rllib/examples/multi_agent_and_self_play/pettingzoo_parameter_sharing.py create mode 100644 rllib/examples/multi_agent_and_self_play/pettingzoo_shared_value_function.py create mode 100644 rllib/examples/multi_agent_and_self_play/rock_paper_scissors_heuristic_vs_learnt.py create mode 100644 rllib/examples/multi_agent_and_self_play/rock_paper_scissors_learnt_vs_learnt.py create mode 100644 rllib/examples/multi_agent_and_self_play/self_play_league_based_with_open_spiel.py create mode 100644 rllib/examples/multi_agent_and_self_play/self_play_with_open_spiel.py create mode 100644 rllib/examples/multi_agent_and_self_play/two_step_game_with_grouped_agents.py delete mode 100644 rllib/examples/multi_agent_and_self_play/utils.py create mode 100644 rllib/examples/multi_agent_and_self_play/utils/__init__.py rename rllib/examples/multi_agent_and_self_play/{ => utils}/self_play_callback.py (100%) rename rllib/examples/multi_agent_and_self_play/{ => utils}/self_play_callback_old_api_stack.py (100%) rename rllib/examples/multi_agent_and_self_play/{ => utils}/self_play_league_based_callback.py (100%) rename rllib/examples/multi_agent_and_self_play/{ => utils}/self_play_league_based_callback_old_api_stack.py (100%) create mode 100644 rllib/examples/rl_module/classes/__init__.py create mode 100644 rllib/examples/rl_module/classes/rock_paper_scissors_heuristic_rlm.py delete mode 100644 rllib/tests/test_dnc.py delete mode 100644 rllib/tests/test_nested_action_spaces.py delete mode 100644 rllib/tests/test_perf.py diff --git a/.buildkite/rllib.rayci.yml b/.buildkite/rllib.rayci.yml index 00548a3233485..cccc4eb600a13 100644 --- a/.buildkite/rllib.rayci.yml +++ b/.buildkite/rllib.rayci.yml @@ -106,17 +106,6 @@ steps: --test-env=RLLIB_NUM_GPUS=1 depends_on: rllibgpubuild - - label: ":brain: rllib: rlmodule tests" - tags: rllib_directly - instance_type: large - commands: - - bazel run //ci/ray_ci:test_in_docker -- //rllib/... 
rllib - --parallelism-per-worker 3 - --only-tags rlm - --test-env RLLIB_ENABLE_RL_MODULE=1 - --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - depends_on: rllibbuild - - label: ":brain: rllib: data tests" if: build.branch != "master" tags: data diff --git a/doc/source/ray-overview/getting-started.md b/doc/source/ray-overview/getting-started.md index af7e053306db7..f607f857f1553 100644 --- a/doc/source/ray-overview/getting-started.md +++ b/doc/source/ray-overview/getting-started.md @@ -303,7 +303,7 @@ pip install -U "ray[rllib]" tensorflow # or torch ``` ```` -```{literalinclude} ../../../rllib/examples/documentation/rllib_on_ray_readme.py +```{literalinclude} ../rllib/doc_code/rllib_on_ray_readme.py :end-before: __quick_start_end__ :language: python :start-after: __quick_start_begin__ diff --git a/rllib/examples/documentation/custom_gym_env.py b/doc/source/rllib/doc_code/custom_gym_env.py similarity index 100% rename from rllib/examples/documentation/custom_gym_env.py rename to doc/source/rllib/doc_code/custom_gym_env.py diff --git a/rllib/examples/documentation/replay_buffer_demo.py b/doc/source/rllib/doc_code/replay_buffer_demo.py similarity index 100% rename from rllib/examples/documentation/replay_buffer_demo.py rename to doc/source/rllib/doc_code/replay_buffer_demo.py diff --git a/rllib/examples/documentation/rllib_on_ray_readme.py b/doc/source/rllib/doc_code/rllib_on_ray_readme.py similarity index 100% rename from rllib/examples/documentation/rllib_on_ray_readme.py rename to doc/source/rllib/doc_code/rllib_on_ray_readme.py diff --git a/rllib/examples/documentation/saving_and_loading_algos_and_policies.py b/doc/source/rllib/doc_code/saving_and_loading_algos_and_policies.py similarity index 100% rename from rllib/examples/documentation/saving_and_loading_algos_and_policies.py rename to doc/source/rllib/doc_code/saving_and_loading_algos_and_policies.py diff --git a/doc/source/rllib/package_ref/env.rst b/doc/source/rllib/package_ref/env.rst index 6d61714261bf3..856ac68cbcefc 100644 --- a/doc/source/rllib/package_ref/env.rst +++ b/doc/source/rllib/package_ref/env.rst @@ -29,11 +29,11 @@ For example, if you provide a custom `gym.Env `_ Here is a simple example: -.. literalinclude:: ../../../../rllib/examples/documentation/custom_gym_env.py +.. literalinclude:: ../doc_code/custom_gym_env.py :language: python -.. start-after: __sphinx_doc_model_construct_1_begin__ -.. end-before: __sphinx_doc_model_construct_1_end__ +.. start-after: __rllib-custom-gym-env-begin__ +.. end-before: __rllib-custom-gym-env-end__ However, you may also conveniently sub-class any of the other supported RLlib-specific environment types. The automated paths from those env types (or callables returning instances of those types) to diff --git a/doc/source/rllib/rllib-connector.rst b/doc/source/rllib/rllib-connector.rst index 9b73824f3dc3b..86b7652dbbdb6 100644 --- a/doc/source/rllib/rllib-connector.rst +++ b/doc/source/rllib/rllib-connector.rst @@ -236,7 +236,7 @@ With connectors essentially checkpointing all the transformations used during tr policies can be easily restored without the original algorithm for local inference, as demonstrated by the following Cartpole example: -.. literalinclude:: ../../../rllib/examples/connectors/v1/run_connector_policy.py +.. literalinclude:: ../../../rllib/examples/_old_api_stack/connectors/run_connector_policy.py :language: python :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ @@ -255,7 +255,7 @@ different environments to work together at the same time. 
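For orientation, a hedged sketch of the plain restore-and-run loop that these connector examples build on. The checkpoint path below is a placeholder, and the use of ``Policy.from_checkpoint`` together with ``compute_single_action`` is an illustrative assumption rather than an excerpt from the linked scripts.

.. code-block:: python

    # Minimal local-inference sketch (placeholder checkpoint path, CartPole policy assumed).
    import gymnasium as gym

    from ray.rllib.policy.policy import Policy

    # Restore a single policy from an Algorithm checkpoint's "policies/" sub-directory.
    policy = Policy.from_checkpoint("/tmp/my_checkpoint/policies/default_policy")

    env = gym.make("CartPole-v1")
    obs, _ = env.reset()
    terminated = truncated = False
    episode_return = 0.0
    while not (terminated or truncated):
        # compute_single_action() returns (action, rnn_state_outs, extra_fetches).
        action, _, _ = policy.compute_single_action(obs, explore=False)
        obs, reward, terminated, truncated, _ = env.step(action)
        episode_return += reward
    print("Episode return:", episode_return)

The same loop works for any single-agent Gymnasium environment whose observation and action spaces match the restored policy.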
Here is an example demonstrating adaptation of a policy trained for the standard Cartpole environment for a new mock Cartpole environment that returns additional features and requires extra action inputs. -.. literalinclude:: ../../../rllib/examples/connectors/v1/adapt_connector_policy.py +.. literalinclude:: ../../../rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py :language: python :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ diff --git a/doc/source/rllib/rllib-examples.rst b/doc/source/rllib/rllib-examples.rst index 7b3dd7d42e0df..9bb556ed19cbf 100644 --- a/doc/source/rllib/rllib-examples.rst +++ b/doc/source/rllib/rllib-examples.rst @@ -14,23 +14,8 @@ Tuned Examples -------------- - `Tuned examples `__: - Collection of tuned hyperparameters by algorithm. -- `MuJoCo and Atari benchmarks `__: - Collection of reasonably optimized Atari and MuJoCo results. + Collection of tuned hyperparameters sorted by algorithm. -Blog Posts ----------- - -- `Attention Nets and More with RLlib’s Trajectory View API `__: - This blog describes RLlib's new "trajectory view API" and how it enables implementations of GTrXL (attention net) architectures. -- `Reinforcement Learning with RLlib in the Unity Game Engine `__: - A how-to on connecting RLlib with the Unity3D game engine for running visual- and physics-based RL experiments. -- `Lessons from Implementing 12 Deep RL Algorithms in TF and PyTorch `__: - Discussion on how we ported 12 of RLlib's algorithms from TensorFlow to PyTorch and what we learnt on the way. -- `Scaling Multi-Agent Reinforcement Learning `__: - This blog post is a brief tutorial on multi-agent RL and its design in RLlib. -- `Functional RL with Keras and TensorFlow Eager `__: - Exploration of a functional paradigm for implementing reinforcement learning (RL) algorithms. Environments and Adapters ------------------------- @@ -47,7 +32,7 @@ Environments and Adapters Custom- and Complex Models -------------------------- -- `Custom Keras model `__: +- `Custom Keras model `__: Example of using a custom Keras model. - `Registering a custom model with supervised loss `__: Example of defining and registering a custom model with a supervised loss. @@ -83,9 +68,9 @@ Training Workflows Evaluation: ----------- -- `Custom evaluation function `__: +- `Custom evaluation function `__: Example of how to write a custom evaluation function that is called instead of the default behavior, which is running with the evaluation worker set through n episodes. -- `Parallel evaluation and training `__: +- `Parallel evaluation and training `__: Example showing how the evaluation workers and the "normal" rollout workers can run (to some extend) in parallel to speed up training. @@ -113,13 +98,13 @@ Serving and Offline Multi-Agent and Hierarchical ---------------------------- -- `Simple independent multi-agent setup vs a PettingZoo env `__: +- `Simple independent multi-agent setup vs a PettingZoo env `__: Setup RLlib to run any algorithm in (independent) multi-agent mode against a multi-agent environment. -- `More complex (shared-parameter) multi-agent setup vs a PettingZoo env `__: +- `More complex (shared-parameter) multi-agent setup vs a PettingZoo env `__: Setup RLlib to run any algorithm in (shared-parameter) multi-agent mode against a multi-agent environment. -- `Rock-paper-scissors `__: +- `Rock-paper-scissors `__: Example of different heuristic and learned policies competing against each other in rock-paper-scissors. 
-- `Two-step game `__: +- `Two-step game `__: Example of the two-step game from the `QMIX paper `__. - `PettingZoo multi-agent example `__: Example on how to use RLlib to learn in `PettingZoo `__ multi-agent environments. @@ -127,9 +112,9 @@ Multi-Agent and Hierarchical Example of customizing PPO to leverage a centralized value function. - `Centralized critic in the env `__: A simpler method of implementing a centralized critic by augmentating agent observations with global information. -- `Hand-coded policy `__: +- `Hand-coded policy `__: Example of running a custom hand-coded policy alongside trainable policies. -- `Weight sharing between policies `__: +- `Weight sharing between policies `__: Example of how to define weight-sharing layers between two different policies. - `Multiple algorithms `__: Example of alternating training between DQN and PPO. @@ -140,11 +125,11 @@ Multi-Agent and Hierarchical Special Action- and Observation Spaces -------------------------------------- -- `Nested action spaces `__: +- `Nested action spaces `__: Learning in arbitrarily nested action spaces. -- `Parametric actions `__: +- `Parametric actions `__: Example of how to handle variable-length or parametric action spaces. -- `Using the "Repeated" space of RLlib for variable lengths observations `__: +- `Using the "Repeated" space of RLlib for variable lengths observations `__: How to use RLlib's `Repeated` space to handle variable length observations. - `Autoregressive action distribution example `__: Learning with auto-regressive action dependencies (e.g. 2 action components; distribution for 2nd component depends on the 1st component's actually sampled value). @@ -185,3 +170,18 @@ Community Examples Example of training in StarCraft2 maps with RLlib / multi-agent. - `Traffic Flow `__: Example of optimizing mixed-autonomy traffic simulations with RLlib / multi-agent. + + +Blog Posts +---------- + +- `Attention Nets and More with RLlib’s Trajectory View API `__: + Blog describing RLlib's new "trajectory view API" and how it enables implementations of GTrXL (attention net) architectures. +- `Reinforcement Learning with RLlib in the Unity Game Engine `__: + How-To guide about connecting RLlib with the Unity3D game engine for running visual- and physics-based RL experiments. +- `Lessons from Implementing 12 Deep RL Algorithms in TF and PyTorch `__: + Discussion on how the Ray Team ported 12 of RLlib's algorithms from TensorFlow to PyTorch and the lessons learned. +- `Scaling Multi-Agent Reinforcement Learning `__: + Blog post of a brief tutorial on multi-agent RL and its design in RLlib. +- `Functional RL with Keras and TensorFlow Eager `__: + Exploration of a functional paradigm for implementing reinforcement learning (RL) algorithms. diff --git a/doc/source/rllib/rllib-replay-buffers.rst b/doc/source/rllib/rllib-replay-buffers.rst index 8a70e20a80af4..dd7b38105a470 100644 --- a/doc/source/rllib/rllib-replay-buffers.rst +++ b/doc/source/rllib/rllib-replay-buffers.rst @@ -71,7 +71,7 @@ Here are three ways of specifying a type: .. dropdown:: **Changing a replay buffer configuration** :animate: fade-in-slide-down - .. literalinclude:: ../../../rllib/examples/documentation/replay_buffer_demo.py + .. 
literalinclude:: doc_code/replay_buffer_demo.py :language: python :start-after: __sphinx_doc_replay_buffer_type_specification__begin__ :end-before: __sphinx_doc_replay_buffer_type_specification__end__ @@ -102,7 +102,7 @@ Advanced buffer types add functionality while trying to retain compatibility thr The following is an example of the most basic scheme of interaction with a :py:class:`~ray.rllib.utils.replay_buffers.replay_buffer.ReplayBuffer`. -.. literalinclude:: ../../../rllib/examples/documentation/replay_buffer_demo.py +.. literalinclude:: doc_code/replay_buffer_demo.py :language: python :start-after: __sphinx_doc_replay_buffer_basic_interaction__begin__ :end-before: __sphinx_doc_replay_buffer_basic_interaction__end__ @@ -113,7 +113,7 @@ Building your own ReplayBuffer Here is an example of how to implement your own toy example of a ReplayBuffer class and make SimpleQ use it: -.. literalinclude:: ../../../rllib/examples/documentation/replay_buffer_demo.py +.. literalinclude:: doc_code/replay_buffer_demo.py :language: python :start-after: __sphinx_doc_replay_buffer_own_buffer__begin__ :end-before: __sphinx_doc_replay_buffer_own_buffer__end__ @@ -132,7 +132,7 @@ When later calling the ``sample()`` method, num_items will relate to said storag Here is a full example of how to modify the storage_unit and interact with a custom buffer: -.. literalinclude:: ../../../rllib/examples/documentation/replay_buffer_demo.py +.. literalinclude:: doc_code/replay_buffer_demo.py :language: python :start-after: __sphinx_doc_replay_buffer_advanced_usage_storage_unit__begin__ :end-before: __sphinx_doc_replay_buffer_advanced_usage_storage_unit__end__ @@ -145,7 +145,7 @@ the same way as the parent's config. Here is an example of how to create an :py:class:`~ray.rllib.utils.replay_buffers.multi_agent_replay_buffer.MultiAgentReplayBuffer` with an alternative underlying :py:class:`~ray.rllib.utils.replay_buffers.replay_buffer.ReplayBuffer`. The :py:class:`~ray.rllib.utils.replay_buffers.multi_agent_replay_buffer.MultiAgentReplayBuffer` can stay the same. We only need to specify our own buffer along with a default call argument: -.. literalinclude:: ../../../rllib/examples/documentation/replay_buffer_demo.py +.. literalinclude:: doc_code/replay_buffer_demo.py :language: python :start-after: __sphinx_doc_replay_buffer_advanced_usage_underlying_buffers__begin__ :end-before: __sphinx_doc_replay_buffer_advanced_usage_underlying_buffers__end__ diff --git a/doc/source/rllib/rllib-saving-and-loading-algos-and-policies.rst b/doc/source/rllib/rllib-saving-and-loading-algos-and-policies.rst index 3535e09040305..1e47423459411 100644 --- a/doc/source/rllib/rllib-saving-and-loading-algos-and-policies.rst +++ b/doc/source/rllib/rllib-saving-and-loading-algos-and-policies.rst @@ -57,7 +57,7 @@ The :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` ``save()`` method crea Let's take a look at a simple example on how to create such an Algorithm checkpoint: -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __create-algo-checkpoint-begin__ :end-before: __create-algo-checkpoint-end__ @@ -171,7 +171,7 @@ Given our checkpoint path (returned by ``Algorithm.save()``), we can now create a completely new Algorithm instance and make it the exact same as the one we had stopped (and could thus no longer use) in the example above: -.. 
literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __restore-from-algo-checkpoint-begin__ :end-before: __restore-from-algo-checkpoint-end__ @@ -181,7 +181,7 @@ Alternatively, you could also first create a new Algorithm instance using the same config that you used for the original algo, and only then call the new Algorithm's ``restore()`` method, passing it the checkpoint directory: -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __restore-from-algo-checkpoint-2-begin__ :end-before: __restore-from-algo-checkpoint-2-end__ @@ -222,7 +222,7 @@ inside the sub-directory ``policies/``. For example: -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __multi-agent-checkpoints-begin__ :end-before: __multi-agent-checkpoints-end__ @@ -237,7 +237,7 @@ However, there may be a situation where you have so many policies in your algori instance from your checkpoint, but only include some of the original policies in this new Algorithm object. In this case, you can also do: -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __multi-agent-checkpoints-restore-policy-sub-set-begin__ :end-before: __multi-agent-checkpoints-restore-policy-sub-set-end__ @@ -283,7 +283,7 @@ described above or - if you need more fine-grained control - by doing the follow -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __create-policy-checkpoint-begin__ :end-before: __create-policy-checkpoint-end__ @@ -316,7 +316,7 @@ contains all its Policies' checkpoints. Here is how you can do this: -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __restore-policy-begin__ :end-before: __restore-policy-end__ @@ -341,7 +341,7 @@ You can use the original checkpoint (with the 100 policies in it) and the This example here shows this for five original policies that you would like reduce to two policies: -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __restore-algorithm-from-checkpoint-with-fewer-policies-begin__ :end-before: __restore-algorithm-from-checkpoint-with-fewer-policies-end__ @@ -370,7 +370,7 @@ There are several ways of creating Keras- or PyTorch native model "exports". Here is the example code that illustrates these: -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __export-models-begin__ :end-before: __export-models-end__ @@ -380,21 +380,21 @@ to disk ... 1) Using the Policy object: -.. 
literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __export-models-1-begin__ :end-before: __export-models-1-end__ 2) Via the Policy's checkpointing method: -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __export-models-2-begin__ :end-before: __export-models-2-end__ 3) Via the Algorithm (Policy) checkpoint: -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __export-models-3-begin__ :end-before: __export-models-3-end__ @@ -408,7 +408,7 @@ RLlib also supports exporting your NN models in the ONNX format. For that, use t extra ``onnx`` arg as follows: -.. literalinclude:: ../../../rllib/examples/documentation/saving_and_loading_algos_and_policies.py +.. literalinclude:: doc_code/saving_and_loading_algos_and_policies.py :language: python :start-after: __export-models-as-onnx-begin__ :end-before: __export-models-as-onnx-end__ diff --git a/rllib/BUILD b/rllib/BUILD index 174f03c927e91..0dd94ce091483 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -941,13 +941,6 @@ py_test( srcs = ["env/wrappers/tests/test_group_agents_wrapper.py"] ) -py_test( - name = "env/wrappers/tests/test_recsim_wrapper", - tags = ["team:rllib", "env"], - size = "small", - srcs = ["env/wrappers/tests/test_recsim_wrapper.py"] -) - py_test( name = "env/wrappers/tests/test_unity3d_env", tags = ["team:rllib", "env"], @@ -1782,14 +1775,6 @@ py_test( srcs = ["tests/test_model_imports.py"] ) -py_test( - name = "tests/test_nested_action_spaces", - main = "tests/test_nested_action_spaces.py", - tags = ["team:rllib", "tests_dir"], - size = "large", - srcs = ["tests/test_nested_action_spaces.py"] -) - py_test( name = "tests/test_nested_observation_spaces", main = "tests/test_nested_observation_spaces.py", @@ -1966,95 +1951,123 @@ py_test( ) # -------------------------------------------------------------------- -# examples/ directory (excluding examples/documentation/...) +# examples/ directory # # Tag: examples # # NOTE: Add tests alphabetically into this list. 
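A hedged sketch summarizing the checkpoint save-and-restore cycle that the saving-and-loading docs above describe. ``PPOConfig``, ``CartPole-v1``, and the checkpoint handling below are illustrative assumptions rather than content from the ``doc_code`` files, and the exact return type of ``Algorithm.save()`` (plain path vs. a result object wrapping the checkpoint) varies across Ray versions.

.. code-block:: python

    # Assumed minimal save-and-restore cycle; not taken from the doc_code scripts.
    from ray.rllib.algorithms.algorithm import Algorithm
    from ray.rllib.algorithms.ppo import PPOConfig

    algo = PPOConfig().environment("CartPole-v1").build()
    algo.train()

    # Depending on the Ray version, save() returns a path string or a result
    # object carrying the checkpoint; both forms are accepted below.
    saved = algo.save()
    checkpoint = getattr(saved, "checkpoint", saved)
    algo.stop()

    # Recreate an equivalent Algorithm purely from the checkpoint and keep training.
    restored = Algorithm.from_checkpoint(checkpoint)
    restored.train()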
# -------------------------------------------------------------------- +# ---------------------- +# Old API stack examples +# ---------------------- +# subdirectory: _old_api_stack/ py_test( - name = "examples/action_masking_tf2", - main = "examples/action_masking.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/_old_api_stack/complex_struct_space_tf", + main = "examples/_old_api_stack/complex_struct_space.py", + tags = ["team:rllib", "exclusive", "examples", "old_api_stack"], size = "small", - srcs = ["examples/action_masking.py"], - args = ["--stop-iter=2", "--framework=tf2"] + srcs = ["examples/_old_api_stack/complex_struct_space.py"], + args = ["--framework=tf"], ) py_test( - name = "examples/action_masking_torch", - main = "examples/action_masking.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/_old_api_stack/complex_struct_space_tf_eager", + main = "examples/_old_api_stack/complex_struct_space.py", + tags = ["team:rllib", "exclusive", "examples", "old_api_stack"], size = "small", - srcs = ["examples/action_masking.py"], - args = ["--stop-iter=2", "--framework=torch"] + srcs = ["examples/_old_api_stack/complex_struct_space.py"], + args = ["--framework=tf2"], ) py_test( - name = "examples/autoregressive_action_dist_tf", - main = "examples/autoregressive_action_dist.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/autoregressive_action_dist.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=150", "--num-cpus=4"] + name = "examples/_old_api_stack/complex_struct_space_torch", + main = "examples/_old_api_stack/complex_struct_space.py", + tags = ["team:rllib", "exclusive", "examples", "old_api_stack"], + size = "small", + srcs = ["examples/_old_api_stack/complex_struct_space.py"], + args = ["--framework=torch"], ) py_test( - name = "examples/autoregressive_action_dist_torch", - main = "examples/autoregressive_action_dist.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/_old_api_stack/parametric_actions_cartpole_dqn_tf", + main = "examples/_old_api_stack/parametric_actions_cartpole.py", + tags = ["team:rllib", "exclusive", "examples", "old_api_stack"], size = "medium", - srcs = ["examples/autoregressive_action_dist.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=150", "--num-cpus=4"] + srcs = ["examples/_old_api_stack/parametric_actions_cartpole.py"], + args = ["--as-test", "--framework=tf", "--stop-reward=60.0", "--run=DQN"] ) py_test( - name = "examples/cartpole_lstm_impala_tf2", - main = "examples/cartpole_lstm.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/_old_api_stack/parametric_actions_cartpole_dqn_torch", + main = "examples/_old_api_stack/parametric_actions_cartpole.py", + tags = ["team:rllib", "exclusive", "examples", "old_api_stack"], size = "medium", - srcs = ["examples/cartpole_lstm.py"], - args = ["--run=IMPALA", "--as-test", "--framework=tf2", "--stop-reward=28", "--num-cpus=4"] + srcs = ["examples/_old_api_stack/parametric_actions_cartpole.py"], + args = ["--as-test", "--framework=torch", "--stop-reward=60.0", "--run=DQN"] ) py_test( - name = "examples/cartpole_lstm_impala_torch", - main = "examples/cartpole_lstm.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model", + main = "examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py", + tags = ["team:rllib", "exclusive", "examples", 
"old_api_stack"], size = "medium", - srcs = ["examples/cartpole_lstm.py"], - args = ["--run=IMPALA", "--as-test", "--framework=torch", "--stop-reward=28", "--num-cpus=4"] + srcs = ["examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py"], + args = ["--as-test", "--stop-reward=80.0"] ) -# TODO (Kourosh): tf2 ~5x slower compared to torch on the new stack py_test( - name = "examples/cartpole_lstm_ppo_tf2", - main = "examples/cartpole_lstm.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/cartpole_lstm.py"], - args = ["--run=PPO", "--as-test", "--framework=tf2", "--stop-reward=28", "--num-cpus=4"] + name = "examples/_old_api_stack/two_trainer_workflow_tf", + main = "examples/_old_api_stack/two_trainer_workflow.py", + tags = ["team:rllib", "exclusive", "examples", "old_api_stack"], + size = "medium", + srcs = ["examples/_old_api_stack/two_trainer_workflow.py"], + args = ["--as-test", "--stop-reward=450.0"] ) py_test( - name = "examples/cartpole_lstm_ppo_torch", - main = "examples/cartpole_lstm.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/_old_api_stack/two_trainer_workflow_torch", + main = "examples/_old_api_stack/two_trainer_workflow.py", + tags = ["team:rllib", "exclusive", "examples", "old_api_stack"], size = "medium", - srcs = ["examples/cartpole_lstm.py"], - args = ["--run=PPO", "--as-test", "--framework=torch", "--stop-reward=28", "--num-cpus=4"] + srcs = ["examples/_old_api_stack/two_trainer_workflow.py"], + args = ["--as-test", "--torch", "--stop-reward=450.0"] ) +# subdirectory: _old_api_stack/connectors/ py_test( - name = "examples/cartpole_lstm_ppo_torch_with_prev_a_and_r", - main = "examples/cartpole_lstm.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/cartpole_lstm.py"], - args = ["--run=PPO", "--as-test", "--framework=torch", "--stop-reward=28", "--num-cpus=4", "--use-prev-action", "--use-prev-reward"] + name = "examples/_old_api_stack/connectors/run_connector_policy", + main = "examples/_old_api_stack/connectors/run_connector_policy.py", + tags = ["team:rllib", "exclusive", "examples", "old_api_stack"], + size = "small", + srcs = ["examples/_old_api_stack/connectors/run_connector_policy.py"], +) + +py_test( + name = "examples/_old_api_stack/connectors/adapt_connector_policy", + main = "examples/_old_api_stack/connectors/adapt_connector_policy.py", + tags = ["team:rllib", "exclusive", "examples", "old_api_stack"], + size = "small", + srcs = ["examples/_old_api_stack/connectors/adapt_connector_policy.py"], +) + +py_test( + name = "examples/_old_api_stack/connectors/self_play_with_policy_checkpoint", + main = "examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py", + tags = ["team:rllib", "exclusive", "examples", "old_api_stack"], + size = "small", + srcs = ["examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py"], + args = [ + "--train_iteration=1" # Smoke test. + ] ) +# ---------------------- +# New API stack +# ---------------------- + +# subdirectory: catalog/ +# .................................... py_test( name = "examples/catalog/custom_action_distribution", main = "examples/catalog/custom_action_distribution.py", @@ -2071,909 +2084,828 @@ py_test( srcs = ["examples/catalog/mobilenet_v2_encoder.py"], ) +# subdirectory: connectors/ +# .................................... +# Framestacking examples only run in smoke-test mode (a few iters only). 
py_test( - name = "examples/rl_module/mobilenet_rlm", - main = "examples/rl_module/mobilenet_rlm.py", - tags = ["team:rllib", "examples", "no_main"], - size = "small", - srcs = ["examples/rl_module/mobilenet_rlm.py"], + name = "examples/connectors/connector_v2_frame_stacking_ppo", + main = "examples/connectors/connector_v2_frame_stacking.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "medium", + srcs = ["examples/connectors/connector_v2_frame_stacking.py"], + args = ["--enable-new-api-stack", "--stop-iter=2", "--framework=torch", "--algo=PPO"] ) py_test( - name = "examples/centralized_critic_tf", - main = "examples/centralized_critic.py", + name = "examples/connectors/connector_v2_frame_stacking_multi_agent_ppo", + main = "examples/connectors/connector_v2_frame_stacking.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/centralized_critic.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=7.2"] + srcs = ["examples/connectors/connector_v2_frame_stacking.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--stop-iter=2", "--framework=torch", "--algo=PPO", "--num-env-runners=4", "--num-cpus=6"] ) +# Nested action spaces (flattening obs and learning w/ multi-action distribution). py_test( - name = "examples/centralized_critic_torch", - main = "examples/centralized_critic.py", + name = "examples/connectors/connector_v2_nested_action_spaces_ppo", + main = "examples/connectors/connector_v2_nested_action_spaces.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/centralized_critic.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=7.2"] + size = "large", + srcs = ["examples/connectors/connector_v2_nested_action_spaces.py"], + args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--stop-reward=-500.0", "--algo=PPO"] ) py_test( - name = "examples/centralized_critic_2_tf", - main = "examples/centralized_critic_2.py", + name = "examples/connectors/connector_v2_nested_action_spaces_multi_agent_ppo", + main = "examples/connectors/connector_v2_nested_action_spaces.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/centralized_critic_2.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=6.0"] + size = "large", + srcs = ["examples/connectors/connector_v2_nested_action_spaces.py"], + args = ["--enable-new-api-stack", "--as-test", "--num-agents=2", "--framework=torch", "--stop-reward=-1000.0", "--algo=PPO"] ) +# Nested observation spaces (flattening). 
py_test( - name = "examples/centralized_critic_2_torch", - main = "examples/centralized_critic_2.py", + name = "examples/connectors/connector_v2_nested_observation_spaces_ppo", + main = "examples/connectors/connector_v2_nested_observation_spaces.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/centralized_critic_2.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=6.0"] + srcs = ["examples/connectors/connector_v2_nested_observation_spaces.py"], + args = ["--enable-new-api-stack", "--as-test", "--stop-reward=400.0", "--framework=torch", "--algo=PPO"] ) py_test( - name = "examples/checkpoint_by_custom_criteria", - main = "examples/checkpoint_by_custom_criteria.py", + name = "examples/connectors/connector_v2_nested_observation_spaces_multi_agent_ppo", + main = "examples/connectors/connector_v2_nested_observation_spaces.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/checkpoint_by_custom_criteria.py"], - args = ["--stop-iters=3 --num-cpus=3"] + srcs = ["examples/connectors/connector_v2_nested_observation_spaces.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=800.0", "--framework=torch", "--algo=PPO"] ) +# Prev-r/prev actions + LSTM example. py_test( - name = "examples/complex_struct_space_tf", - main = "examples/complex_struct_space.py", + name = "examples/connectors/connector_v2_prev_actions_prev_rewards_ppo", + main = "examples/connectors/connector_v2_prev_actions_prev_rewards.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/complex_struct_space.py"], - args = ["--framework=tf"], + size = "large", + srcs = ["examples/connectors/connector_v2_prev_actions_prev_rewards.py"], + args = ["--enable-new-api-stack", "--as-test", "--stop-reward=200.0", "--framework=torch", "--algo=PPO", "--num-env-runners=4", "--num-cpus=6"] ) py_test( - name = "examples/complex_struct_space_tf_eager", - main = "examples/complex_struct_space.py", + name = "examples/connectors/connector_v2_prev_actions_prev_rewards_multi_agent_ppo", + main = "examples/connectors/connector_v2_prev_actions_prev_rewards.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/complex_struct_space.py"], - args = ["--framework=tf2"], + size = "large", + srcs = ["examples/connectors/connector_v2_prev_actions_prev_rewards.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=400.0", "--framework=torch", "--algo=PPO", "--num-env-runners=4", "--num-cpus=6"] ) +# MeanStd filtering example. 
py_test( - name = "examples/complex_struct_space_torch", - main = "examples/complex_struct_space.py", + name = "examples/connectors/connector_v2_mean_std_filtering_ppo", + main = "examples/connectors/connector_v2_mean_std_filtering.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/complex_struct_space.py"], - args = ["--framework=torch"], + size = "medium", + srcs = ["examples/connectors/connector_v2_mean_std_filtering.py"], + args = ["--enable-new-api-stack", "--as-test", "--stop-reward=-300.0", "--framework=torch", "--algo=PPO", "--num-env-runners=2"] ) py_test( - name = "examples/curriculum_learning", - main = "examples/curriculum_learning.py", + name = "examples/connectors/connector_v2_mean_std_filtering_multi_agent_ppo", + main = "examples/connectors/connector_v2_mean_std_filtering.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/curriculum_learning.py"], - args = ["--as-test", "--stop-reward=800.0"] + srcs = ["examples/connectors/connector_v2_mean_std_filtering.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=-600.0", "--framework=torch", "--algo=PPO", "--num-env-runners=5", "--num-cpus=6"] ) +# subdirectory: evaluation/ +# .................................... py_test( - name = "examples/custom_env_tf", - main = "examples/custom_env.py", + name = "examples/evaluation/custom_evaluation", + main = "examples/evaluation/custom_evaluation.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/custom_env.py"], - args = ["--as-test", "--framework=tf"] + srcs = ["examples/evaluation/custom_evaluation.py"], + args = ["--enable-new-api-stack", "--framework=torch", "--as-test", "--stop-reward=0.75", "--num-cpus=5"] ) py_test( - name = "examples/custom_env_torch", - main = "examples/custom_env.py", + name = "examples/evaluation/custom_evaluation_parallel_to_training", + main = "examples/evaluation/custom_evaluation.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/custom_env.py"], - args = ["--as-test", "--framework=torch"] + srcs = ["examples/evaluation/custom_evaluation.py"], + args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--stop-reward=0.75", "--evaluation-parallel-to-training", "--num-cpus=5"] ) py_test( - name = "examples/custom_eval_tf", - main = "examples/custom_eval.py", + name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_torch_envrunner", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/custom_eval.py"], - args = ["--num-cpus=4", "--framework=tf", "--as-test"] + size = "medium", + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=auto"] ) py_test( - name = "examples/custom_eval_torch", - main = "examples/custom_eval.py", + name = "examples/evaluation/evaluation_parallel_to_training_multi_agent_duration_auto_torch_envrunner", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/custom_eval.py"], - args = ["--num-cpus=4", "--as-test", "--framework=torch"] + size = "medium", + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", 
"--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=auto", "--evaluation-duration-unit=episodes"] ) py_test( - name = "examples/custom_eval_parallel_to_training_torch", - main = "examples/custom_eval.py", + name = "examples/evaluation/evaluation_parallel_to_training_511_ts_torch_envrunner", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/custom_eval.py"], - args = ["--num-cpus=4", "--as-test", "--framework=torch", "--evaluation-parallel-to-training"] + size = "medium", + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-num-workers=3", "--evaluation-duration=511", "--evaluation-duration-unit=timesteps"] ) py_test( - name = "examples/custom_experiment", - main = "examples/custom_experiment.py", + name = "examples/evaluation/evaluation_parallel_to_training_multi_agent_1001_ts_torch_envrunner", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/custom_experiment.py"], - args = ["--train-iterations=10"] + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=1001", "--evaluation-duration-unit=timesteps"] ) py_test( - name = "examples/custom_metrics_and_callbacks", - main = "examples/custom_metrics_and_callbacks.py", + name = "examples/evaluation/evaluation_parallel_to_training_13_episodes_torch_envrunner", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/custom_metrics_and_callbacks.py"], - args = ["--stop-iters=2"] + size = "medium", + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=13", "--evaluation-duration-unit=episodes"] ) py_test( - name = "examples/custom_model_loss_and_metrics_ppo_tf", - main = "examples/custom_model_loss_and_metrics.py", + name = "examples/evaluation/evaluation_parallel_to_training_multi_agent_10_episodes_torch_envrunner", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - # Include the json data file. - data = ["tests/data/cartpole/small.json"], - srcs = ["examples/custom_model_loss_and_metrics.py"], - args = ["--run=PPO", "--stop-iters=1", "--framework=tf","--input-files=tests/data/cartpole"] + size = "medium", + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=10", "--evaluation-duration-unit=episodes"] ) +# subdirectory: multi_agent_and_self_play/ +# .................................... py_test( - name = "examples/custom_model_loss_and_metrics_ppo_torch", - main = "examples/custom_model_loss_and_metrics.py", + name = "examples/multi_agent_and_self_play/custom_heuristic_policy", + main = "examples/multi_agent_and_self_play/custom_heuristic_policy.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - # Include the json data file. 
- data = ["tests/data/cartpole/small.json"], - srcs = ["examples/custom_model_loss_and_metrics.py"], - args = ["--run=PPO", "--framework=torch", "--stop-iters=1", "--input-files=tests/data/cartpole"] + size = "medium", + srcs = ["examples/multi_agent_and_self_play/custom_heuristic_policy.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--framework=torch", "--stop-reward=450.0"] ) py_test( - name = "examples/custom_model_loss_and_metrics_pg_tf", - main = "examples/custom_model_loss_and_metrics.py", + name = "examples/multi_agent_and_self_play/different_spaces_for_agents_ppo", + main = "examples/multi_agent_and_self_play/different_spaces_for_agents.py", tags = ["team:rllib", "exclusive", "examples"], size = "small", - # Include the json data file. - data = ["tests/data/cartpole/small.json"], - srcs = ["examples/custom_model_loss_and_metrics.py"], - args = ["--stop-iters=1", "--framework=tf", "--input-files=tests/data/cartpole"] + srcs = ["examples/multi_agent_and_self_play/different_spaces_for_agents.py"], + args = ["--enable-new-api-stack", "--algo=PPO", "--stop-iters=4", "--framework=torch"] ) py_test( - name = "examples/custom_model_loss_and_metrics_pg_torch", - main = "examples/custom_model_loss_and_metrics.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "small", - # Include the json data file. - data = ["tests/data/cartpole/small.json"], - srcs = ["examples/custom_model_loss_and_metrics.py"], - args = ["--framework=torch", "--stop-iters=1", "--input-files=tests/data/cartpole"] + name = "examples/multi_agent_and_self_play/pettingzoo_independent_learning", + main = "examples/multi_agent_and_self_play/pettingzoo_independent_learning.py", + tags = ["team:rllib", "examples"], + size = "large", + srcs = ["examples/multi_agent_and_self_play/pettingzoo_independent_learning.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--framework=torch", "--stop-reward=-200.0", "--num-cpus=4"] ) py_test( - name = "examples/custom_recurrent_rnn_tokenizer_repeat_after_me_tf2", - main = "examples/custom_recurrent_rnn_tokenizer.py", + name = "examples/multi_agent_and_self_play/multi_agent_cartpole", + main = "examples/multi_agent_and_self_play/multi_agent_cartpole.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], - args = ["--as-test", "--framework=tf2", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"] + srcs = ["examples/multi_agent_and_self_play/multi_agent_cartpole.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--framework=torch", "--stop-reward=600.0", "--num-cpus=4"] ) +# TODO (sven): Activate once MultiAgentEpisode splitting works and we can do multi-agent + multi-gpu. 
+# py_test( +# name = "examples/multi_agent_and_self_play/multi_agent_cartpole_multi_gpu", +# main = "examples/multi_agent_and_self_play/multi_agent_cartpole.py", +# tags = ["team:rllib", "exclusive", "examples", "multi_gpu"], +# size = "medium", +# srcs = ["examples/multi_agent_and_self_play/multi_agent_cartpole.py"], +# args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--framework=torch", "--stop-reward=600.0", "--num-cpus=4", "--num-gpus=2"] +# ) + py_test( - name = "examples/custom_recurrent_rnn_tokenizer_repeat_initial_obs_env_tf2", - main = "examples/custom_recurrent_rnn_tokenizer.py", - tags = ["team:rllib", "examples"], - size = "medium", - srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], - args = ["--as-test", "--framework=tf2", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"] -) - -py_test( - name = "examples/custom_recurrent_rnn_tokenizer_repeat_after_me_torch", - main = "examples/custom_recurrent_rnn_tokenizer.py", + name = "examples/multi_agent_and_self_play/pettingzoo_parameter_sharing", + main = "examples/multi_agent_and_self_play/pettingzoo_parameter_sharing.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"] + size = "large", + srcs = ["examples/multi_agent_and_self_play/pettingzoo_parameter_sharing.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--framework=torch", "--stop-reward=-210.0", "--num-cpus=4"], ) -py_test( - name = "examples/custom_recurrent_rnn_tokenizer_repeat_initial_obs_env_torch", - main = "examples/custom_recurrent_rnn_tokenizer.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"] -) +# TODO (sven): Activate this test once this script is ready. 
+# py_test( +# name = "examples/multi_agent_and_self_play/pettingzoo_shared_value_function", +# main = "examples/multi_agent_and_self_play/pettingzoo_shared_value_function.py", +# tags = ["team:rllib", "exclusive", "examples"], +# size = "large", +# srcs = ["examples/multi_agent_and_self_play/pettingzoo_shared_value_function.py"], +# args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--framework=torch", "--stop-reward=-100.0", "--num-cpus=4"], +# ) py_test( - name = "examples/custom_train_fn", - main = "examples/custom_train_fn.py", + name = "examples/multi_agent_and_self_play/rock_paper_scissors_heuristic_vs_learnt", + main = "examples/multi_agent_and_self_play/rock_paper_scissors_heuristic_vs_learnt.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/custom_train_fn.py"], + srcs = ["examples/multi_agent_and_self_play/rock_paper_scissors_heuristic_vs_learnt.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--framework=torch", "--stop-reward=6.5"], ) py_test( - name = "examples/deterministic_training_tf", - main = "examples/deterministic_training.py", - tags = ["team:rllib", "exclusive", "multi_gpu", "examples"], - size = "medium", - srcs = ["examples/deterministic_training.py"], - args = ["--as-test", "--stop-iters=1", "--framework=tf", "--num-gpus=1", "--num-gpus-per-worker=1"] -) - -py_test( - name = "examples/deterministic_training_tf2", - main = "examples/deterministic_training.py", - tags = ["team:rllib", "exclusive", "multi_gpu", "examples"], - size = "medium", - srcs = ["examples/deterministic_training.py"], - args = ["--as-test", "--stop-iters=1", "--framework=tf2", "--num-gpus=1", "--num-gpus-per-worker=1"] -) - -py_test( - name = "examples/deterministic_training_torch", - main = "examples/deterministic_training.py", - tags = ["team:rllib", "exclusive", "multi_gpu", "examples"], - size = "medium", - srcs = ["examples/deterministic_training.py"], - args = ["--as-test", "--stop-iters=1", "--framework=torch", "--num-gpus=1", "--num-gpus-per-worker=1"] -) - -py_test( - name = "examples/env/greyscale_env", - tags = ["team:rllib", "examples", "no_main"], - size = "medium", - srcs = ["examples/env/greyscale_env.py"], - args = ["--stop-iters=1 --as-test --framework torch"] + name = "examples/multi_agent_and_self_play/rock_paper_scissors_heuristic_vs_learnt_w_lstm", + main = "examples/multi_agent_and_self_play/rock_paper_scissors_heuristic_vs_learnt.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "large", + srcs = ["examples/multi_agent_and_self_play/rock_paper_scissors_heuristic_vs_learnt.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--framework=torch", "--stop-reward=7.2", "--use-lstm", "--num-env-runners=4", "--num-cpus=6"], ) -# New API Stack py_test( - name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_torch_envrunner", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/multi_agent_and_self_play/rock_paper_scissors_learnt_vs_learnt", + main = "examples/multi_agent_and_self_play/rock_paper_scissors_learnt_vs_learnt.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=auto"] + srcs = ["examples/multi_agent_and_self_play/rock_paper_scissors_learnt_vs_learnt.py"], + args = ["--enable-new-api-stack", "--num-agents=2", 
"--framework=torch", "--stop-iter=10"], ) +# @OldAPIStack py_test( - name = "examples/evaluation/evaluation_parallel_to_training_multi_agent_duration_auto_torch_envrunner", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/multi_agent_and_self_play/self_play_with_open_spiel_connect_4_ppo_tf_old_api_stack", + main = "examples/multi_agent_and_self_play/self_play_with_open_spiel.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=auto", "--evaluation-duration-unit=episodes"] + srcs = ["examples/multi_agent_and_self_play/self_play_with_open_spiel.py"], + args = ["--framework=tf", "--env=connect_four", "--win-rate-threshold=0.9", "--num-episodes-human-play=0", "--min-league-size=3"] ) +# @OldAPIStack py_test( - name = "examples/evaluation/evaluation_parallel_to_training_511_ts_torch_envrunner", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/multi_agent_and_self_play/self_play_with_open_spiel_connect_4_ppo_torch_old_api_stack", + main = "examples/multi_agent_and_self_play/self_play_with_open_spiel.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-num-workers=3", "--evaluation-duration=511", "--evaluation-duration-unit=timesteps"] + srcs = ["examples/multi_agent_and_self_play/self_play_with_open_spiel.py"], + args = ["--framework=torch", "--env=connect_four", "--win-rate-threshold=0.9", "--num-episodes-human-play=0", "--min-league-size=3"] ) py_test( - name = "examples/evaluation/evaluation_parallel_to_training_multi_agent_1001_ts_torch_envrunner", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/multi_agent_and_self_play/self_play_with_open_spiel_connect_4_ppo_torch_envrunner", + main = "examples/multi_agent_and_self_play/self_play_with_open_spiel.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=1001", "--evaluation-duration-unit=timesteps"] + srcs = ["examples/multi_agent_and_self_play/self_play_with_open_spiel.py"], + args = ["--enable-new-api-stack", "--framework=torch", "--env=connect_four", "--win-rate-threshold=0.9", "--num-episodes-human-play=0", "--min-league-size=4"] ) py_test( - name = "examples/evaluation/evaluation_parallel_to_training_13_episodes_torch_envrunner", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/multi_agent_and_self_play/self_play_league_based_with_open_spiel_connect_4_ppo_torch_envrunner", + main = "examples/multi_agent_and_self_play/self_play_league_based_with_open_spiel.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=13", "--evaluation-duration-unit=episodes"] + size = "large", + srcs = ["examples/multi_agent_and_self_play/self_play_league_based_with_open_spiel.py"], + args = ["--enable-new-api-stack", 
"--framework=torch", "--env=connect_four", "--win-rate-threshold=0.8", "--num-episodes-human-play=0", "--min-league-size=8"] ) py_test( - name = "examples/evaluation/evaluation_parallel_to_training_multi_agent_10_episodes_torch_envrunner", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/multi_agent_and_self_play/two_step_game_with_grouped_agents", + main = "examples/multi_agent_and_self_play/two_step_game_with_grouped_agents.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=10", "--evaluation-duration-unit=episodes"] + srcs = ["examples/multi_agent_and_self_play/two_step_game_with_grouped_agents.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--framework=torch", "--stop-reward=7.0"] ) -# END: New API Stack +# TODO (sven): Continue cleanup from here with adding more subdirectories. py_test( - name = "examples/evaluation/evaluation_parallel_to_training_13_episodes_tf", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/action_masking_tf2", + main = "examples/action_masking.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=13"] + size = "small", + srcs = ["examples/action_masking.py"], + args = ["--stop-iter=2", "--framework=tf2"] ) py_test( - name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_tf", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/action_masking_torch", + main = "examples/action_masking.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--stop-reward=50.0", "--framework=tf", "--num-cpus=6", "--evaluation-duration=auto"] + size = "small", + srcs = ["examples/action_masking.py"], + args = ["--stop-iter=2", "--framework=torch"] ) py_test( - name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_torch", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/autoregressive_action_dist_tf", + main = "examples/autoregressive_action_dist.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=auto"] + srcs = ["examples/autoregressive_action_dist.py"], + args = ["--as-test", "--framework=tf", "--stop-reward=150", "--num-cpus=4"] ) py_test( - name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_tf2", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/autoregressive_action_dist_torch", + main = "examples/autoregressive_action_dist.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--framework=tf2", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-workers=3", "--evaluation-duration=auto"] + srcs = ["examples/autoregressive_action_dist.py"], + args = ["--as-test", "--framework=torch", "--stop-reward=150", "--num-cpus=4"] ) py_test( - 
name = "examples/evaluation/evaluation_parallel_to_training_211_ts_tf2", - main = "examples/evaluation/evaluation_parallel_to_training.py", + name = "examples/cartpole_lstm_impala_tf2", + main = "examples/cartpole_lstm.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--framework=tf2", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-workers=3", "--evaluation-duration=211", "--evaluation-duration-unit=timesteps"] + srcs = ["examples/cartpole_lstm.py"], + args = ["--run=IMPALA", "--as-test", "--framework=tf2", "--stop-reward=28", "--num-cpus=4"] ) py_test( - name = "examples/export/cartpole_dqn_export", - main = "examples/export/cartpole_dqn_export.py", + name = "examples/cartpole_lstm_impala_torch", + main = "examples/cartpole_lstm.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/export/cartpole_dqn_export.py"], -) - -py_test( - name = "examples/export/onnx_tf", - main = "examples/export/onnx_tf.py", - tags = ["team:rllib", "exclusive", "examples", "no_main"], - size = "small", - srcs = ["examples/export/onnx_tf.py"], - args = ["--framework=tf"], -) - -py_test( - name = "examples/export/onnx_tf2", - main = "examples/export/onnx_tf.py", - tags = ["team:rllib", "exclusive", "examples", "no_main"], - size = "small", - srcs = ["examples/export/onnx_tf.py"], - args = ["--framework=tf2"], -) - -py_test( - name = "examples/export/onnx_torch", - main = "examples/export/onnx_torch.py", - tags = ["team:rllib", "exclusive", "examples", "no_main"], - size = "small", - srcs = ["examples/export/onnx_torch.py"], + size = "medium", + srcs = ["examples/cartpole_lstm.py"], + args = ["--run=IMPALA", "--as-test", "--framework=torch", "--stop-reward=28", "--num-cpus=4"] ) +# TODO (Kourosh): tf2 ~5x slower compared to torch on the new stack py_test( - name = "examples/fractional_gpus", - main = "examples/fractional_gpus.py", + name = "examples/cartpole_lstm_ppo_tf2", + main = "examples/cartpole_lstm.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/fractional_gpus.py"], - args = ["--as-test", "--stop-reward=40.0", "--num-gpus=0", "--num-workers=0"] + size = "large", + srcs = ["examples/cartpole_lstm.py"], + args = ["--run=PPO", "--as-test", "--framework=tf2", "--stop-reward=28", "--num-cpus=4"] ) py_test( - name = "examples/hierarchical_training_tf", - main = "examples/hierarchical_training.py", + name = "examples/cartpole_lstm_ppo_torch", + main = "examples/cartpole_lstm.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/hierarchical_training.py"], - args = [ "--framework=tf", "--stop-reward=0.0"] + srcs = ["examples/cartpole_lstm.py"], + args = ["--run=PPO", "--as-test", "--framework=torch", "--stop-reward=28", "--num-cpus=4"] ) py_test( - name = "examples/hierarchical_training_torch", - main = "examples/hierarchical_training.py", + name = "examples/cartpole_lstm_ppo_torch_with_prev_a_and_r", + main = "examples/cartpole_lstm.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/hierarchical_training.py"], - args = ["--framework=torch", "--stop-reward=0.0"] + srcs = ["examples/cartpole_lstm.py"], + args = ["--run=PPO", "--as-test", "--framework=torch", "--stop-reward=28", "--num-cpus=4", "--use-prev-action", "--use-prev-reward"] ) py_test( - name = "examples/multi_agent_cartpole_tf", - main = "examples/multi_agent_cartpole.py", - tags = 
["team:rllib", "exclusive", "examples"], + name = "examples/rl_module/mobilenet_rlm", + main = "examples/rl_module/mobilenet_rlm.py", + tags = ["team:rllib", "examples", "no_main"], size = "small", - srcs = ["examples/multi_agent_cartpole.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=70.0", "--num-cpus=4"] + srcs = ["examples/rl_module/mobilenet_rlm.py"], ) py_test( - name = "examples/multi_agent_cartpole_torch", - main = "examples/multi_agent_cartpole.py", + name = "examples/centralized_critic_tf", + main = "examples/centralized_critic.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/multi_agent_cartpole.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=70.0", "--num-cpus=4"] -) - -py_test( - name = "examples/multi_agent_cartpole_w_rlm_torch", - main = "examples/multi_agent_cartpole.py", - tags = ["team:rllib", "exclusive", "examples", "rlm"], - size = "medium", - srcs = ["examples/multi_agent_cartpole.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=70.0", "--num-cpus=4"] -) - -py_test( - name = "examples/multi_agent_cartpole_w_rlm_tf2", - main = "examples/multi_agent_cartpole.py", - tags = ["team:rllib", "exclusive", "examples", "rlm"], size = "medium", - srcs = ["examples/multi_agent_cartpole.py"], - args = ["--as-test", "--framework=tf2", "--stop-reward=70.0", "--num-cpus=4"] + srcs = ["examples/centralized_critic.py"], + args = ["--as-test", "--framework=tf", "--stop-reward=7.2"] ) py_test( - name = "examples/multi_agent_custom_policy_tf", - main = "examples/multi_agent_custom_policy.py", + name = "examples/centralized_critic_torch", + main = "examples/centralized_critic.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/multi_agent_custom_policy.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=80"] + srcs = ["examples/centralized_critic.py"], + args = ["--as-test", "--framework=torch", "--stop-reward=7.2"] ) py_test( - name = "examples/multi_agent_custom_policy_torch", - main = "examples/multi_agent_custom_policy.py", + name = "examples/centralized_critic_2_tf", + main = "examples/centralized_critic_2.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/multi_agent_custom_policy.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=80"] -) - -py_test( - name = "examples/multi_agent_custom_policy_w_rlm_torch", - main = "examples/multi_agent_custom_policy.py", - tags = ["team:rllib", "exclusive", "examples", "rlm"], - size = "medium", - srcs = ["examples/multi_agent_custom_policy.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=80"] -) - -py_test( - name = "examples/multi_agent_custom_policy_w_rlm_tf2", - main = "examples/multi_agent_custom_policy.py", - tags = ["team:rllib", "exclusive", "examples", "rlm"], - size = "medium", - srcs = ["examples/multi_agent_custom_policy.py"], - args = ["--as-test", "--framework=tf2", "--stop-reward=80"] + srcs = ["examples/centralized_critic_2.py"], + args = ["--as-test", "--framework=tf", "--stop-reward=6.0"] ) py_test( - name = "examples/multi_agent_different_spaces_for_agents_tf2", - main = "examples/multi_agent_different_spaces_for_agents.py", + name = "examples/centralized_critic_2_torch", + main = "examples/centralized_critic_2.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/multi_agent_different_spaces_for_agents.py"], - args = ["--stop-iters=4", "--framework=tf2"] + srcs = 
["examples/centralized_critic_2.py"], + args = ["--as-test", "--framework=torch", "--stop-reward=6.0"] ) py_test( - name = "examples/multi_agent_different_spaces_for_agents_torch", - main = "examples/multi_agent_different_spaces_for_agents.py", + name = "examples/checkpoint_by_custom_criteria", + main = "examples/checkpoint_by_custom_criteria.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/multi_agent_different_spaces_for_agents.py"], - args = ["--stop-iters=4", "--framework=torch"] + srcs = ["examples/checkpoint_by_custom_criteria.py"], + args = ["--stop-iters=3 --num-cpus=3"] ) py_test( - name = "examples/multi_agent_different_spaces_for_agents_w_rlm_torch", - main = "examples/multi_agent_different_spaces_for_agents.py", - tags = ["team:rllib", "exclusive", "examples", "rlm"], + name = "examples/curriculum_learning", + main = "examples/curriculum_learning.py", + tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/multi_agent_different_spaces_for_agents.py"], - args = ["--stop-iters=4", "--framework=torch"] + srcs = ["examples/curriculum_learning.py"], + args = ["--as-test", "--stop-reward=800.0"] ) py_test( - name = "examples/multi_agent_different_spaces_for_agents_w_rlm_tf2", - main = "examples/multi_agent_different_spaces_for_agents.py", - tags = ["team:rllib", "exclusive", "examples", "rlm"], + name = "examples/custom_env_tf", + main = "examples/custom_env.py", + tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/multi_agent_different_spaces_for_agents.py"], - args = ["--stop-iters=4", "--framework=tf2"] + srcs = ["examples/custom_env.py"], + args = ["--as-test", "--framework=tf"] ) py_test( - name = "examples/multi_agent_independent_learning", - main = "examples/multi_agent_independent_learning.py", - tags = ["team:rllib", "examples"], + name = "examples/custom_env_torch", + main = "examples/custom_env.py", + tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/multi_agent_independent_learning.py"], - args = ["--num-gpus=0", "--as-test"] + srcs = ["examples/custom_env.py"], + args = ["--as-test", "--framework=torch"] ) py_test( - name = "examples/multi_agent_two_trainers_tf", - main = "examples/multi_agent_two_trainers.py", + name = "examples/custom_experiment", + main = "examples/custom_experiment.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/multi_agent_two_trainers.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=70"] + srcs = ["examples/custom_experiment.py"], + args = ["--train-iterations=10"] ) py_test( - name = "examples/multi_agent_two_trainers_torch", - main = "examples/multi_agent_two_trainers.py", + name = "examples/custom_metrics_and_callbacks", + main = "examples/custom_metrics_and_callbacks.py", tags = ["team:rllib", "exclusive", "examples"], size = "small", - srcs = ["examples/multi_agent_two_trainers.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=70"] -) - -py_test( - name = "examples/offline_rl_torch", - main = "examples/offline_rl.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/offline_rl.py"], - args = ["--as-test", "--stop-reward=-300", "--stop-iters=1"] + srcs = ["examples/custom_metrics_and_callbacks.py"], + args = ["--stop-iters=2"] ) py_test( - name = "examples/nested_action_spaces_ppo_tf", - main = "examples/nested_action_spaces.py", + name = "examples/custom_model_loss_and_metrics_ppo_tf", + main = 
"examples/custom_model_loss_and_metrics.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/nested_action_spaces.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=-600", "--algo=PPO"] + size = "small", + # Include the json data file. + data = ["tests/data/cartpole/small.json"], + srcs = ["examples/custom_model_loss_and_metrics.py"], + args = ["--run=PPO", "--stop-iters=1", "--framework=tf","--input-files=tests/data/cartpole"] ) py_test( - name = "examples/nested_action_spaces_ppo_torch_envrunner", - main = "examples/nested_action_spaces.py", + name = "examples/custom_model_loss_and_metrics_ppo_torch", + main = "examples/custom_model_loss_and_metrics.py", tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/nested_action_spaces.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=-500.0", "--algo=PPO", "--enable-new-api-stack"] + size = "small", + # Include the json data file. + data = ["tests/data/cartpole/small.json"], + srcs = ["examples/custom_model_loss_and_metrics.py"], + args = ["--run=PPO", "--framework=torch", "--stop-iters=1", "--input-files=tests/data/cartpole"] ) py_test( - name = "examples/nested_action_spaces_multi_agent_ppo_torch_envrunner", - main = "examples/nested_action_spaces.py", + name = "examples/custom_model_loss_and_metrics_pg_tf", + main = "examples/custom_model_loss_and_metrics.py", tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/nested_action_spaces.py"], - args = ["--as-test", "--num-agents=2", "--framework=torch", "--stop-reward=-1000.0", "--algo=PPO", "--enable-new-api-stack"] + size = "small", + # Include the json data file. + data = ["tests/data/cartpole/small.json"], + srcs = ["examples/custom_model_loss_and_metrics.py"], + args = ["--stop-iters=1", "--framework=tf", "--input-files=tests/data/cartpole"] ) py_test( - name = "examples/parametric_actions_cartpole_dqn_tf", - main = "examples/parametric_actions_cartpole.py", + name = "examples/custom_model_loss_and_metrics_pg_torch", + main = "examples/custom_model_loss_and_metrics.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/parametric_actions_cartpole.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=60.0", "--run=DQN"] + size = "small", + # Include the json data file. 
+ data = ["tests/data/cartpole/small.json"], + srcs = ["examples/custom_model_loss_and_metrics.py"], + args = ["--framework=torch", "--stop-iters=1", "--input-files=tests/data/cartpole"] ) py_test( - name = "examples/parametric_actions_cartpole_dqn_torch", - main = "examples/parametric_actions_cartpole.py", + name = "examples/custom_recurrent_rnn_tokenizer_repeat_after_me_tf2", + main = "examples/custom_recurrent_rnn_tokenizer.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/parametric_actions_cartpole.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=60.0", "--run=DQN"] + srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], + args = ["--as-test", "--framework=tf2", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"] ) py_test( - name = "examples/parametric_actions_cartpole_embeddings_learnt_by_model", - main = "examples/parametric_actions_cartpole_embeddings_learnt_by_model.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/custom_recurrent_rnn_tokenizer_repeat_initial_obs_env_tf2", + main = "examples/custom_recurrent_rnn_tokenizer.py", + tags = ["team:rllib", "examples"], size = "medium", - srcs = ["examples/parametric_actions_cartpole_embeddings_learnt_by_model.py"], - args = ["--as-test", "--stop-reward=80.0"] + srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], + args = ["--as-test", "--framework=tf2", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"] ) py_test( - name = "examples/inference_and_serving/policy_inference_after_training_tf", - main = "examples/inference_and_serving/policy_inference_after_training.py", + name = "examples/custom_recurrent_rnn_tokenizer_repeat_after_me_torch", + main = "examples/custom_recurrent_rnn_tokenizer.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/inference_and_serving/policy_inference_after_training.py"], - args = ["--stop-iters=3", "--framework=tf"] + srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], + args = ["--as-test", "--framework=torch", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"] ) py_test( - name = "examples/inference_and_serving/policy_inference_after_training_torch", - main = "examples/inference_and_serving/policy_inference_after_training.py", + name = "examples/custom_recurrent_rnn_tokenizer_repeat_initial_obs_env_torch", + main = "examples/custom_recurrent_rnn_tokenizer.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/inference_and_serving/policy_inference_after_training.py"], - args = ["--stop-iters=3", "--framework=torch"] + srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], + args = ["--as-test", "--framework=torch", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"] ) py_test( - name = "examples/inference_and_serving/policy_inference_after_training_with_attention_tf", - main = "examples/inference_and_serving/policy_inference_after_training_with_attention.py", + name = "examples/custom_train_fn", + main = "examples/custom_train_fn.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/inference_and_serving/policy_inference_after_training_with_attention.py"], - args = ["--stop-iters=2", "--framework=tf"] + srcs = ["examples/custom_train_fn.py"], ) py_test( - name = "examples/inference_and_serving/policy_inference_after_training_with_attention_torch", - main = 
"examples/inference_and_serving/policy_inference_after_training_with_attention.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/deterministic_training_tf", + main = "examples/deterministic_training.py", + tags = ["team:rllib", "exclusive", "multi_gpu", "examples"], size = "medium", - srcs = ["examples/inference_and_serving/policy_inference_after_training_with_attention.py"], - args = ["--stop-iters=2", "--framework=torch"] + srcs = ["examples/deterministic_training.py"], + args = ["--as-test", "--stop-iters=1", "--framework=tf", "--num-gpus=1", "--num-gpus-per-worker=1"] ) py_test( - name = "examples/inference_and_serving/policy_inference_after_training_with_lstm_tf", - main = "examples/inference_and_serving/policy_inference_after_training_with_lstm.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/deterministic_training_tf2", + main = "examples/deterministic_training.py", + tags = ["team:rllib", "exclusive", "multi_gpu", "examples"], size = "medium", - srcs = ["examples/inference_and_serving/policy_inference_after_training_with_lstm.py"], - args = ["--stop-iters=1", "--framework=tf"] + srcs = ["examples/deterministic_training.py"], + args = ["--as-test", "--stop-iters=1", "--framework=tf2", "--num-gpus=1", "--num-gpus-per-worker=1"] ) py_test( - name = "examples/inference_and_serving/policy_inference_after_training_with_lstm_torch", - main = "examples/inference_and_serving/policy_inference_after_training_with_lstm.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/deterministic_training_torch", + main = "examples/deterministic_training.py", + tags = ["team:rllib", "exclusive", "multi_gpu", "examples"], size = "medium", - srcs = ["examples/inference_and_serving/policy_inference_after_training_with_lstm.py"], - args = ["--stop-iters=1", "--framework=torch"] + srcs = ["examples/deterministic_training.py"], + args = ["--as-test", "--stop-iters=1", "--framework=torch", "--num-gpus=1", "--num-gpus-per-worker=1"] ) py_test( - name = "examples/replay_buffer_api", - tags = ["team:rllib", "examples"], - size = "large", - srcs = ["examples/replay_buffer_api.py"], + name = "examples/env/greyscale_env", + tags = ["team:rllib", "examples", "no_main"], + size = "medium", + srcs = ["examples/env/greyscale_env.py"], + args = ["--stop-iters=1 --as-test --framework torch"] ) + + + + py_test( - name = "examples/restore_1_of_n_agents_from_checkpoint", + name = "examples/evaluation/evaluation_parallel_to_training_13_episodes_tf", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/restore_1_of_n_agents_from_checkpoint.py"], - args = ["--pre-training-iters=1", "--stop-iters=1", "--num-cpus=4"] + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--as-test", "--framework=tf", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=13"] ) py_test( - name = "examples/rock_paper_scissors_multiagent_tf", - main = "examples/rock_paper_scissors_multiagent.py", + name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_tf", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/rock_paper_scissors_multiagent.py"], - args = ["--as-test", "--framework=tf"], + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--as-test", "--stop-reward=50.0", "--framework=tf", "--num-cpus=6", 
"--evaluation-duration=auto"] ) py_test( - name = "examples/rock_paper_scissors_multiagent_torch", - main = "examples/rock_paper_scissors_multiagent.py", + name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_torch", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/rock_paper_scissors_multiagent.py"], - args = ["--as-test", "--framework=torch"], + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--as-test", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=auto"] ) py_test( - name = "examples/self_play_with_open_spiel_connect_4_ppo_tf", - main = "examples/self_play_with_open_spiel.py", + name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_tf2", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/self_play_with_open_spiel.py"], - args = ["--framework=tf", "--env=connect_four", "--win-rate-threshold=0.9", "--num-episodes-human-play=0", "--min-league-size=3"] + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--as-test", "--framework=tf2", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-workers=3", "--evaluation-duration=auto"] ) py_test( - name = "examples/self_play_with_open_spiel_connect_4_ppo_torch", - main = "examples/self_play_with_open_spiel.py", + name = "examples/evaluation/evaluation_parallel_to_training_211_ts_tf2", + main = "examples/evaluation/evaluation_parallel_to_training.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/self_play_with_open_spiel.py"], - args = ["--framework=torch", "--env=connect_four", "--win-rate-threshold=0.9", "--num-episodes-human-play=0", "--min-league-size=3"] + srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], + args = ["--as-test", "--framework=tf2", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-workers=3", "--evaluation-duration=211", "--evaluation-duration-unit=timesteps"] ) py_test( - name = "examples/self_play_with_open_spiel_connect_4_ppo_torch_envrunner", - main = "examples/self_play_with_open_spiel.py", + name = "examples/export/cartpole_dqn_export", + main = "examples/export/cartpole_dqn_export.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/self_play_with_open_spiel.py"], - args = ["--enable-new-api-stack", "--framework=torch", "--env=connect_four", "--win-rate-threshold=0.9", "--num-episodes-human-play=0", "--min-league-size=4"] + size = "small", + srcs = ["examples/export/cartpole_dqn_export.py"], ) py_test( - name = "examples/self_play_league_based_with_open_spiel_connect_4_ppo_torch_envrunner", - main = "examples/self_play_league_based_with_open_spiel.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/self_play_league_based_with_open_spiel.py"], - args = ["--enable-new-api-stack", "--framework=torch", "--env=connect_four", "--win-rate-threshold=0.8", "--num-episodes-human-play=0", "--min-league-size=8"] + name = "examples/export/onnx_tf", + main = "examples/export/onnx_tf.py", + tags = ["team:rllib", "exclusive", "examples", "no_main"], + size = "small", + srcs = ["examples/export/onnx_tf.py"], + args = ["--framework=tf"], ) py_test( - name = "examples/two_trainer_workflow_tf", - main = "examples/two_trainer_workflow.py", - tags = ["team:rllib", "exclusive", "examples"], 
- size = "medium", - srcs = ["examples/two_trainer_workflow.py"], - args = ["--as-test", "--stop-reward=450.0"] + name = "examples/export/onnx_tf2", + main = "examples/export/onnx_tf.py", + tags = ["team:rllib", "exclusive", "examples", "no_main"], + size = "small", + srcs = ["examples/export/onnx_tf.py"], + args = ["--framework=tf2"], +) + +py_test( + name = "examples/export/onnx_torch", + main = "examples/export/onnx_torch.py", + tags = ["team:rllib", "exclusive", "examples", "no_main"], + size = "small", + srcs = ["examples/export/onnx_torch.py"], ) py_test( - name = "examples/two_trainer_workflow_torch", - main = "examples/two_trainer_workflow.py", + name = "examples/fractional_gpus", + main = "examples/fractional_gpus.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/two_trainer_workflow.py"], - args = ["--as-test", "--torch", "--stop-reward=450.0"] + srcs = ["examples/fractional_gpus.py"], + args = ["--as-test", "--stop-reward=40.0", "--num-gpus=0", "--num-workers=0"] ) py_test( - name = "examples/two_step_game_ppo_tf", - main = "examples/two_step_game.py", + name = "examples/hierarchical_training_tf", + main = "examples/hierarchical_training.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/two_step_game.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=7"] + srcs = ["examples/hierarchical_training.py"], + args = [ "--framework=tf", "--stop-reward=0.0"] ) py_test( - name = "examples/two_step_game_ppo_torch", - main = "examples/two_step_game.py", + name = "examples/hierarchical_training_torch", + main = "examples/hierarchical_training.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/two_step_game.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=7"] + srcs = ["examples/hierarchical_training.py"], + args = ["--framework=torch", "--stop-reward=0.0"] ) -# ---------------------- -# new connector examples -# ---------------------- - -# Framestacking examples only run in smoke-test mode (a few iters only). py_test( - name = "examples/connectors/connector_v2_frame_stacking_ppo", - main = "examples/connectors/connector_v2_frame_stacking.py", + name = "examples/multi_agent_two_trainers_tf", + main = "examples/multi_agent_two_trainers.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/connector_v2_frame_stacking.py"], - args = ["--enable-new-api-stack", "--stop-iter=2", "--framework=torch", "--algo=PPO"] + srcs = ["examples/multi_agent_two_trainers.py"], + args = ["--as-test", "--framework=tf", "--stop-reward=70"] ) py_test( - name = "examples/connectors/connector_v2_frame_stacking_multi_agent_ppo", - main = "examples/connectors/connector_v2_frame_stacking.py", + name = "examples/multi_agent_two_trainers_torch", + main = "examples/multi_agent_two_trainers.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/connectors/connector_v2_frame_stacking.py"], - args = ["--enable-new-api-stack", "--num-agents=2", "--stop-iter=2", "--framework=torch", "--algo=PPO", "--num-env-runners=4", "--num-cpus=6"] + size = "small", + srcs = ["examples/multi_agent_two_trainers.py"], + args = ["--as-test", "--framework=torch", "--stop-reward=70"] ) -# Nested observation spaces (flattening). 
py_test( - name = "examples/connectors/connector_v2_nested_observation_spaces_ppo", - main = "examples/connectors/connector_v2_nested_observation_spaces.py", + name = "examples/offline_rl_torch", + main = "examples/offline_rl.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/connector_v2_nested_observation_spaces.py"], - args = ["--enable-new-api-stack", "--as-test", "--stop-reward=400.0", "--framework=torch", "--algo=PPO"] + srcs = ["examples/offline_rl.py"], + args = ["--as-test", "--stop-reward=-300", "--stop-iters=1"] ) + py_test( - name = "examples/connectors/connector_v2_nested_observation_spaces_multi_agent_ppo", - main = "examples/connectors/connector_v2_nested_observation_spaces.py", + name = "examples/inference_and_serving/policy_inference_after_training_tf", + main = "examples/inference_and_serving/policy_inference_after_training.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/connector_v2_nested_observation_spaces.py"], - args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=800.0", "--framework=torch", "--algo=PPO"] + srcs = ["examples/inference_and_serving/policy_inference_after_training.py"], + args = ["--stop-iters=3", "--framework=tf"] ) -# Prev-r/prev actions + LSTM example. py_test( - name = "examples/connectors/connector_v2_prev_actions_prev_rewards_ppo", - main = "examples/connectors/connector_v2_prev_actions_prev_rewards.py", + name = "examples/inference_and_serving/policy_inference_after_training_torch", + main = "examples/inference_and_serving/policy_inference_after_training.py", tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/connectors/connector_v2_prev_actions_prev_rewards.py"], - args = ["--enable-new-api-stack", "--as-test", "--stop-reward=200.0", "--framework=torch", "--algo=PPO", "--num-env-runners=4", "--num-cpus=6"] + size = "medium", + srcs = ["examples/inference_and_serving/policy_inference_after_training.py"], + args = ["--stop-iters=3", "--framework=torch"] ) py_test( - name = "examples/connectors/connector_v2_prev_actions_prev_rewards_multi_agent_ppo", - main = "examples/connectors/connector_v2_prev_actions_prev_rewards.py", + name = "examples/inference_and_serving/policy_inference_after_training_with_attention_tf", + main = "examples/inference_and_serving/policy_inference_after_training_with_attention.py", tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/connectors/connector_v2_prev_actions_prev_rewards.py"], - args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=400.0", "--framework=torch", "--algo=PPO", "--num-env-runners=4", "--num-cpus=6"] + size = "medium", + srcs = ["examples/inference_and_serving/policy_inference_after_training_with_attention.py"], + args = ["--stop-iters=2", "--framework=tf"] ) -# MeanStd filtering example. 
py_test( - name = "examples/connectors/connector_v2_mean_std_filtering_ppo", - main = "examples/connectors/connector_v2_mean_std_filtering.py", + name = "examples/inference_and_serving/policy_inference_after_training_with_attention_torch", + main = "examples/inference_and_serving/policy_inference_after_training_with_attention.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/connector_v2_mean_std_filtering.py"], - args = ["--enable-new-api-stack", "--as-test", "--stop-reward=-300.0", "--framework=torch", "--algo=PPO", "--num-env-runners=2"] + srcs = ["examples/inference_and_serving/policy_inference_after_training_with_attention.py"], + args = ["--stop-iters=2", "--framework=torch"] ) py_test( - name = "examples/connectors/connector_v2_mean_std_filtering_multi_agent_ppo", - main = "examples/connectors/connector_v2_mean_std_filtering.py", + name = "examples/inference_and_serving/policy_inference_after_training_with_lstm_tf", + main = "examples/inference_and_serving/policy_inference_after_training_with_lstm.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/connector_v2_mean_std_filtering.py"], - args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=-600.0", "--framework=torch", "--algo=PPO", "--num-env-runners=5", "--num-cpus=6"] + srcs = ["examples/inference_and_serving/policy_inference_after_training_with_lstm.py"], + args = ["--stop-iters=1", "--framework=tf"] ) -# ---------------------- -# old connector examples -# ---------------------- py_test( - name = "examples/connectors/v1/run_connector_policy", - main = "examples/connectors/v1/run_connector_policy.py", + name = "examples/inference_and_serving/policy_inference_after_training_with_lstm_torch", + main = "examples/inference_and_serving/policy_inference_after_training_with_lstm.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/connectors/v1/run_connector_policy.py"], + size = "medium", + srcs = ["examples/inference_and_serving/policy_inference_after_training_with_lstm.py"], + args = ["--stop-iters=1", "--framework=torch"] ) py_test( - name = "examples/connectors/v1/adapt_connector_policy", - main = "examples/connectors/v1/adapt_connector_policy.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/connectors/v1/adapt_connector_policy.py"], + name = "examples/replay_buffer_api", + tags = ["team:rllib", "examples"], + size = "large", + srcs = ["examples/replay_buffer_api.py"], ) py_test( - name = "examples/connectors/v1/self_play_with_policy_checkpoint", - main = "examples/connectors/v1/self_play_with_policy_checkpoint.py", + name = "examples/restore_1_of_n_agents_from_checkpoint", tags = ["team:rllib", "exclusive", "examples"], - size = "small", - srcs = ["examples/connectors/v1/self_play_with_policy_checkpoint.py"], - args = [ - "--train_iteration=1" # Smoke test. - ] + size = "medium", + srcs = ["examples/restore_1_of_n_agents_from_checkpoint.py"], + args = ["--pre-training-iters=1", "--stop-iters=1", "--num-cpus=4"] ) # -------------------------------------------------------------------- @@ -2986,34 +2918,6 @@ py_test( # NOTE: Add tests alphabetically to this list. 
# -------------------------------------------------------------------- -py_test( - name = "examples/learner/multi_agent_cartpole_ppo_torch", - main = "examples/learner/multi_agent_cartpole_ppo.py", - tags = ["team:rllib", "examples"], - size = "medium", - srcs = ["examples/learner/multi_agent_cartpole_ppo.py"], - args = ["--as-test", "--framework=torch", "--num-gpus=0"] -) - -py_test( - name = "examples/learner/multi_agent_cartpole_ppo_torch_gpu", - main = "examples/learner/multi_agent_cartpole_ppo.py", - tags = ["team:rllib", "exclusive", "examples", "gpu"], - size = "medium", - srcs = ["examples/learner/multi_agent_cartpole_ppo.py"], - args = ["--as-test", "--framework=torch", "--num-gpus=1"] -) - - -py_test( - name = "examples/learner/multi_agent_cartpole_ppo_torch_multi_gpu", - main = "examples/learner/multi_agent_cartpole_ppo.py", - tags = ["team:rllib", "exclusive", "examples", "multi_gpu"], - size = "medium", - srcs = ["examples/learner/multi_agent_cartpole_ppo.py"], - args = ["--as-test", "--framework=torch", "--num-gpus=2"] -) - py_test( name = "examples/learner/ppo_tuner_local_cpu_torch", main = "examples/learner/ppo_tuner.py", @@ -3120,38 +3024,6 @@ py_test( # NOTE: Add tests alphabetically to this list. # -------------------------------------------------------------------- -py_test( - name = "examples/documentation/custom_gym_env", - main = "examples/documentation/custom_gym_env.py", - tags = ["team:rllib", "documentation", "no_main"], - size = "medium", - srcs = ["examples/documentation/custom_gym_env.py"], -) - -py_test( - name = "examples/documentation/saving_and_loading_algos_and_policies", - main = "examples/documentation/saving_and_loading_algos_and_policies.py", - tags = ["team:rllib", "documentation", "no_main"], - size = "large", - srcs = ["examples/documentation/saving_and_loading_algos_and_policies.py"], -) - -py_test( - name = "examples/documentation/replay_buffer_demo", - main = "examples/documentation/replay_buffer_demo.py", - tags = ["team:rllib", "documentation", "no_main"], - size = "medium", - srcs = ["examples/documentation/replay_buffer_demo.py"], -) - -py_test( - name = "examples/documentation/rllib_on_ray_readme", - main = "examples/documentation/rllib_on_ray_readme.py", - tags = ["team:rllib", "documentation", "no_main"], - size = "medium", - srcs = ["examples/documentation/rllib_on_ray_readme.py"], -) - py_test( name = "examples/documentation/rllib_on_rllib_readme", main = "examples/documentation/rllib_on_rllib_readme.py", @@ -3165,14 +3037,6 @@ py_test( # -------------------------------------------------------------------- py_test_module_list( files = [ - "env/wrappers/tests/test_kaggle_wrapper.py", - "examples/env/tests/test_cliff_walking_wall_env.py", - "examples/env/tests/test_coin_game_non_vectorized_env.py", - "examples/env/tests/test_coin_game_vectorized_env.py", - "examples/env/tests/test_matrix_sequential_social_dilemma.py", - "examples/env/tests/test_wrappers.py", - "tests/test_dnc.py", - "tests/test_perf.py", "utils/tests/test_utils.py", ], size = "large", diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 918c747e93162..17a03511f1447 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -3300,11 +3300,6 @@ def validate_config(self, config): pass -# TODO: Create a dict that throw a deprecation warning once we have fully moved -# to AlgorithmConfig() objects (some algos still missing). 
-COMMON_CONFIG: AlgorithmConfigDict = AlgorithmConfig(Algorithm).to_dict() - - class TrainIterCtx: def __init__(self, algo: Algorithm): self.algo = algo @@ -3369,9 +3364,9 @@ def should_stop(self, results): self.algo._counters[NUM_ENV_STEPS_TRAINED] - self.init_env_steps_trained ) - min_t = self.algo.config["min_time_s_per_iteration"] - min_sample_ts = self.algo.config["min_sample_timesteps_per_iteration"] - min_train_ts = self.algo.config["min_train_timesteps_per_iteration"] + min_t = self.algo.config.min_time_s_per_iteration + min_sample_ts = self.algo.config.min_sample_timesteps_per_iteration + min_train_ts = self.algo.config.min_train_timesteps_per_iteration # Repeat if not enough time has passed or if not enough # env|train timesteps have been processed (or these min # values are not provided by the user). diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index d9f78d3029537..9afd55d1a0cab 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -897,7 +897,11 @@ def build_env_to_module_connector(self, env): if self.is_multi_agent(): pipeline.append( AgentToModuleMapping( - modules=set(self.policies), + module_specs=( + self.rl_module_spec.module_specs + if isinstance(self.rl_module_spec, MultiAgentRLModuleSpec) + else set(self.policies) + ), agent_to_module_mapping_fn=self.policy_mapping_fn, ) ) @@ -1039,7 +1043,11 @@ def build_learner_connector(self, input_observation_space, input_action_space): if self.is_multi_agent(): pipeline.append( AgentToModuleMapping( - modules=set(self.policies), + module_specs=( + self.rl_module_spec.module_specs + if isinstance(self.rl_module_spec, MultiAgentRLModuleSpec) + else set(self.policies) + ), agent_to_module_mapping_fn=self.policy_mapping_fn, ) ) @@ -2852,8 +2860,8 @@ def rl_module_spec(self): if self._rl_module_spec is not None: # Merge provided RL Module spec class with defaults _check_rl_module_spec(self._rl_module_spec) - # We can only merge if we have SingleAgentRLModuleSpecs. - # TODO (sven): Support merging for MultiAgentRLModuleSpecs. + # Merge given spec with default one (in case items are missing, such as + # spaces, module class, etc.) if isinstance(self._rl_module_spec, SingleAgentRLModuleSpec): if isinstance(default_rl_module_spec, SingleAgentRLModuleSpec): default_rl_module_spec.update(self._rl_module_spec) @@ -2863,6 +2871,11 @@ def rl_module_spec(self): "Cannot merge MultiAgentRLModuleSpec with " "SingleAgentRLModuleSpec!" ) + else: + marl_module_spec = copy.deepcopy(self._rl_module_spec) + marl_module_spec.update(default_rl_module_spec) + return marl_module_spec + # `self._rl_module_spec` has not been user defined -> return default one. else: return default_rl_module_spec diff --git a/rllib/connectors/common/add_states_from_episodes_to_batch.py b/rllib/connectors/common/add_states_from_episodes_to_batch.py index 2b646d9898271..c3112bc2e3118 100644 --- a/rllib/connectors/common/add_states_from_episodes_to_batch.py +++ b/rllib/connectors/common/add_states_from_episodes_to_batch.py @@ -199,14 +199,23 @@ def __call__( # Also, let module-to-env pipeline know that we had added a single timestep # time rank to the data (to remove it again). if not self._as_learner_connector: - data = tree.map_structure( - # Expand on axis 0 (the to-be-time-dim) if item has not been batched' - # yet, otherwise axis=1 (the time-dim). 
- lambda s: np.expand_dims( - s, axis=(1 if isinstance(s, BatchedNdArray) else 0) - ), - data, - ) + for column, column_data in data.copy().items(): + self.foreach_batch_item_change_in_place( + batch=data, + column=column, + func=lambda item, eps_id, aid, mid: ( + item + if mid is not None and not rl_module[mid].is_stateful() + # Expand on axis 0 (the to-be-time-dim) if item has not been + # batched yet, otherwise axis=1 (the time-dim). + else tree.map_structure( + lambda s: np.expand_dims( + s, axis=(1 if isinstance(s, BatchedNdArray) else 0) + ), + item, + ) + ), + ) shared_data["_added_single_ts_time_rank"] = True else: # Before adding STATE_IN to the `data`, zero-pad existing data and batch @@ -239,6 +248,9 @@ def __call__( if isinstance(rl_module, MultiAgentRLModule) else rl_module ) + # This single-agent RLModule is NOT stateful -> Skip. + if not sa_module.is_stateful(): + continue if self.max_seq_len is None: raise ValueError( @@ -311,6 +323,9 @@ def __call__( sa_module = rl_module if sa_episode.module_id is not None: sa_module = rl_module[sa_episode.module_id] + # This single-agent RLModule is NOT stateful -> Skip. + if not sa_module.is_stateful(): + continue # Episode just started -> Get initial state from our RLModule. if sa_episode.t_started == 0 and len(sa_episode) == 0: diff --git a/rllib/connectors/common/agent_to_module_mapping.py b/rllib/connectors/common/agent_to_module_mapping.py index fa5e7776c0a9f..c304fa60a174e 100644 --- a/rllib/connectors/common/agent_to_module_mapping.py +++ b/rllib/connectors/common/agent_to_module_mapping.py @@ -1,13 +1,13 @@ from collections import defaultdict -from typing import Any, List, Optional +from typing import Any, Dict, List, Optional import gymnasium as gym from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.core.rl_module.rl_module import RLModule, SingleAgentRLModuleSpec from ray.rllib.env.multi_agent_episode import MultiAgentEpisode from ray.rllib.utils.annotations import override -from ray.rllib.utils.typing import EpisodeType +from ray.rllib.utils.typing import EpisodeType, ModuleID class AgentToModuleMapping(ConnectorV2): @@ -69,7 +69,7 @@ class AgentToModuleMapping(ConnectorV2): # Create our connector piece. 
connector = AgentToModuleMapping( - modules=["module0", "module1"], + module_specs={"module0", "module1"}, agent_to_module_mapping_fn=( lambda agent_id, eps: "module1" if agent_id == "agent1" else "module0" ), @@ -103,23 +103,23 @@ class AgentToModuleMapping(ConnectorV2): @override(ConnectorV2) def recompute_observation_space_from_input_spaces(self): - return self._map_space_if_necessary(self.input_observation_space) + return self._map_space_if_necessary(self.input_observation_space, "obs") @override(ConnectorV2) def recompute_action_space_from_input_spaces(self): - return self._map_space_if_necessary(self.input_action_space) + return self._map_space_if_necessary(self.input_action_space, "act") def __init__( self, input_observation_space: Optional[gym.Space] = None, input_action_space: Optional[gym.Space] = None, *, - modules, + module_specs: Dict[ModuleID, SingleAgentRLModuleSpec], agent_to_module_mapping_fn, ): super().__init__(input_observation_space, input_action_space) - self._modules = modules + self._module_specs = module_specs self._agent_to_module_mapping_fn = agent_to_module_mapping_fn @override(ConnectorV2) @@ -179,10 +179,10 @@ def __call__( return data_by_module - def _map_space_if_necessary(self, space): + def _map_space_if_necessary(self, space, which: str = "obs"): # Analyze input observation space to check, whether the user has already taken # care of the agent to module mapping. - if set(self._modules) == set(space.spaces.keys()): + if set(self._module_specs) == set(space.spaces.keys()): return space # We need to take care of agent to module mapping. Figure out the resulting @@ -190,7 +190,18 @@ def _map_space_if_necessary(self, space): dummy_eps = MultiAgentEpisode() ret_space = {} - for module_id in self._modules: + for module_id in self._module_specs: + # Easy way out, user has provided space in the RLModule spec dict. + if isinstance(self._module_specs, dict) and module_id in self._module_specs: + if which == "obs" and self._module_specs[module_id].observation_space: + ret_space[module_id] = self._module_specs[ + module_id + ].observation_space + continue + elif which == "act" and self._module_specs[module_id].action_space: + ret_space[module_id] = self._module_specs[module_id].action_space + continue + # Need to reverse map spaces (for the different agents) to certain # module IDs (using a dummy MultiAgentEpisode). one_space = next(iter(space.spaces.values())) @@ -202,6 +213,7 @@ def _map_space_if_necessary(self, space): # AgentIDs and find the agent ID that matches. else: match_aid = None + one_agent_for_module_found = False for aid in space.spaces.keys(): # Match: Assign spaces for this agentID to the PolicyID. if self._agent_to_module_mapping_fn(aid, dummy_eps) == module_id: @@ -219,5 +231,20 @@ def _map_space_if_necessary(self, space): ) ret_space[module_id] = space[aid] match_aid = aid + one_agent_for_module_found = True + # Still no space found for this module ID -> Error out. + if not one_agent_for_module_found: + raise ValueError( + f"Could not find or derive any {which}-space for RLModule " + f"{module_id}! This can happen if your `config.rl_module(rl_" + f"module_spec=...)` does NOT contain space information for this" + " particular single-agent module AND your agent-to-module-" + "mapping function is stochastic (such that for some agent A, " + "more than one ModuleID might be returned somewhat randomly). 
" + f"Fix this error by providing {which}-space information using " + "`config.rl_module(rl_module_spec=MultiAgentRLModuleSpec(" + f"module_specs={{'{module_id}': SingleAgentRLModuleSpec(" + "observation_space=..., action_space=...)}}))" + ) return gym.spaces.Dict(ret_space) diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index 0ad8c15aab857..73d602a592eeb 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -612,9 +612,112 @@ def _tag(s): def foreach_batch_item_change_in_place( batch: Dict[str, Any], column: Union[str, List[str], Tuple[str]], - func: Callable[[Any, int, AgentID, ModuleID], Any], + func: Callable[ + [Any, Optional[int], Optional[AgentID], Optional[ModuleID]], Any + ], ) -> None: + """Runs the provided `func` on all items under one or more columns in the batch. + + Use this method to conveniently loop through all items in a batch + and transform them in place. + + `func` takes the following as arguments: + - The item itself. If column is a list of column names, this argument is a tuple + of items. + - The EpisodeID. This value might be None. + - The AgentID. This value might be None in the single-agent case. + - The ModuleID. This value might be None in the single-agent case. + + The return value(s) of `func` are used to directly override the values in the + given `batch`. + + Args: + batch: The batch to process in-place. + column: A single column name (str) or a list thereof. If a list is provided, + the first argument to `func` is a tuple of items. If a single + str is provided, the first argument to `func` is an individual + item. + func: The function to call on each item or tuple of item(s). + + .. testcode:: + + from ray.rllib.connectors.connector_v2 import ConnectorV2 + from ray.rllib.utils.test_utils import check + + # Simple case: Batch items are in lists directly under their column names. + batch = { + "col1": [0, 1, 2, 3], + "col2": [0, -1, -2, -3], + } + # Increase all ints by 1. + ConnectorV2.foreach_batch_item_change_in_place( + batch=batch, + column="col1", + func=lambda item, *args: item + 1, + ) + check(batch["col1"], [1, 2, 3, 4]) + + # Further increase all ints by 1 in col1 and flip sign in col2. + ConnectorV2.foreach_batch_item_change_in_place( + batch=batch, + column=["col1", "col2"], + func=(lambda items, *args: (items[0] + 1, -items[1])), + ) + check(batch["col1"], [2, 3, 4, 5]) + check(batch["col2"], [0, 1, 2, 3]) + + # Single-agent case: Batch items are in lists under (eps_id,)-keys in a dict + # under their column names. + batch = { + "col1": { + ("eps1",): [0, 1, 2, 3], + ("eps2",): [400, 500, 600], + }, + } + # Increase all ints of eps1 by 1 and divide all ints of eps2 by 100. + ConnectorV2.foreach_batch_item_change_in_place( + batch=batch, + column="col1", + func=lambda item, eps_id, *args: ( + item + 1 if eps_id == "eps1" else item / 100 + ), + ) + check(batch["col1"], { + ("eps1",): [1, 2, 3, 4], + ("eps2",): [4, 5, 6], + }) + + # Multi-agent case: Batch items are in lists under + # (eps_id, agent_id, module_id)-keys in a dict + # under their column names. + batch = { + "col1": { + ("eps1", "ag1", "mod1"): [1, 2, 3, 4], + ("eps2", "ag1", "mod2"): [400, 500, 600], + ("eps2", "ag2", "mod3"): [-1, -2, -3, -4, -5], + }, + } + # Decrease all ints of "eps1" by 1, divide all ints of "mod2" by 100, and + # flip sign of all ints of "ag2". 
+ ConnectorV2.foreach_batch_item_change_in_place( + batch=batch, + column="col1", + func=lambda item, eps_id, ag_id, mod_id: ( + item - 1 + if eps_id == "eps1" + else item / 100 + if mod_id == "mod2" + else -item + ), + ) + check(batch["col1"], { + ("eps1", "ag1", "mod1"): [0, 1, 2, 3], + ("eps2", "ag1", "mod2"): [4, 5, 6], + ("eps2", "ag2", "mod3"): [1, 2, 3, 4, 5], + }) + """ data_to_process = [batch.get(c) for c in force_list(column)] + single_col = isinstance(column, str) if any(d is None for d in data_to_process): raise ValueError( f"Invalid column name(s) ({column})! One or more not found in " @@ -626,11 +729,13 @@ def foreach_batch_item_change_in_place( if isinstance(data_to_process[0], list): for list_pos, data_tuple in enumerate(zip(*data_to_process)): results = func( - data_tuple[0] if isinstance(column, str) else data_tuple, + data_tuple[0] if single_col else data_tuple, None, # episode_id None, # agent_id None, # module_id ) + # Tuple'ize results if single_col. + results = (results,) if single_col else results for col_slot, result in enumerate(force_list(results)): data_to_process[col_slot][list_pos] = result # Single-agent/multi-agent cases. @@ -649,12 +754,14 @@ def foreach_batch_item_change_in_place( other_lists = [d[key] for d in data_to_process[1:]] for list_pos, data_tuple in enumerate(zip(d0_list, *other_lists)): results = func( - data_tuple[0] if isinstance(column, str) else data_tuple, + data_tuple[0] if single_col else data_tuple, eps_id, agent_id, module_id, ) - for col_slot, result in enumerate(force_list(results)): + # Tuple'ize results if single_col. + results = (results,) if single_col else results + for col_slot, result in enumerate(results): data_to_process[col_slot][key][list_pos] = result @staticmethod diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index 568708b9baf71..1958f9e871d17 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -1,4 +1,4 @@ -from typing import Any, List, Optional +from typing import Any, Collection, List, Optional import gymnasium as gym from gymnasium.spaces import Box @@ -11,7 +11,7 @@ from ray.rllib.utils.annotations import override from ray.rllib.utils.numpy import flatten_inputs_to_1d_tensor from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space -from ray.rllib.utils.typing import EpisodeType +from ray.rllib.utils.typing import AgentID, EpisodeType from ray.util.annotations import PublicAPI @@ -106,17 +106,20 @@ def recompute_observation_space_from_input_spaces(self): if self._multi_agent: spaces = {} for agent_id, space in self._input_obs_base_struct.items(): - sample = flatten_inputs_to_1d_tensor( - tree.map_structure( - lambda s: s.sample(), + if self._agent_ids and agent_id not in self._agent_ids: + spaces[agent_id] = self._input_obs_base_struct[agent_id] + else: + sample = flatten_inputs_to_1d_tensor( + tree.map_structure( + lambda s: s.sample(), + self._input_obs_base_struct[agent_id], + ), self._input_obs_base_struct[agent_id], - ), - self._input_obs_base_struct[agent_id], - batch_axis=False, - ) - spaces[agent_id] = Box( - float("-inf"), float("inf"), (len(sample),), np.float32 - ) + batch_axis=False, + ) + spaces[agent_id] = Box( + float("-inf"), float("inf"), (len(sample),), np.float32 + ) return gym.spaces.Dict(spaces) else: sample = flatten_inputs_to_1d_tensor( @@ -135,6 +138,7 @@ def __init__( input_action_space: Optional[gym.Space] = None, *, 
         multi_agent: bool = False,
+        agent_ids: Optional[Collection[AgentID]] = None,
         **kwargs,
     ):
         """Initializes a FlattenObservations instance.
@@ -143,9 +147,14 @@ def __init__(
             multi_agent: Whether this connector operates on multi-agent observations,
                 in which case, the top-level of the Dict space (where agent IDs are
                 mapped to individual agents' observation spaces) is left as-is.
+            agent_ids: If `multi_agent` is True, this argument defines the collection
+                of AgentIDs whose observations to flatten. Observations of AgentIDs
+                not in this collection are left untouched.
+                If None (the default), flatten the observations of all AgentIDs.
         """
         self._input_obs_base_struct = None
         self._multi_agent = multi_agent
+        self._agent_ids = agent_ids

         super().__init__(input_observation_space, input_action_space, **kwargs)

@@ -176,17 +185,23 @@ def __call__(
             batch=data,
             column=Columns.OBS,
             func=(
-                lambda item, eps_id, agent_id, module_id: flatten_inputs_to_1d_tensor(
-                    item,
-                    # In the multi-agent case, we need to use the specific agent's space
-                    # struct, not the multi-agent observation space dict.
-                    (
-                        self._input_obs_base_struct
-                        if not agent_id
-                        else self._input_obs_base_struct[agent_id]
-                    ),
-                    # Our items are bare observations (no batch axis present).
-                    batch_axis=False,
+                lambda item, eps_id, agent_id, module_id: (
+                    # Multi-agent AND skip this AgentID.
+                    item
+                    if self._agent_ids and agent_id not in self._agent_ids
+                    # Single-agent case or flatten this AgentID's observation.
+                    else flatten_inputs_to_1d_tensor(
+                        item,
+                        # In the multi-agent case, we need to use the specific agent's
+                        # space struct, not the multi-agent observation space dict.
+                        (
+                            self._input_obs_base_struct
+                            if not agent_id
+                            else self._input_obs_base_struct[agent_id]
+                        ),
+                        # Our items are bare observations (no batch axis present).
+                        batch_axis=False,
+                    )
                 )
             ),
         )
diff --git a/rllib/connectors/module_to_env/normalize_and_clip_actions.py b/rllib/connectors/module_to_env/normalize_and_clip_actions.py
index cc07e478599e8..8a95fa725c4f8 100644
--- a/rllib/connectors/module_to_env/normalize_and_clip_actions.py
+++ b/rllib/connectors/module_to_env/normalize_and_clip_actions.py
@@ -75,7 +75,7 @@ def __call__(
             environment's action space and thus don't lead to an error.
         """

-        def _unsquash_or_clip(action_for_env, env_vector_idx, agent_id, module_id):
+        def _unsquash_or_clip(action_for_env, env_id, agent_id, module_id):
             if agent_id is not None:
                 struct = self._action_space_struct[agent_id]
             else:
diff --git a/rllib/core/rl_module/marl_module.py b/rllib/core/rl_module/marl_module.py
index 4939564130789..91394e671986f 100644
--- a/rllib/core/rl_module/marl_module.py
+++ b/rllib/core/rl_module/marl_module.py
@@ -537,20 +537,20 @@ def build(self, module_id: Optional[ModuleID] = None) -> RLModule:
     def add_modules(
         self,
         module_specs: Dict[ModuleID, SingleAgentRLModuleSpec],
-        overwrite: bool = True,
+        override: bool = True,
     ) -> None:
         """Add new module specs to the spec or updates existing ones.

         Args:
             module_specs: The mapping for the module_id to the single-agent module
                 specs to be added to this multi-agent module spec.
-            overwrite: Whether to overwrite the existing module specs if they already
-                exist. If False, they will be updated only.
+            override: Whether to override the existing module specs if they already
+                exist. If False, they are only updated.
""" if self.module_specs is None: self.module_specs = {} for module_id, module_spec in module_specs.items(): - if overwrite or module_id not in self.module_specs: + if override or module_id not in self.module_specs: self.module_specs[module_id] = module_spec else: self.module_specs[module_id].update(module_spec) @@ -607,7 +607,11 @@ def from_dict(cls, d) -> "MultiAgentRLModuleSpec": }, ) - def update(self, other: "MultiAgentRLModuleSpec", overwrite=False) -> None: + def update( + self, + other: Union["MultiAgentRLModuleSpec", SingleAgentRLModuleSpec], + override=False, + ) -> None: """Updates this spec with the other spec. Traverses this MultiAgentRLModuleSpec's module_specs and updates them with @@ -615,13 +619,14 @@ def update(self, other: "MultiAgentRLModuleSpec", overwrite=False) -> None: Args: other: The other spec to update this spec with. - overwrite: Whether to overwrite the existing module specs if they already - exist. If False, they will be updated only. + override: Whether to override the existing module specs if they already + exist. If False, they are only updated. """ - assert type(other) is MultiAgentRLModuleSpec - - if isinstance(other.module_specs, dict): - self.add_modules(other.module_specs, overwrite=overwrite) + if isinstance(other, SingleAgentRLModuleSpec): + for mid, spec in self.module_specs.items(): + self.module_specs[mid].update(other, override=False) + elif isinstance(other.module_specs, dict): + self.add_modules(other.module_specs, override=override) else: if not self.module_specs: self.module_specs = other.module_specs diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index c641cb22a3f4a..dfba4b72896ba 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -147,19 +147,34 @@ def from_dict(cls, d): ) return spec - def update(self, other) -> None: - """Updates this spec with the given other spec. Works like dict.update().""" + def update(self, other, override: bool = True) -> None: + """Updates this spec with the given other spec. Works like dict.update(). + + Args: + other: The other SingleAgentRLModule spec to update this one from. + override: Whether to update all properties in `self` with those of `other. + If False, only update those properties in `self` that are not None. + """ if not isinstance(other, SingleAgentRLModuleSpec): raise ValueError("Can only update with another SingleAgentRLModuleSpec.") # If the field is None in the other, keep the current field, otherwise update # with the new value. - self.module_class = other.module_class or self.module_class - self.observation_space = other.observation_space or self.observation_space - self.action_space = other.action_space or self.action_space - self.model_config_dict = other.model_config_dict or self.model_config_dict - self.catalog_class = other.catalog_class or self.catalog_class - self.load_state_path = other.load_state_path or self.load_state_path + if override: + self.module_class = other.module_class or self.module_class + self.observation_space = other.observation_space or self.observation_space + self.action_space = other.action_space or self.action_space + self.model_config_dict = other.model_config_dict or self.model_config_dict + self.catalog_class = other.catalog_class or self.catalog_class + self.load_state_path = other.load_state_path or self.load_state_path + # Only override, if the field is None in `self`. 
+ else: + self.module_class = self.module_class or other.module_class + self.observation_space = self.observation_space or other.observation_space + self.action_space = self.action_space or other.action_space + self.model_config_dict = self.model_config_dict or other.model_config_dict + self.catalog_class = self.catalog_class or other.catalog_class + self.load_state_path = self.load_state_path or other.load_state_path def as_multi_agent(self) -> "MultiAgentRLModuleSpec": """Returns a MultiAgentRLModuleSpec (`self` under DEFAULT_POLICY_ID key).""" diff --git a/rllib/core/rl_module/tests/test_rl_module_specs.py b/rllib/core/rl_module/tests/test_rl_module_specs.py index 24c8ecf696c32..a90ac507c7f0f 100644 --- a/rllib/core/rl_module/tests/test_rl_module_specs.py +++ b/rllib/core/rl_module/tests/test_rl_module_specs.py @@ -231,7 +231,7 @@ def test_update_specs_multi_agent(self): self.assertEqual( marl_spec_1.module_specs["agent_1"].model_config_dict, "Update me!" ) - marl_spec_1.update(marl_spec_2, overwrite=True) + marl_spec_1.update(marl_spec_2, override=True) self.assertEqual(marl_spec_1.module_specs["agent_1"], module_spec_2) # Test if updating MultiAgentRLModuleSpec without overwriting works. This @@ -244,7 +244,7 @@ def test_update_specs_multi_agent(self): self.assertEqual( marl_spec_3.module_specs["agent_1"].observation_space, "Do not update me!" ) - marl_spec_3.update(marl_spec_2, overwrite=False) + marl_spec_3.update(marl_spec_2, override=False) # If we would overwrite, we would replace the observation space even though # it was None. This is not the case here. self.assertEqual( diff --git a/rllib/env/multi_agent_env.py b/rllib/env/multi_agent_env.py index 9710fb7882af9..47bef8ec9c265 100644 --- a/rllib/env/multi_agent_env.py +++ b/rllib/env/multi_agent_env.py @@ -310,7 +310,8 @@ def with_agent_groups( self, groups: Dict[str, List[AgentID]], obs_space: gym.Space = None, - act_space: gym.Space = None) -> "MultiAgentEnv": + act_space: gym.Space = None, + ) -> "MultiAgentEnv": """Convenience method for grouping together agents in this env. 
An agent group is a list of agent IDs that are mapped to a single diff --git a/rllib/env/wrappers/pettingzoo_env.py b/rllib/env/wrappers/pettingzoo_env.py index f118c4f0dde54..31627b948a832 100644 --- a/rllib/env/wrappers/pettingzoo_env.py +++ b/rllib/env/wrappers/pettingzoo_env.py @@ -125,8 +125,14 @@ def __init__(self, env): self._agent_ids = set(self.env.agents) - self.observation_space = gym.spaces.Dict(self.env.observation_spaces) - self.action_space = gym.spaces.Dict(self.env.action_spaces) + self.observation_space = gym.spaces.Dict( + {aid: self.env.observation_space(aid) for aid in self._agent_ids} + ) + self._obs_space_in_preferred_format = True + self.action_space = gym.spaces.Dict( + {aid: self.env.action_space(aid) for aid in self._agent_ids} + ) + self._action_space_in_preferred_format = True def observation_space_sample(self, agent_ids: list = None) -> MultiAgentDict: sample = self.observation_space.sample() @@ -195,8 +201,14 @@ def __init__(self, env): self.par_env.reset() self._agent_ids = set(self.par_env.agents) - self.observation_space = gym.spaces.Dict(self.par_env.observation_spaces) - self.action_space = gym.spaces.Dict(self.par_env.action_spaces) + self.observation_space = gym.spaces.Dict( + {aid: self.par_env.observation_space(aid) for aid in self._agent_ids} + ) + self._obs_space_in_preferred_format = True + self.action_space = gym.spaces.Dict( + {aid: self.par_env.action_space(aid) for aid in self._agent_ids} + ) + self._action_space_in_preferred_format = True def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): obs, info = self.par_env.reset(seed=seed, options=options) diff --git a/rllib/env/wrappers/tests/test_kaggle_wrapper.py b/rllib/env/wrappers/tests/test_kaggle_wrapper.py deleted file mode 100644 index 2b0ed873f3f59..0000000000000 --- a/rllib/env/wrappers/tests/test_kaggle_wrapper.py +++ /dev/null @@ -1,68 +0,0 @@ -import unittest - - -class TestKaggleFootballMultiAgentEnv(unittest.TestCase): - def test_football_env(self): - from ray.rllib.env.wrappers.kaggle_wrapper import KaggleFootballMultiAgentEnv - - env = KaggleFootballMultiAgentEnv() - obs, info = env.reset() - self.assertEqual(list(obs.keys()), ["agent0", "agent1"]) - action_dict = {"agent0": 0, "agent1": 0} - obs, reward, done, truncated, info = env.step(action_dict) - self.assertEqual(list(obs.keys()), ["agent0", "agent1"]) - self.assertEqual(reward, {"agent0": 0, "agent1": 0}) - self.assertEqual( - done, - { - "agent0": False, - "agent1": False, - "__all__": False, - }, - ) - self.assertEqual(info, {"agent0": {}, "agent1": {}}) - - def test_football_env_run_30_steps(self): - from ray.rllib.env.wrappers.kaggle_wrapper import KaggleFootballMultiAgentEnv - - env = KaggleFootballMultiAgentEnv() - - # use the built-in agents in the kaggle environment - run_right_agent = env.kaggle_env.agents["run_right"] - do_nothing_agent = env.kaggle_env.agents["do_nothing"] - - obs, info = env.reset() - self.assertEqual(list(obs.keys()), ["agent0", "agent1"]) - done = {"__all__": False} - num_steps_completed = 0 - while not done["__all__"] and num_steps_completed <= 30: - action0 = run_right_agent(structify(obs["agent0"]))[0] - action1 = do_nothing_agent(structify(obs["agent1"]))[0] - action_dict = {"agent0": action0, "agent1": action1} - obs, _, done, truncated, _ = env.step(action_dict) - num_steps_completed += 1 - - def test_kaggle_football_agent_spaces(self): - from ray.rllib.env.wrappers.kaggle_wrapper import KaggleFootballMultiAgentEnv - - env = KaggleFootballMultiAgentEnv() - obs, 
info = env.reset() - action_space, obs_space = env.build_agent_spaces() - self.assertTrue(obs_space.contains(obs["agent0"])) - self.assertTrue(obs_space.contains(obs["agent1"])) - - action_dict = { - "agent0": action_space.sample(), - "agent1": action_space.sample(), - } - obs, _, _, _, _ = env.step(action_dict) - self.assertTrue(obs_space.contains(obs["agent0"])) - self.assertTrue(obs_space.contains(obs["agent1"])) - - -if __name__ == "__main__": - from kaggle_environments.utils import structify - import sys - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/env/wrappers/tests/test_recsim_wrapper.py b/rllib/env/wrappers/tests/test_recsim_wrapper.py deleted file mode 100644 index 2aec92c92b24d..0000000000000 --- a/rllib/env/wrappers/tests/test_recsim_wrapper.py +++ /dev/null @@ -1,46 +0,0 @@ -import gymnasium as gym -import unittest - -from ray.rllib.examples.env.recommender_system_envs_with_recsim import ( - InterestEvolutionRecSimEnv, -) -from ray.rllib.env.wrappers.recsim import MultiDiscreteToDiscreteActionWrapper -from ray.rllib.utils.error import UnsupportedSpaceException - - -class TestRecSimWrapper(unittest.TestCase): - def test_observation_space(self): - env = InterestEvolutionRecSimEnv() - obs, info = env.reset() - self.assertTrue( - env.observation_space.contains(obs), - f"{env.observation_space} doesn't contain {obs}", - ) - new_obs, _, _, _, _ = env.step(env.action_space.sample()) - self.assertTrue(env.observation_space.contains(new_obs)) - - def test_action_space_conversion(self): - env = InterestEvolutionRecSimEnv({"convert_to_discrete_action_space": True}) - self.assertIsInstance(env.action_space, gym.spaces.Discrete) - env.reset() - action = env.action_space.sample() - self.assertTrue(env.action_space.contains(action)) - new_obs, _, _, _, _ = env.step(action) - self.assertTrue(env.observation_space.contains(new_obs)) - - def test_bandits_observation_space_conversion(self): - env = InterestEvolutionRecSimEnv({"wrap_for_bandits": True}) - # "item" of observation space is a Box space. 
- self.assertIsInstance(env.observation_space["item"], gym.spaces.Box) - - def test_double_action_space_conversion_raises_exception(self): - env = InterestEvolutionRecSimEnv({"convert_to_discrete_action_space": True}) - with self.assertRaises(UnsupportedSpaceException): - env = MultiDiscreteToDiscreteActionWrapper(env) - - -if __name__ == "__main__": - import sys - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/examples/env/tests/__init__.py b/rllib/examples/_old_api_stack/__init__.py similarity index 100% rename from rllib/examples/env/tests/__init__.py rename to rllib/examples/_old_api_stack/__init__.py diff --git a/rllib/examples/_old_api_stack/attention_net_supervised.py b/rllib/examples/_old_api_stack/attention_net_supervised.py new file mode 100644 index 0000000000000..d5615f8f042fb --- /dev/null +++ b/rllib/examples/_old_api_stack/attention_net_supervised.py @@ -0,0 +1,76 @@ +from gymnasium.spaces import Box, Discrete +import numpy as np + +from rllib.models.tf.attention_net import TrXLNet +from ray.rllib.utils.framework import try_import_tf + +tf1, tf, tfv = try_import_tf() + + +def bit_shift_generator(seq_length, shift, batch_size): + while True: + values = np.array([0.0, 1.0], dtype=np.float32) + seq = np.random.choice(values, (batch_size, seq_length, 1)) + targets = np.squeeze(np.roll(seq, shift, axis=1).astype(np.int32)) + targets[:, :shift] = 0 + yield seq, targets + + +def train_loss(targets, outputs): + loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=targets, logits=outputs + ) + return tf.reduce_mean(loss) + + +def train_bit_shift(seq_length, num_iterations, print_every_n): + + optimizer = tf.keras.optimizers.Adam(1e-3) + + model = TrXLNet( + observation_space=Box(low=0, high=1, shape=(1,), dtype=np.int32), + action_space=Discrete(2), + num_outputs=2, + model_config={"max_seq_len": seq_length}, + name="trxl", + num_transformer_units=1, + attention_dim=10, + num_heads=5, + head_dim=20, + position_wise_mlp_dim=20, + ) + + shift = 10 + train_batch = 10 + test_batch = 100 + data_gen = bit_shift_generator(seq_length, shift=shift, batch_size=train_batch) + test_gen = bit_shift_generator(seq_length, shift=shift, batch_size=test_batch) + + @tf.function + def update_step(inputs, targets): + model_out = model( + {"obs": inputs}, + state=[tf.reshape(inputs, [-1, seq_length, 1])], + seq_lens=np.full(shape=(train_batch,), fill_value=seq_length), + ) + optimizer.minimize( + lambda: train_loss(targets, model_out), lambda: model.trainable_variables + ) + + for i, (inputs, targets) in zip(range(num_iterations), data_gen): + inputs_in = np.reshape(inputs, [-1, 1]) + targets_in = np.reshape(targets, [-1]) + update_step(tf.convert_to_tensor(inputs_in), tf.convert_to_tensor(targets_in)) + + if i % print_every_n == 0: + test_inputs, test_targets = next(test_gen) + print(i, train_loss(test_targets, model(test_inputs))) + + +if __name__ == "__main__": + tf.enable_eager_execution() + train_bit_shift( + seq_length=20, + num_iterations=2000, + print_every_n=200, + ) diff --git a/rllib/examples/_old_api_stack/complex_struct_space.py b/rllib/examples/_old_api_stack/complex_struct_space.py new file mode 100644 index 0000000000000..70125d2ac8d96 --- /dev/null +++ b/rllib/examples/_old_api_stack/complex_struct_space.py @@ -0,0 +1,57 @@ +"""Example of using variable-length Repeated / struct observation spaces. 
+ +This example demonstrates the following: + - using a custom environment with Repeated / struct observations + - using a custom model to view the batched list observations + +For PyTorch / TF eager mode, use the `--framework=[torch|tf2]` flag. +""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.models import ModelCatalog +from ray.rllib.examples.env.simple_rpg import SimpleRPG +from ray.rllib.examples.models.simple_rpg_model import ( + CustomTorchRPGModel, + CustomTFRPGModel, +) + +parser = argparse.ArgumentParser() +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="tf2", + help="The DL framework specifier.", +) + +if __name__ == "__main__": + ray.init() + args = parser.parse_args() + if args.framework == "torch": + ModelCatalog.register_custom_model("my_model", CustomTorchRPGModel) + else: + ModelCatalog.register_custom_model("my_model", CustomTFRPGModel) + + config = ( + PPOConfig() + .environment(SimpleRPG) + .framework(args.framework) + .rollouts(rollout_fragment_length=1, num_rollout_workers=0) + .training(train_batch_size=2, model={"custom_model": "my_model"}) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + stop = { + "timesteps_total": 1, + } + + tuner = tune.Tuner( + "PPO", + param_space=config.to_dict(), + run_config=air.RunConfig(stop=stop, verbose=1), + ) diff --git a/rllib/examples/connectors/v1/adapt_connector_policy.py b/rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py similarity index 98% rename from rllib/examples/connectors/v1/adapt_connector_policy.py rename to rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py index a2a2700f82c7d..b4dcb535b230e 100644 --- a/rllib/examples/connectors/v1/adapt_connector_policy.py +++ b/rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py @@ -11,7 +11,7 @@ from ray.rllib.connectors.connector import ConnectorContext from ray.rllib.connectors.action.lambdas import register_lambda_action_connector from ray.rllib.connectors.agent.lambdas import register_lambda_agent_connector -from ray.rllib.examples.connectors.v1.prepare_checkpoint import ( +from ray.rllib.examples._old_api_stack.connectors.prepare_checkpoint import ( # For demo purpose only. Would normally not need this. create_appo_cartpole_checkpoint, ) diff --git a/rllib/examples/connectors/v1/prepare_checkpoint.py b/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py similarity index 100% rename from rllib/examples/connectors/v1/prepare_checkpoint.py rename to rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py diff --git a/rllib/examples/connectors/v1/run_connector_policy.py b/rllib/examples/_old_api_stack/connectors/run_connector_policy.py similarity index 95% rename from rllib/examples/connectors/v1/run_connector_policy.py rename to rllib/examples/_old_api_stack/connectors/run_connector_policy.py index 5ddc1550578a0..8a84763f99af5 100644 --- a/rllib/examples/connectors/v1/run_connector_policy.py +++ b/rllib/examples/_old_api_stack/connectors/run_connector_policy.py @@ -6,7 +6,7 @@ import os import tempfile -from ray.rllib.examples.connectors.v1.prepare_checkpoint import ( +from ray.rllib.examples._old_api_stack.connectors.prepare_checkpoint import ( # For demo purpose only. Would normally not need this. 
create_appo_cartpole_checkpoint, ) diff --git a/rllib/examples/connectors/v1/self_play_with_policy_checkpoint.py b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py similarity index 98% rename from rllib/examples/connectors/v1/self_play_with_policy_checkpoint.py rename to rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py index a4b1dedfb77de..0c85f0fbcc781 100644 --- a/rllib/examples/connectors/v1/self_play_with_policy_checkpoint.py +++ b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py @@ -14,7 +14,7 @@ from ray.rllib.algorithms.sac import SACConfig from ray.rllib.env.utils import try_import_pyspiel from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv -from ray.rllib.examples.connectors.v1.prepare_checkpoint import ( +from ray.rllib.examples._old_api_stack.connectors.prepare_checkpoint import ( create_open_spiel_checkpoint, ) from ray.rllib.policy.policy import Policy diff --git a/rllib/examples/_old_api_stack/custom_keras_model.py b/rllib/examples/_old_api_stack/custom_keras_model.py new file mode 100644 index 0000000000000..980a12004559a --- /dev/null +++ b/rllib/examples/_old_api_stack/custom_keras_model.py @@ -0,0 +1,153 @@ +"""Example of using a custom ModelV2 Keras-style model.""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.rllib.algorithms.callbacks import DefaultCallbacks +from ray.rllib.algorithms.dqn.dqn import DQNConfig +from ray.rllib.algorithms.dqn.distributional_q_tf_model import DistributionalQTFModel +from ray.rllib.models import ModelCatalog +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.visionnet import VisionNetwork as MyVisionNetwork +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY +from ray.tune.registry import get_trainable_cls + +tf1, tf, tfv = try_import_tf() + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="DQN", help="The RLlib-registered algorithm to use." 
+) +parser.add_argument("--stop", type=int, default=200) +parser.add_argument("--use-vision-network", action="store_true") +parser.add_argument("--num-cpus", type=int, default=0) + + +class MyKerasModel(TFModelV2): + """Custom model for policy gradient algorithms.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + super(MyKerasModel, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + self.inputs = tf.keras.layers.Input(shape=obs_space.shape, name="observations") + layer_1 = tf.keras.layers.Dense( + 128, + name="my_layer1", + activation=tf.nn.relu, + kernel_initializer=normc_initializer(1.0), + )(self.inputs) + layer_out = tf.keras.layers.Dense( + num_outputs, + name="my_out", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(layer_1) + value_out = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(layer_1) + self.base_model = tf.keras.Model(self.inputs, [layer_out, value_out]) + + def forward(self, input_dict, state, seq_lens): + model_out, self._value_out = self.base_model(input_dict["obs"]) + return model_out, state + + def value_function(self): + return tf.reshape(self._value_out, [-1]) + + def metrics(self): + return {"foo": tf.constant(42.0)} + + +class MyKerasQModel(DistributionalQTFModel): + """Custom model for DQN.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name, **kw): + super(MyKerasQModel, self).__init__( + obs_space, action_space, num_outputs, model_config, name, **kw + ) + + # Define the core model layers which are used by the other + # output heads of DistributionalQModel + self.inputs = tf.keras.layers.Input(shape=obs_space.shape, name="observations") + layer_1 = tf.keras.layers.Dense( + 128, + name="my_layer1", + activation=tf.nn.relu, + kernel_initializer=normc_initializer(1.0), + )(self.inputs) + layer_out = tf.keras.layers.Dense( + num_outputs, + name="my_out", + activation=tf.nn.relu, + kernel_initializer=normc_initializer(1.0), + )(layer_1) + self.base_model = tf.keras.Model(self.inputs, layer_out) + + # Implement the core forward method. + def forward(self, input_dict, state, seq_lens): + model_out = self.base_model(input_dict["obs"]) + return model_out, state + + def metrics(self): + return {"foo": tf.constant(42.0)} + + +if __name__ == "__main__": + args = parser.parse_args() + ray.init(num_cpus=args.num_cpus or None) + ModelCatalog.register_custom_model( + "keras_model", MyVisionNetwork if args.use_vision_network else MyKerasModel + ) + ModelCatalog.register_custom_model( + "keras_q_model", MyVisionNetwork if args.use_vision_network else MyKerasQModel + ) + + # Tests https://github.com/ray-project/ray/issues/7293 + class MyCallbacks(DefaultCallbacks): + def on_train_result(self, *, algorithm, result, **kwargs): + r = result["result"]["info"][LEARNER_INFO] + if DEFAULT_POLICY_ID in r: + r = r[DEFAULT_POLICY_ID].get(LEARNER_STATS_KEY, r[DEFAULT_POLICY_ID]) + assert r["model"]["foo"] == 42, result + + config = ( + get_trainable_cls(args.run) + .get_default_config() + .environment("ALE/Breakout-v5" if args.use_vision_network else "CartPole-v1") + .framework("tf") + .callbacks(MyCallbacks) + .training( + model={ + "custom_model": "keras_q_model" if args.run == "DQN" else "keras_model" + } + ) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
+ .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + if args.run == "DQN": + config = ( + DQNConfig() + .update_from_dict(config.to_dict()) + .training(num_steps_sampled_before_learning_starts=0) + ) + + stop = { + "episode_reward_mean": args.stop, + } + + tuner = tune.Tuner( + args.run, + param_space=config, + run_config=air.RunConfig(stop=stop), + ) + tuner.fit() diff --git a/rllib/examples/_old_api_stack/parametric_actions_cartpole.py b/rllib/examples/_old_api_stack/parametric_actions_cartpole.py new file mode 100644 index 0000000000000..3bc4d05bc8451 --- /dev/null +++ b/rllib/examples/_old_api_stack/parametric_actions_cartpole.py @@ -0,0 +1,110 @@ +"""Example of handling variable length and/or parametric action spaces. + +This toy example demonstrates the action-embedding based approach for handling large +discrete action spaces (potentially infinite in size), similar to this example: + + https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/ + +This example works with RLlib's policy gradient style algorithms +(e.g., PG, PPO, IMPALA, A2C) and DQN. + +Note that since the model outputs now include "-inf" tf.float32.min +values, not all algorithm options are supported. For example, +algorithms might crash if they don't properly ignore the -inf action scores. +Working configurations are given below. +""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.rllib.examples.env.parametric_actions_cartpole import ParametricActionsCartPole +from ray.rllib.examples.models.parametric_actions_model import ( + ParametricActionsModel, + TorchParametricActionsModel, +) +from ray.rllib.models import ModelCatalog +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import register_env + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=200, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=150.0, help="Reward at which we stop training." +) + +if __name__ == "__main__": + args = parser.parse_args() + ray.init() + + register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10)) + ModelCatalog.register_custom_model( + "pa_model", + TorchParametricActionsModel + if args.framework == "torch" + else ParametricActionsModel, + ) + + if args.run == "DQN": + cfg = { + # TODO(ekl) we need to set these to prevent the masked values + # from being further processed in DistributionalQModel, which + # would mess up the masking. It is possible to support these if we + # defined a custom DistributionalQModel that is aware of masking. + "hiddens": [], + "dueling": False, + } + else: + cfg = {} + + config = dict( + { + "env": "pa_cartpole", + "model": { + "custom_model": "pa_model", + }, + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
+ "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), + "num_workers": 0, + "framework": args.framework, + }, + **cfg + ) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.Tuner( + args.run, + run_config=air.RunConfig(stop=stop, verbose=1), + param_space=config, + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py b/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py new file mode 100644 index 0000000000000..a6f2f57156837 --- /dev/null +++ b/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py @@ -0,0 +1,98 @@ +"""Example of handling variable length and/or parametric action spaces. + +This is a toy example of the action-embedding based approach for handling large +discrete action spaces (potentially infinite in size), similar to this: + + https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/ + +This currently works with RLlib's policy gradient style algorithms +(e.g., PG, PPO, IMPALA, A2C) and also DQN. + +Note that since the model outputs now include "-inf" tf.float32.min +values, not all algorithm options are supported at the moment. For example, +algorithms might crash if they don't properly ignore the -inf action scores. +Working configurations are given below. +""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.rllib.examples.env.parametric_actions_cartpole import ( + ParametricActionsCartPoleNoEmbeddings, +) +from ray.rllib.examples.models.parametric_actions_model import ( + ParametricActionsModelThatLearnsEmbeddings, +) +from ray.rllib.models import ModelCatalog +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import register_env + +parser = argparse.ArgumentParser() +parser.add_argument("--run", type=str, default="PPO") +parser.add_argument( + "--framework", + choices=["tf", "tf2"], + default="tf", + help="The DL framework specifier (Torch not supported " + "due to the lack of a model).", +) +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) +parser.add_argument("--stop-reward", type=float, default=150.0) +parser.add_argument("--stop-timesteps", type=int, default=100000) + +if __name__ == "__main__": + args = parser.parse_args() + ray.init() + + register_env("pa_cartpole", lambda _: ParametricActionsCartPoleNoEmbeddings(10)) + + ModelCatalog.register_custom_model( + "pa_model", ParametricActionsModelThatLearnsEmbeddings + ) + + if args.run == "DQN": + cfg = { + # TODO(ekl) we need to set these to prevent the masked values + # from being further processed in DistributionalQModel, which + # would mess up the masking. It is possible to support these if we + # defined a custom DistributionalQModel that is aware of masking. + "hiddens": [], + "dueling": False, + } + else: + cfg = {} + + config = dict( + { + "env": "pa_cartpole", + "model": { + "custom_model": "pa_model", + }, + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
+ "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), + "num_workers": 0, + "framework": args.framework, + "action_mask_key": "valid_avail_actions_mask", + }, + **cfg + ) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.Tuner( + args.run, + run_config=air.RunConfig(stop=stop, verbose=2), + param_space=config, + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py new file mode 100644 index 0000000000000..811f00e9034ba --- /dev/null +++ b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py @@ -0,0 +1,147 @@ +""" +This script demonstrates how to specify custom env APIs in +combination with RLlib's `remote_worker_envs` setting, which +parallelizes individual sub-envs within a vector env by making each +one a Ray Actor. + +Access your Env's API with a custom callback as shown below. +""" +import argparse +import gymnasium as gym +import os + +import ray +from ray import air, tune +from ray.rllib.algorithms.callbacks import DefaultCallbacks +from ray.rllib.env.apis.task_settable_env import TaskSettableEnv +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import get_trainable_cls + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument("--num-workers", type=int, default=1) + +# This should be >1, otherwise, remote envs make no sense. +parser.add_argument("--num-envs-per-worker", type=int, default=4) + +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=50, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=180.0, help="Reward at which we stop training." +) +parser.add_argument( + "--local-mode", + action="store_true", + help="Init Ray in local mode for easier debugging.", +) + + +class NonVectorizedEnvToBeVectorizedIntoRemoteBaseEnv(TaskSettableEnv): + """Class for a single sub-env to be vectorized into RemoteBaseEnv. + + If you specify this class directly under the "env" config key, RLlib + auto-wraps. + + Note that you can implement your own custom APIs. The following demonstrates + using RLlib's TaskSettableEnv API, which is a simple sub-class + of gym.Env. 
+ """ + + def __init__(self, config=None): + super().__init__() + self.action_space = gym.spaces.Box(0, 1, shape=(1,)) + self.observation_space = gym.spaces.Box(0, 1, shape=(2,)) + self.task = 1 + + def reset(self, *, seed=None, options=None): + self.steps = 0 + return self.observation_space.sample(), {} + + def step(self, action): + self.steps += 1 + done = truncated = self.steps > 10 + return self.observation_space.sample(), 0, done, truncated, {} + + def set_task(self, task) -> None: + """We can set the task of each sub-env (ray actor)""" + print("Task set to {}".format(task)) + self.task = task + + +class TaskSettingCallback(DefaultCallbacks): + """Custom callback to verify, we can set the task on each remote sub-env.""" + + def on_train_result(self, *, algorithm, result: dict, **kwargs) -> None: + """Curriculum learning as seen in Ray docs""" + if result["episode_reward_mean"] > 0.0: + phase = 0 + else: + phase = 1 + + # Sub-envs are now ray.actor.ActorHandles, so we have to add + # `remote()` here. + algorithm.workers.foreach_env(lambda env: env.set_task.remote(phase)) + + +if __name__ == "__main__": + args = parser.parse_args() + ray.init(num_cpus=6, local_mode=args.local_mode) + + config = ( + get_trainable_cls(args.run) + .get_default_config() + # Specify your custom (single, non-vectorized) env directly as a + # class. This way, RLlib can auto-create Actors from this class + # and handle everything correctly. + .environment(NonVectorizedEnvToBeVectorizedIntoRemoteBaseEnv) + .framework(args.framework) + # Set up our own callbacks. + .callbacks(TaskSettingCallback) + .rollouts( + # Force sub-envs to be ray.actor.ActorHandles, so we can step + # through them in parallel. + remote_worker_envs=True, + # How many RolloutWorkers (each with n environment copies: + # `num_envs_per_worker`)? + num_rollout_workers=args.num_workers, + # This setting should not really matter as it does not affect the + # number of GPUs reserved for each worker. + num_envs_per_worker=args.num_envs_per_worker, + ) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.Tuner( + args.run, + param_space=config, + run_config=air.RunConfig(stop=stop, verbose=1), + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() diff --git a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py new file mode 100644 index 0000000000000..25a0bbeb85eae --- /dev/null +++ b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py @@ -0,0 +1,179 @@ +""" +This script demonstrates how to specify n (vectorized) envs +as Ray remote (actors), such that stepping through these occurs in parallel. +Also, actions for each env step are calculated on the "main" node. 
+ +This behavior can be useful if the "main" node is a GPU machine and you would like to +speed up batched action calculations, similar to DeepMind's SEED +architecture, described here: + +https://ai.googleblog.com/2020/03/massively-scaling-reinforcement.html +""" +import argparse +import os +from typing import Union + +import ray +from ray import air, tune +from ray.rllib.algorithms.ppo import PPO, PPOConfig +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.utils.annotations import override +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.rllib.utils.typing import PartialAlgorithmConfigDict +from ray.tune import PlacementGroupFactory +from ray.tune.logger import pretty_print + + +def get_cli_args(): + """Create CLI parser and return parsed arguments""" + parser = argparse.ArgumentParser() + + # example-specific args + # This should be >1, otherwise, remote envs make no sense. + parser.add_argument("--num-envs-per-worker", type=int, default=4) + + # general args + parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", + ) + parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", + ) + parser.add_argument( + "--stop-iters", type=int, default=50, help="Number of iterations to train." + ) + parser.add_argument( + "--stop-timesteps", + type=int, + default=100000, + help="Number of timesteps to train.", + ) + parser.add_argument( + "--stop-reward", + type=float, + default=150.0, + help="Reward at which we stop training.", + ) + parser.add_argument( + "--no-tune", + action="store_true", + help="Run without Tune using a manual train loop instead. Here," + "there is no TensorBoard support.", + ) + parser.add_argument( + "--local-mode", + action="store_true", + help="Init Ray in local mode for easier debugging.", + ) + + args = parser.parse_args() + print(f"Running with following CLI args: {args}") + return args + + +# The modified Algorithm class we use: +# Subclassing from PPO, our algo only modifies `default_resource_request`, +# telling Ray Tune that it's ok (not mandatory) to place our n remote envs on a +# different node (each env using 1 CPU). +class PPORemoteInference(PPO): + @classmethod + @override(Algorithm) + def default_resource_request( + cls, + config: Union[AlgorithmConfig, PartialAlgorithmConfigDict], + ): + if isinstance(config, AlgorithmConfig): + cf = config + else: + cf = cls.get_default_config().update_from_dict(config) + + # Return PlacementGroupFactory containing all needed resources + # (already properly defined as device bundles). + return PlacementGroupFactory( + bundles=[ + { + # Single CPU for the local worker. This CPU hosts the + # main model in this example (num_workers=0). + "CPU": 1, + # Possibly add n GPUs to this. + "GPU": cf.num_gpus, + }, + { + # Different bundle (meaning: possibly different node) + # for your n "remote" envs (set remote_worker_envs=True). + "CPU": cf.num_envs_per_worker, + }, + ], + strategy=cf.placement_strategy, + ) + + +if __name__ == "__main__": + args = get_cli_args() + + ray.init(num_cpus=6, local_mode=args.local_mode) + + config = ( + PPOConfig() + .environment("CartPole-v1") + .framework(args.framework) + .rollouts( + # Force sub-envs to be ray.actor.ActorHandles, so we can step + # through them in parallel. 
+ remote_worker_envs=True, + num_envs_per_worker=args.num_envs_per_worker, + # Use a single worker (however, with n parallelized remote envs, maybe + # even running on another node). + # Action computations occur on the "main" (GPU?) node, while + # the envs run on one or more CPU node(s). + num_rollout_workers=0, + ) + .resources( + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")), + # Set the number of CPUs used by the (local) worker, aka "driver" + # to match the number of Ray remote envs. + num_cpus_for_local_worker=args.num_envs_per_worker + 1, + ) + ) + + # Run as manual training loop. + if args.no_tune: + # manual training loop using PPO and manually keeping track of state + algo = PPORemoteInference(config=config) + # run manual training loop and print results after each iteration + for _ in range(args.stop_iters): + result = algo.train() + print(pretty_print(result)) + # Stop training if the target train steps or reward are reached. + if ( + result["timesteps_total"] >= args.stop_timesteps + or result["episode_reward_mean"] >= args.stop_reward + ): + break + + # Run with Tune for auto env and algorithm creation and TensorBoard. + else: + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.Tuner( + PPORemoteInference, + param_space=config, + run_config=air.RunConfig(stop=stop, verbose=1), + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py b/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py new file mode 100644 index 0000000000000..7e308ceca30fc --- /dev/null +++ b/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py @@ -0,0 +1,50 @@ +""" +Example script on how to train, save, load, and test an RLlib agent. +Equivalent script with stable baselines: sb2rllib_sb_example.py. +Demonstrates transition from stable_baselines to Ray RLlib. 
+ +Run example: python sb2rllib_rllib_example.py +""" +import gymnasium as gym +from ray import tune, air +import ray.rllib.algorithms.ppo as ppo + +# settings used for both stable baselines and rllib +env_name = "CartPole-v1" +train_steps = 10000 +learning_rate = 1e-3 +save_dir = "saved_models" + +# training and saving +analysis = tune.Tuner( + "PPO", + run_config=air.RunConfig( + stop={"timesteps_total": train_steps}, + local_dir=save_dir, + checkpoint_config=air.CheckpointConfig( + checkpoint_at_end=True, + ), + ), + param_space={"env": env_name, "lr": learning_rate}, +).fit() +# retrieve the checkpoint path +analysis.default_metric = "episode_reward_mean" +analysis.default_mode = "max" +checkpoint_path = analysis.get_best_checkpoint(trial=analysis.get_best_trial()) +print(f"Trained model saved at {checkpoint_path}") + +# load and restore model +agent = ppo.PPO(env=env_name) +agent.restore(checkpoint_path) +print(f"Agent loaded from saved model at {checkpoint_path}") + +# inference +env = gym.make(env_name) +obs, info = env.reset() +for i in range(1000): + action = agent.compute_single_action(obs) + obs, reward, terminated, truncated, info = env.step(action) + env.render() + if terminated or truncated: + print(f"Cart pole ended after {i} steps.") + break diff --git a/rllib/examples/_old_api_stack/sb2rllib_sb_example.py b/rllib/examples/_old_api_stack/sb2rllib_sb_example.py new file mode 100644 index 0000000000000..3812fea5420a6 --- /dev/null +++ b/rllib/examples/_old_api_stack/sb2rllib_sb_example.py @@ -0,0 +1,40 @@ +""" +Example script on how to train, save, load, and test a stable baselines 2 agent. +Code taken and adjusted from SB2 docs: +https://stable-baselines.readthedocs.io/en/master/guide/quickstart.html +Equivalent script with RLlib: sb2rllib_rllib_example.py +""" +import gymnasium as gym + +from stable_baselines.common.policies import MlpPolicy +from stable_baselines import PPO2 + +# settings used for both stable baselines and rllib +env_name = "CartPole-v1" +train_steps = 10000 +learning_rate = 1e-3 +save_dir = "saved_models" + +save_path = f"{save_dir}/sb_model_{train_steps}steps" +env = gym.make(env_name) + +# training and saving +model = PPO2(MlpPolicy, env, learning_rate=learning_rate, verbose=1) +model.learn(total_timesteps=train_steps) +model.save(save_path) +print(f"Trained model saved at {save_path}") + +# delete and load model (just for illustration) +del model +model = PPO2.load(save_path) +print(f"Agent loaded from saved model at {save_path}") + +# inference +obs, info = env.reset() +for i in range(1000): + action, _states = model.predict(obs) + obs, reward, terminated, truncated, info = env.step(action) + env.render() + if terminated or truncated: + print(f"Cart pole ended after {i} steps.") + break diff --git a/rllib/examples/_old_api_stack/two_trainer_workflow.py b/rllib/examples/_old_api_stack/two_trainer_workflow.py new file mode 100644 index 0000000000000..c04bc2e637c45 --- /dev/null +++ b/rllib/examples/_old_api_stack/two_trainer_workflow.py @@ -0,0 +1,219 @@ +"""Example of using a custom training workflow. + +This example creates a number of CartPole agents, some of which are trained with +DQN, and some of which are trained with PPO. Both are executed concurrently +with a custom training workflow. 
+""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.algorithms.dqn.dqn import DQNConfig +from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy +from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy +from ray.rllib.algorithms.ppo.ppo import PPOConfig +from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy +from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.execution.rollout_ops import synchronous_parallel_sample +from ray.rllib.execution.train_ops import train_one_step +from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ( + MultiAgentReplayBuffer, +) +from ray.rllib.examples.env.multi_agent import MultiAgentCartPole +from ray.rllib.policy.sample_batch import MultiAgentBatch, concat_samples +from ray.rllib.utils.annotations import override +from ray.rllib.utils.metrics import ( + NUM_AGENT_STEPS_SAMPLED, + NUM_ENV_STEPS_SAMPLED, + NUM_TARGET_UPDATES, + LAST_TARGET_UPDATE_TS, +) +from ray.rllib.utils.sgd import standardized +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.rllib.utils.typing import ResultDict +from ray.tune.registry import register_env + +parser = argparse.ArgumentParser() +parser.add_argument("--torch", action="store_true") +parser.add_argument("--mixed-torch-tf", action="store_true") +parser.add_argument( + "--local-mode", + action="store_true", + help="Init Ray in local mode for easier debugging.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=600, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=200000, help="Number of timesteps to train." +) +# 600.0 = 4 (num_agents) x 150.0 +parser.add_argument( + "--stop-reward", type=float, default=600.0, help="Reward at which we stop training." +) + + +# Define new Algorithm with custom `training_step()` method (training workflow). +class MyAlgo(Algorithm): + @override(Algorithm) + def setup(self, config): + # Call super's `setup` to create rollout workers. + super().setup(config) + # Create local replay buffer. + self.local_replay_buffer = MultiAgentReplayBuffer(num_shards=1, capacity=50000) + + @override(Algorithm) + def training_step(self) -> ResultDict: + # Generate common experiences, collect batch for PPO, store every (DQN) batch + # into replay buffer. + ppo_batches = [] + num_env_steps = 0 + + # PPO batch size fixed at 200. + # TODO: Use `max_env_steps=200` option of synchronous_parallel_sample instead. + while num_env_steps < 200: + ma_batches = synchronous_parallel_sample( + worker_set=self.workers, concat=False + ) + # Loop through ma-batches (which were collected in parallel). + for ma_batch in ma_batches: + # Update sampled counters. + self._counters[NUM_ENV_STEPS_SAMPLED] += ma_batch.count + self._counters[NUM_AGENT_STEPS_SAMPLED] += ma_batch.agent_steps() + ppo_batch = ma_batch.policy_batches.pop("ppo_policy") + # Add collected batches (only for DQN policy) to replay buffer. + self.local_replay_buffer.add(ma_batch) + + ppo_batches.append(ppo_batch) + num_env_steps += ppo_batch.count + + # DQN sub-flow. 
+ dqn_train_results = {} + # Start updating DQN policy once we have some samples in the buffer. + if self._counters[NUM_ENV_STEPS_SAMPLED] > 1000: + # Update DQN policy n times while updating PPO policy once. + for _ in range(10): + dqn_train_batch = self.local_replay_buffer.sample(num_items=64) + dqn_train_results = train_one_step( + self, dqn_train_batch, ["dqn_policy"] + ) + self._counters[ + "agent_steps_trained_DQN" + ] += dqn_train_batch.agent_steps() + print( + "DQN policy learning on samples from", + "agent steps trained", + dqn_train_batch.agent_steps(), + ) + # Update DQN's target net every n train steps (determined by the DQN config). + if ( + self._counters["agent_steps_trained_DQN"] + - self._counters[LAST_TARGET_UPDATE_TS] + >= self.get_policy("dqn_policy").config["target_network_update_freq"] + ): + self.workers.local_worker().get_policy("dqn_policy").update_target() + self._counters[NUM_TARGET_UPDATES] += 1 + self._counters[LAST_TARGET_UPDATE_TS] = self._counters[ + "agent_steps_trained_DQN" + ] + + # PPO sub-flow. + ppo_train_batch = concat_samples(ppo_batches) + self._counters["agent_steps_trained_PPO"] += ppo_train_batch.agent_steps() + # Standardize advantages. + ppo_train_batch[Postprocessing.ADVANTAGES] = standardized( + ppo_train_batch[Postprocessing.ADVANTAGES] + ) + print( + "PPO policy learning on samples from", + "agent steps trained", + ppo_train_batch.agent_steps(), + ) + ppo_train_batch = MultiAgentBatch( + {"ppo_policy": ppo_train_batch}, ppo_train_batch.count + ) + ppo_train_results = train_one_step(self, ppo_train_batch, ["ppo_policy"]) + + # Combine results for PPO and DQN into one results dict. + results = dict(ppo_train_results, **dqn_train_results) + return results + + +if __name__ == "__main__": + args = parser.parse_args() + assert not ( + args.torch and args.mixed_torch_tf + ), "Use either --torch or --mixed-torch-tf, not both!" + + ray.init(local_mode=args.local_mode) + + # Simple environment with 4 independent cartpole entities + register_env( + "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 4}) + ) + + # Note that since the algorithm below does not include a default policy or + # policy configs, we have to explicitly set it in the multiagent config: + policies = { + "ppo_policy": ( + PPOTorchPolicy if args.torch or args.mixed_torch_tf else PPOTF1Policy, + None, + None, + # Provide entire AlgorithmConfig object, not just an override. + PPOConfig() + .training(num_sgd_iter=10, sgd_minibatch_size=128) + .framework("torch" if args.torch or args.mixed_torch_tf else "tf"), + ), + "dqn_policy": ( + DQNTorchPolicy if args.torch else DQNTFPolicy, + None, + None, + # Provide entire AlgorithmConfig object, not just an override. + DQNConfig().training(target_network_update_freq=500).framework("tf"), + ), + } + + def policy_mapping_fn(agent_id, episode, worker, **kwargs): + if agent_id % 2 == 0: + return "ppo_policy" + else: + return "dqn_policy" + + config = ( + AlgorithmConfig() + # TODO (Kourosh): Migrate this to the new RLModule / Learner API. + .experimental(_enable_new_api_stack=False) + .environment("multi_agent_cartpole") + .framework("torch" if args.torch else "tf") + .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) + .rollouts(num_rollout_workers=0, rollout_fragment_length=50) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
+ .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + .reporting(metrics_num_episodes_for_smoothing=30) + ) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.Tuner( + MyAlgo, param_space=config.to_dict(), run_config=air.RunConfig(stop=stop) + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/examples/attention_net_supervised.py b/rllib/examples/attention_net_supervised.py index d5615f8f042fb..c2895ed821f30 100644 --- a/rllib/examples/attention_net_supervised.py +++ b/rllib/examples/attention_net_supervised.py @@ -1,76 +1,6 @@ -from gymnasium.spaces import Box, Discrete -import numpy as np +msg = """ +This script has been moved to +`ray.rllib.examples._old_api_stack.attention_net_supervised.py` +""" -from rllib.models.tf.attention_net import TrXLNet -from ray.rllib.utils.framework import try_import_tf - -tf1, tf, tfv = try_import_tf() - - -def bit_shift_generator(seq_length, shift, batch_size): - while True: - values = np.array([0.0, 1.0], dtype=np.float32) - seq = np.random.choice(values, (batch_size, seq_length, 1)) - targets = np.squeeze(np.roll(seq, shift, axis=1).astype(np.int32)) - targets[:, :shift] = 0 - yield seq, targets - - -def train_loss(targets, outputs): - loss = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=targets, logits=outputs - ) - return tf.reduce_mean(loss) - - -def train_bit_shift(seq_length, num_iterations, print_every_n): - - optimizer = tf.keras.optimizers.Adam(1e-3) - - model = TrXLNet( - observation_space=Box(low=0, high=1, shape=(1,), dtype=np.int32), - action_space=Discrete(2), - num_outputs=2, - model_config={"max_seq_len": seq_length}, - name="trxl", - num_transformer_units=1, - attention_dim=10, - num_heads=5, - head_dim=20, - position_wise_mlp_dim=20, - ) - - shift = 10 - train_batch = 10 - test_batch = 100 - data_gen = bit_shift_generator(seq_length, shift=shift, batch_size=train_batch) - test_gen = bit_shift_generator(seq_length, shift=shift, batch_size=test_batch) - - @tf.function - def update_step(inputs, targets): - model_out = model( - {"obs": inputs}, - state=[tf.reshape(inputs, [-1, seq_length, 1])], - seq_lens=np.full(shape=(train_batch,), fill_value=seq_length), - ) - optimizer.minimize( - lambda: train_loss(targets, model_out), lambda: model.trainable_variables - ) - - for i, (inputs, targets) in zip(range(num_iterations), data_gen): - inputs_in = np.reshape(inputs, [-1, 1]) - targets_in = np.reshape(targets, [-1]) - update_step(tf.convert_to_tensor(inputs_in), tf.convert_to_tensor(targets_in)) - - if i % print_every_n == 0: - test_inputs, test_targets = next(test_gen) - print(i, train_loss(test_targets, model(test_inputs))) - - -if __name__ == "__main__": - tf.enable_eager_execution() - train_bit_shift( - seq_length=20, - num_iterations=2000, - print_every_n=200, - ) +raise NotImplementedError(msg) diff --git a/rllib/examples/centralized_critic.py b/rllib/examples/centralized_critic.py index 0637611eeaf53..3077efedef480 100644 --- a/rllib/examples/centralized_critic.py +++ b/rllib/examples/centralized_critic.py @@ -1,3 +1,8 @@ +# *********************************************************************************** +# IMPORTANT NOTE: This script is using the old API stack and will soon be replaced by +# `ray.rllib.examples.multi_agent_and_self_play.pettingzoo_shared_value_function.py`! 
+# *********************************************************************************** + """An example of customizing PPO to leverage a centralized critic. Here the model and policy are hard-coded to implement a centralized critic @@ -14,8 +19,8 @@ """ import argparse -import numpy as np from gymnasium.spaces import Discrete +import numpy as np import os import ray diff --git a/rllib/examples/centralized_critic_2.py b/rllib/examples/centralized_critic_2.py index adfb206ff81f7..5638b8a0179eb 100644 --- a/rllib/examples/centralized_critic_2.py +++ b/rllib/examples/centralized_critic_2.py @@ -1,3 +1,9 @@ +# *********************************************************************************** +# IMPORTANT NOTE: This script is using the old API stack and will soon be replaced by +# `ray.rllib.examples.multi_agent_and_self_play.pettingzoo_shared_value_function.py`! +# *********************************************************************************** + + """An example of implementing a centralized critic with ObservationFunction. The advantage of this approach is that it's very simple and you don't have to diff --git a/rllib/examples/complex_struct_space.py b/rllib/examples/complex_struct_space.py index 40022040cd020..4c2487e8c41c3 100644 --- a/rllib/examples/complex_struct_space.py +++ b/rllib/examples/complex_struct_space.py @@ -1,57 +1,6 @@ -"""Example of using variable-length Repeated / struct observation spaces. - -This example shows: - - using a custom environment with Repeated / struct observations - - using a custom model to view the batched list observations - -For PyTorch / TF eager mode, use the `--framework=[torch|tf2]` flag. +msg = """ +This script has been moved to +`ray.rllib.examples._old_api_stack.complex_struct_space.py` """ -import argparse -import os - -import ray -from ray import air, tune -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.models import ModelCatalog -from ray.rllib.examples.env.simple_rpg import SimpleRPG -from ray.rllib.examples.models.simple_rpg_model import ( - CustomTorchRPGModel, - CustomTFRPGModel, -) - -parser = argparse.ArgumentParser() -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="tf2", - help="The DL framework specifier.", -) - -if __name__ == "__main__": - ray.init() - args = parser.parse_args() - if args.framework == "torch": - ModelCatalog.register_custom_model("my_model", CustomTorchRPGModel) - else: - ModelCatalog.register_custom_model("my_model", CustomTFRPGModel) - - config = ( - PPOConfig() - .environment(SimpleRPG) - .framework(args.framework) - .rollouts(rollout_fragment_length=1, num_rollout_workers=0) - .training(train_batch_size=2, model={"custom_model": "my_model"}) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - stop = { - "timesteps_total": 1, - } - - tuner = tune.Tuner( - "PPO", - param_space=config.to_dict(), - run_config=air.RunConfig(stop=stop, verbose=1), - ) +raise NotImplementedError(msg) diff --git a/rllib/examples/connectors/connector_v2_frame_stacking.py b/rllib/examples/connectors/connector_v2_frame_stacking.py index a57b5343f6b3a..2b49b3eda79b3 100644 --- a/rllib/examples/connectors/connector_v2_frame_stacking.py +++ b/rllib/examples/connectors/connector_v2_frame_stacking.py @@ -2,8 +2,6 @@ from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack from ray.rllib.examples.env.multi_agent import make_multi_agent from ray.rllib.utils.test_utils import ( @@ -87,12 +85,9 @@ def _env_creator(cfg): else: tune.register_env("env", _env_creator) - config = ( + base_config = ( get_trainable_cls(args.algo) .get_default_config() - # Use new API stack ... - .experimental(_enable_new_api_stack=args.enable_new_api_stack) - .framework(args.framework) .environment( "env", env_config={ @@ -110,23 +105,7 @@ def _env_creator(cfg): if args.use_gym_wrapper_framestacking else _make_env_to_module_connector ), - num_rollout_workers=args.num_env_runners, num_envs_per_worker=1 if args.num_agents > 0 else 2, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None - if not args.enable_new_api_stack - else SingleAgentEnvRunner - if args.num_agents == 0 - else MultiAgentEnvRunner - ), - ) - .resources( - num_gpus=args.num_gpus, # old stack - num_learner_workers=args.num_gpus, # new stack - num_gpus_per_learner_worker=1 if args.num_gpus else 0, - num_cpus_for_local_worker=1, ) .training( # Use our frame stacking learner connector. @@ -157,10 +136,10 @@ def _env_creator(cfg): # Add a simple multi-agent setup. if args.num_agents > 0: - config.multi_agent( + base_config.multi_agent( policies={f"p{i}" for i in range(args.num_agents)}, policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) # Run everything as configured. 
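# A minimal sketch (illustration only) of the shared-helper pattern that the
# refactored example scripts in this PR rely on: `add_rllib_example_script_args`
# returns an argparse parser with the common options used above (args.algo,
# args.num_agents, args.num_gpus, args.enable_new_api_stack, ...), and
# `run_rllib_example_script_experiment` runs the experiment from a base
# AlgorithmConfig plus those parsed args. The default values below are
# placeholders only.
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)

parser = add_rllib_example_script_args(default_timesteps=200_000, default_reward=450.0)

if __name__ == "__main__":
    args = parser.parse_args()
    # Build an AlgorithmConfig here (like `base_config` above), then hand it,
    # together with the parsed args, to the shared runner:
    # run_rllib_example_script_experiment(base_config, args)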
- run_rllib_example_script_experiment(config, args) + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/connectors/connector_v2_mean_std_filtering.py b/rllib/examples/connectors/connector_v2_mean_std_filtering.py index c657500ca82a0..db80be7ec72a8 100644 --- a/rllib/examples/connectors/connector_v2_mean_std_filtering.py +++ b/rllib/examples/connectors/connector_v2_mean_std_filtering.py @@ -1,6 +1,4 @@ from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.examples.env.multi_agent import MultiAgentPendulum from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.test_utils import ( @@ -28,23 +26,11 @@ lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), ) - config = ( + base_config = ( get_trainable_cls(args.algo) .get_default_config() - .framework(args.framework) .environment("env" if args.num_agents > 0 else "Pendulum-v1") - .experimental(_enable_new_api_stack=args.enable_new_api_stack) .rollouts( - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None - if not args.enable_new_api_stack - else SingleAgentEnvRunner - if args.num_agents == 0 - else MultiAgentEnvRunner - ), - num_rollout_workers=args.num_env_runners, # TODO (sven): MAEnvRunner does not support vectorized envs yet # due to gym's env checkers and non-compatability with RLlib's # MultiAgentEnv API. @@ -58,11 +44,6 @@ lambda env: MeanStdFilter(multi_agent=args.num_agents > 0) ), ) - .resources( - num_learner_workers=args.num_gpus, - num_gpus_per_learner_worker=1 if args.num_gpus else 0, - num_cpus_for_local_worker=1, - ) .training( train_batch_size_per_learner=512, mini_batch_size_per_learner=64, @@ -94,9 +75,9 @@ # Add a simple multi-agent setup. if args.num_agents > 0: - config.multi_agent( + base_config.multi_agent( policies={f"p{i}" for i in range(args.num_agents)}, policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) - run_rllib_example_script_experiment(config, args) + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/connectors/connector_v2_nested_action_spaces.py b/rllib/examples/connectors/connector_v2_nested_action_spaces.py new file mode 100644 index 0000000000000..0224ac9583fd3 --- /dev/null +++ b/rllib/examples/connectors/connector_v2_nested_action_spaces.py @@ -0,0 +1,94 @@ +from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete + +from ray.tune.registry import register_env +from ray.rllib.connectors.env_to_module import ( + AddObservationsFromEpisodesToBatch, + FlattenObservations, + WriteObservationsToEpisodes, +) +from ray.rllib.examples.env.multi_agent import MultiAgentNestedSpaceRepeatAfterMeEnv +from ray.rllib.examples.env.nested_space_repeat_after_me_env import ( + NestedSpaceRepeatAfterMeEnv, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Define env-to-module-connector pipeline for the new stack. 
+ def _env_to_module_pipeline(env): + return [ + AddObservationsFromEpisodesToBatch(), + FlattenObservations(multi_agent=args.num_agents > 0), + WriteObservationsToEpisodes(), + ] + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv( + config=dict(c, **{"num_agents": args.num_agents}) + ), + ) + else: + register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c)) + + # Define the AlgorithmConfig used. + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + "env", + env_config={ + "space": Dict( + { + "a": Tuple( + [Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})] + ), + "b": Box(-10.0, 10.0, (2,)), + "c": MultiDiscrete([3, 3]), + "d": Discrete(2), + } + ), + "episode_len": 100, + }, + ) + .rollouts(env_to_module_connector=_env_to_module_pipeline) + # No history in Env (bandit problem). + .training( + gamma=0.0, + lr=0.0005, + model=( + {} if not args.enable_new_api_stack else {"uses_new_env_runners": True} + ), + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Fix some PPO-specific settings. + if args.algo == "PPO": + base_config.training( + # We don't want high entropy in this Env. + entropy_coeff=0.00005, + num_sgd_iter=4, + vf_loss_coeff=0.01, + ) + + # Run everything as configured. + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/connectors/connector_v2_nested_observation_spaces.py b/rllib/examples/connectors/connector_v2_nested_observation_spaces.py index c52e41bc26435..71578ed3323d2 100644 --- a/rllib/examples/connectors/connector_v2_nested_observation_spaces.py +++ b/rllib/examples/connectors/connector_v2_nested_observation_spaces.py @@ -4,8 +4,6 @@ FlattenObservations, WriteObservationsToEpisodes, ) -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.examples.env.cartpole_with_dict_observation_space import ( CartPoleWithDictObservationSpace, ) @@ -46,32 +44,11 @@ def _env_to_module_pipeline(env): register_env("env", lambda _: CartPoleWithDictObservationSpace()) # Define the AlgorithmConfig used. - config = ( + base_config = ( get_trainable_cls(args.algo) .get_default_config() - # Use new API stack for PPO only. - .experimental(_enable_new_api_stack=args.enable_new_api_stack) .environment("env") - .framework(args.framework) - .resources( - num_gpus=args.num_gpus, # old stack - num_learner_workers=args.num_gpus, # new stack - num_gpus_per_learner_worker=1 if args.num_gpus else 0, - num_cpus_for_local_worker=1, - ) - .rollouts( - env_to_module_connector=_env_to_module_pipeline, - num_rollout_workers=args.num_env_runners, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None - if not args.enable_new_api_stack - else SingleAgentEnvRunner - if args.num_agents == 0 - else MultiAgentEnvRunner - ), - ) + .rollouts(env_to_module_connector=_env_to_module_pipeline) .training( gamma=0.99, lr=0.0003, @@ -92,17 +69,17 @@ def _env_to_module_pipeline(env): # Add a simple multi-agent setup. 
if args.num_agents > 0: - config.multi_agent( + base_config.multi_agent( policies={f"p{i}" for i in range(args.num_agents)}, policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) # Fix some PPO-specific settings. if args.algo == "PPO": - config.training( + base_config.training( num_sgd_iter=6, vf_loss_coeff=0.01, ) # Run everything as configured. - run_rllib_example_script_experiment(config, args) + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/connectors/connector_v2_prev_actions_prev_rewards.py b/rllib/examples/connectors/connector_v2_prev_actions_prev_rewards.py index 6bb024f713a83..a340418d88bde 100644 --- a/rllib/examples/connectors/connector_v2_prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/connector_v2_prev_actions_prev_rewards.py @@ -7,8 +7,6 @@ PrevActionsPrevRewardsConnector, WriteObservationsToEpisodes, ) -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole from ray.rllib.examples.env.multi_agent import MultiAgentStatelessCartPole from ray.rllib.utils.framework import try_import_torch @@ -56,30 +54,10 @@ def _env_to_module(env): else: register_env("env", lambda _: StatelessCartPole()) - config = ( + base_config = ( PPOConfig() - # Use new API stack. - .experimental(_enable_new_api_stack=args.enable_new_api_stack) .environment("env") - # And new EnvRunner. - .rollouts( - env_to_module_connector=_env_to_module, - num_rollout_workers=args.num_env_runners, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None - if not args.enable_new_api_stack - else SingleAgentEnvRunner - if args.num_agents == 0 - else MultiAgentEnvRunner - ), - ) - .resources( - num_learner_workers=args.num_gpus, - num_gpus_per_learner_worker=1 if args.num_gpus else 0, - num_cpus_for_local_worker=1, - ) + .rollouts(env_to_module_connector=_env_to_module) .training( num_sgd_iter=6, lr=0.0003, @@ -106,9 +84,9 @@ def _env_to_module(env): # Add a simple multi-agent setup. if args.num_agents > 0: - config.multi_agent( + base_config.multi_agent( policies={f"p{i}" for i in range(args.num_agents)}, policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) - run_rllib_example_script_experiment(config, args) + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/custom_eval.py b/rllib/examples/custom_eval.py index 0b64343d7cf1e..6b726663bf52a 100644 --- a/rllib/examples/custom_eval.py +++ b/rllib/examples/custom_eval.py @@ -1,205 +1,6 @@ -"""Example of customizing evaluation with RLlib. - -Pass --custom-eval to run with a custom evaluation function too. - -Here we define a custom evaluation method that runs a specific sweep of env -parameters (SimpleCorridor corridor lengths). - ------------------------------------------------------------------------- -Sample output for `python custom_eval.py` ------------------------------------------------------------------------- - -INFO algorithm.py:623 -- Evaluating current policy for 10 episodes. 
-INFO algorithm.py:650 -- Running round 0 of parallel evaluation (2/10 episodes) -INFO algorithm.py:650 -- Running round 1 of parallel evaluation (4/10 episodes) -INFO algorithm.py:650 -- Running round 2 of parallel evaluation (6/10 episodes) -INFO algorithm.py:650 -- Running round 3 of parallel evaluation (8/10 episodes) -INFO algorithm.py:650 -- Running round 4 of parallel evaluation (10/10 episodes) - -Result for PG_SimpleCorridor_2c6b27dc: - ... - evaluation: - custom_metrics: {} - episode_len_mean: 15.864661654135338 - episode_reward_max: 1.0 - episode_reward_mean: 0.49624060150375937 - episode_reward_min: 0.0 - episodes_this_iter: 133 - off_policy_estimator: {} - policy_reward_max: {} - policy_reward_mean: {} - policy_reward_min: {} - sampler_perf: - mean_env_wait_ms: 0.0362923321333299 - mean_inference_ms: 0.6319202064080927 - mean_processing_ms: 0.14143652169068222 - ------------------------------------------------------------------------- -Sample output for `python custom_eval.py --custom-eval` ------------------------------------------------------------------------- - -INFO algorithm.py:631 -- Running custom eval function -Update corridor length to 4 -Update corridor length to 7 -Custom evaluation round 1 -Custom evaluation round 2 -Custom evaluation round 3 -Custom evaluation round 4 - -Result for PG_SimpleCorridor_0de4e686: - ... - evaluation: - custom_metrics: {} - episode_len_mean: 9.15695067264574 - episode_reward_max: 1.0 - episode_reward_mean: 0.9596412556053812 - episode_reward_min: 0.0 - episodes_this_iter: 223 - foo: 1 - off_policy_estimator: {} - policy_reward_max: {} - policy_reward_mean: {} - policy_reward_min: {} - sampler_perf: - mean_env_wait_ms: 0.03423667269562796 - mean_inference_ms: 0.5654563161491506 - mean_processing_ms: 0.14494765630060774 +msg = """ +This script has been moved to +`ray.rllib.examples.evaluation.custom_evaluation.py` """ -import argparse -import os - -import ray -from ray import air, tune -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes -from ray.rllib.examples.env.simple_corridor import SimpleCorridor -from ray.rllib.utils.test_utils import check_learning_achieved - -parser = argparse.ArgumentParser() -parser.add_argument("--evaluation-parallel-to-training", action="store_true") -parser.add_argument("--num-cpus", type=int, default=0) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument("--no-custom-eval", action="store_true") -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=50, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=20000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=0.7, help="Reward at which we stop training." -) -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", -) - - -def custom_eval_function(algorithm, eval_workers): - """Example of a custom evaluation function. - - Args: - algorithm: Algorithm class to evaluate. - eval_workers: Evaluation WorkerSet. - - Returns: - metrics: Evaluation metrics dict. - """ - # Set different env settings for each worker. 
Here we use the worker's - # `worker_index` property. - eval_workers.foreach_worker( - func=lambda w: w.foreach_env( - lambda env: env.set_corridor_length(4 if w.worker_index == 1 else 7) - ) - ) - - for i in range(5): - print("Custom evaluation round", i) - # Calling .sample() runs exactly one episode per worker due to how the - # eval workers are configured. - eval_workers.foreach_worker(func=lambda w: w.sample(), local_worker=False) - - # Collect the accumulated episodes on the workers, and then summarize the - # episode stats into a metrics dict. - episodes = collect_episodes(workers=eval_workers, timeout_seconds=99999) - # You can compute metrics from the episodes manually, or use the - # convenient `summarize_episodes()` utility: - metrics = summarize_episodes(episodes) - - # You can also put custom values in the metrics dict. - metrics["foo"] = 1 - return metrics - - -if __name__ == "__main__": - args = parser.parse_args() - - if args.no_custom_eval: - eval_fn = None - else: - eval_fn = custom_eval_function - - ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) - - config = ( - PPOConfig() - .environment(SimpleCorridor, env_config={"corridor_length": 10}) - # Training rollouts will be collected using just the learner - # process, but evaluation will be done in parallel with two - # workers. Hence, this run will use 3 CPUs total (1 for the - # learner + 2 more for evaluation workers). - .rollouts(num_rollout_workers=0) - .evaluation( - evaluation_num_workers=2, - # Enable evaluation, once per training iteration. - evaluation_interval=1, - # Run 10 episodes each time evaluation runs (OR "auto" if parallel to - # training). - evaluation_duration="auto" if args.evaluation_parallel_to_training else 10, - # Evaluate parallelly to training. - evaluation_parallel_to_training=args.evaluation_parallel_to_training, - evaluation_config=PPOConfig.overrides( - env_config={ - # Evaluate using LONGER corridor than trained on. - "corridor_length": 5, - }, - ), - custom_evaluation_function=eval_fn, - ) - .framework(args.framework) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - tuner = tune.Tuner( - "PPO", - param_space=config.to_dict(), - run_config=air.RunConfig(stop=stop, verbose=1), - ) - results = tuner.fit() - - # Check eval results (from eval workers using the custom function), - # not results from the regular workers. 
- if args.as_test: - check_learning_achieved(results, args.stop_reward) - - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/examples/custom_keras_model.py b/rllib/examples/custom_keras_model.py index 9f24742fe2f8b..89a09ef15e8fe 100644 --- a/rllib/examples/custom_keras_model.py +++ b/rllib/examples/custom_keras_model.py @@ -1,153 +1,6 @@ -"""Example of using a custom ModelV2 Keras-style model.""" +msg = """ +This script has been moved to +`ray.rllib.examples._old_api_stack.custom_keras_model.py` +""" -import argparse -import os - -import ray -from ray import air, tune -from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.algorithms.dqn.dqn import DQNConfig -from ray.rllib.algorithms.dqn.distributional_q_tf_model import DistributionalQTFModel -from ray.rllib.models import ModelCatalog -from ray.rllib.models.tf.misc import normc_initializer -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.models.tf.visionnet import VisionNetwork as MyVisionNetwork -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY -from ray.tune.registry import get_trainable_cls - -tf1, tf, tfv = try_import_tf() - -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="DQN", help="The RLlib-registered algorithm to use." -) -parser.add_argument("--stop", type=int, default=200) -parser.add_argument("--use-vision-network", action="store_true") -parser.add_argument("--num-cpus", type=int, default=0) - - -class MyKerasModel(TFModelV2): - """Custom model for policy gradient algorithms.""" - - def __init__(self, obs_space, action_space, num_outputs, model_config, name): - super(MyKerasModel, self).__init__( - obs_space, action_space, num_outputs, model_config, name - ) - self.inputs = tf.keras.layers.Input(shape=obs_space.shape, name="observations") - layer_1 = tf.keras.layers.Dense( - 128, - name="my_layer1", - activation=tf.nn.relu, - kernel_initializer=normc_initializer(1.0), - )(self.inputs) - layer_out = tf.keras.layers.Dense( - num_outputs, - name="my_out", - activation=None, - kernel_initializer=normc_initializer(0.01), - )(layer_1) - value_out = tf.keras.layers.Dense( - 1, - name="value_out", - activation=None, - kernel_initializer=normc_initializer(0.01), - )(layer_1) - self.base_model = tf.keras.Model(self.inputs, [layer_out, value_out]) - - def forward(self, input_dict, state, seq_lens): - model_out, self._value_out = self.base_model(input_dict["obs"]) - return model_out, state - - def value_function(self): - return tf.reshape(self._value_out, [-1]) - - def metrics(self): - return {"foo": tf.constant(42.0)} - - -class MyKerasQModel(DistributionalQTFModel): - """Custom model for DQN.""" - - def __init__(self, obs_space, action_space, num_outputs, model_config, name, **kw): - super(MyKerasQModel, self).__init__( - obs_space, action_space, num_outputs, model_config, name, **kw - ) - - # Define the core model layers which will be used by the other - # output heads of DistributionalQModel - self.inputs = tf.keras.layers.Input(shape=obs_space.shape, name="observations") - layer_1 = tf.keras.layers.Dense( - 128, - name="my_layer1", - activation=tf.nn.relu, - kernel_initializer=normc_initializer(1.0), - )(self.inputs) - layer_out = tf.keras.layers.Dense( - num_outputs, - name="my_out", - activation=tf.nn.relu, - kernel_initializer=normc_initializer(1.0), - )(layer_1) - self.base_model = 
tf.keras.Model(self.inputs, layer_out) - - # Implement the core forward method. - def forward(self, input_dict, state, seq_lens): - model_out = self.base_model(input_dict["obs"]) - return model_out, state - - def metrics(self): - return {"foo": tf.constant(42.0)} - - -if __name__ == "__main__": - args = parser.parse_args() - ray.init(num_cpus=args.num_cpus or None) - ModelCatalog.register_custom_model( - "keras_model", MyVisionNetwork if args.use_vision_network else MyKerasModel - ) - ModelCatalog.register_custom_model( - "keras_q_model", MyVisionNetwork if args.use_vision_network else MyKerasQModel - ) - - # Tests https://github.com/ray-project/ray/issues/7293 - class MyCallbacks(DefaultCallbacks): - def on_train_result(self, algorithm, result, **kwargs): - r = result["result"]["info"][LEARNER_INFO] - if DEFAULT_POLICY_ID in r: - r = r[DEFAULT_POLICY_ID].get(LEARNER_STATS_KEY, r[DEFAULT_POLICY_ID]) - assert r["model"]["foo"] == 42, result - - config = ( - get_trainable_cls(args.run) - .get_default_config() - .environment("ALE/Breakout-v5" if args.use_vision_network else "CartPole-v1") - .framework("tf") - .callbacks(MyCallbacks) - .training( - model={ - "custom_model": "keras_q_model" if args.run == "DQN" else "keras_model" - } - ) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - if args.run == "DQN": - config = ( - DQNConfig() - .update_from_dict(config.to_dict()) - .training(num_steps_sampled_before_learning_starts=0) - ) - - stop = { - "episode_reward_mean": args.stop, - } - - tuner = tune.Tuner( - args.run, - param_space=config, - run_config=air.RunConfig(stop=stop), - ) - tuner.fit() +raise NotImplementedError(msg) diff --git a/rllib/examples/env/simple_corridor.py b/rllib/examples/env/simple_corridor.py index b0a7a8df5285e..9088f73dbd374 100644 --- a/rllib/examples/env/simple_corridor.py +++ b/rllib/examples/env/simple_corridor.py @@ -10,24 +10,33 @@ class SimpleCorridor(gym.Env): def __init__(self, config=None): config = config or {} - self.end_pos = config.get("corridor_length", 10) - self.cur_pos = 0 + self.action_space = Discrete(2) self.observation_space = Box(0.0, 999.0, shape=(1,), dtype=np.float32) + self.set_corridor_length(config.get("corridor_length", 10)) + + self._cur_pos = 0 + def set_corridor_length(self, length): self.end_pos = length - print("Updated corridor length to {}".format(length)) + print(f"Set corridor length to {self.end_pos}") + assert self.end_pos <= 999, "The maximum `corridor_length` allowed is 999!" 
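# A quick usage sketch (illustration only) for the updated SimpleCorridor above.
# It exercises the new (obs, reward, terminated, truncated, info) step signature
# and the new reward scheme (-0.01 per step, +1.0 on reaching the goal). The
# import path matches the one used by the other examples in this PR.
from ray.rllib.examples.env.simple_corridor import SimpleCorridor

env = SimpleCorridor({"corridor_length": 5})
obs, info = env.reset()
terminated = truncated = False
episode_return = 0.0
while not (terminated or truncated):
    # Always walk right (action=1) toward the goal at `end_pos`.
    obs, reward, terminated, truncated, info = env.step(1)
    episode_return += reward
print("Episode return:", episode_return)  # Expect 4 * -0.01 + 1.0 = 0.96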
def reset(self, *, seed=None, options=None): - self.cur_pos = 0.0 - return [self.cur_pos], {} + self._cur_pos = 0.0 + return self._get_obs(), {} def step(self, action): assert action in [0, 1], action - if action == 0 and self.cur_pos > 0: - self.cur_pos -= 1.0 + if action == 0 and self._cur_pos > 0: + self._cur_pos -= 1.0 elif action == 1: - self.cur_pos += 1.0 - done = truncated = self.cur_pos >= self.end_pos - return [self.cur_pos], 1 if done else 0, done, truncated, {} + self._cur_pos += 1.0 + terminated = self._cur_pos >= self.end_pos + truncated = False + reward = 1.0 if terminated else -0.01 + return self._get_obs(), reward, terminated, truncated, {} + + def _get_obs(self): + return np.array([self._cur_pos], np.float32) diff --git a/rllib/examples/env/tests/test_cliff_walking_wall_env.py b/rllib/examples/env/tests/test_cliff_walking_wall_env.py deleted file mode 100644 index 59105f4015397..0000000000000 --- a/rllib/examples/env/tests/test_cliff_walking_wall_env.py +++ /dev/null @@ -1,61 +0,0 @@ -from ray.rllib.examples.env.cliff_walking_wall_env import ( - CliffWalkingWallEnv, - ACTION_UP, - ACTION_RIGHT, - ACTION_DOWN, - ACTION_LEFT, -) - -import unittest - - -class TestCliffWalkingWallEnv(unittest.TestCase): - def test_env(self): - env = CliffWalkingWallEnv() - obs, info = env.reset() - # Starting position - self.assertEqual(obs, 36) - # Left, Right, and Down are no-ops - obs, _, _, _, _ = env.step(ACTION_LEFT) - self.assertEqual(obs, 36) - obs, _, _, _, _ = env.step(ACTION_DOWN) - self.assertEqual(obs, 36) - obs, _, _, _, _ = env.step(ACTION_RIGHT) - self.assertEqual(obs, 36) - - # Up and Down returns to starting position - obs, _, _, _, _ = env.step(ACTION_UP) - self.assertEqual(obs, 24) - obs, _, _, _, _ = env.step(ACTION_DOWN) - self.assertEqual(obs, 36) - obs, _, _, _ = env.step(ACTION_DOWN) - self.assertEqual(obs, 36) - - # Going down at the wall is a no-op - env.step(ACTION_UP) - obs, _, _, _ = env.step(ACTION_RIGHT) - self.assertEqual(obs, 25) - obs, _, _, _ = env.step(ACTION_DOWN) - self.assertEqual(obs, 25) - - # Move all the way to the right wall - for _ in range(10): - env.step(ACTION_RIGHT) - obs, rew, done, truncated, _ = env.step(ACTION_RIGHT) - self.assertEqual(obs, 35) - self.assertEqual(rew, -1) - self.assertEqual(done, False) - - # Move to goal - obs, rew, done, truncated, _ = env.step(ACTION_DOWN) - self.assertEqual(obs, 47) - self.assertEqual(rew, 10) - self.assertEqual(done, True) - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/examples/env/tests/test_coin_game_non_vectorized_env.py b/rllib/examples/env/tests/test_coin_game_non_vectorized_env.py deleted file mode 100644 index 1305064a381a8..0000000000000 --- a/rllib/examples/env/tests/test_coin_game_non_vectorized_env.py +++ /dev/null @@ -1,882 +0,0 @@ -########## -# Contribution by the Center on Long-Term Risk: -# https://github.com/longtermrisk/marltoolbox -########## -import random - -import numpy as np -from ray.rllib.examples.env.coin_game_non_vectorized_env import CoinGame, AsymCoinGame - -# TODO add tests for grid_size != 3 - - -def test_reset(): - max_steps, grid_size = 20, 3 - envs = init_several_env(max_steps, grid_size) - - for env in envs: - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - -def init_several_env(max_steps, grid_size, players_can_pick_same_coin=True): - coin_game = init_env( - max_steps, - CoinGame, - grid_size, - 
players_can_pick_same_coin=players_can_pick_same_coin, - ) - asymm_coin_game = init_env( - max_steps, - AsymCoinGame, - grid_size, - players_can_pick_same_coin=players_can_pick_same_coin, - ) - return [coin_game, asymm_coin_game] - - -def init_env( - max_steps, env_class, seed=None, grid_size=3, players_can_pick_same_coin=True -): - config = { - "max_steps": max_steps, - "grid_size": grid_size, - "both_players_can_pick_the_same_coin": players_can_pick_same_coin, - } - env = env_class(config) - env.seed(seed) - return env - - -def check_obs(obs, grid_size): - assert len(obs) == 2, "two players" - for key, player_obs in obs.items(): - assert player_obs.shape == (grid_size, grid_size, 4) - assert ( - player_obs[..., 0].sum() == 1.0 - ), f"observe 1 player red in grid: {player_obs[..., 0]}" - assert ( - player_obs[..., 1].sum() == 1.0 - ), f"observe 1 player blue in grid: {player_obs[..., 1]}" - assert ( - player_obs[..., 2:].sum() == 1.0 - ), f"observe 1 coin in grid: {player_obs[..., 0]}" - - -def assert_logger_buffer_size(env, n_steps): - assert len(env.red_pick) == n_steps - assert len(env.red_pick_own) == n_steps - assert len(env.blue_pick) == n_steps - assert len(env.blue_pick_own) == n_steps - - -def test_step(): - max_steps, grid_size = 20, 3 - envs = init_several_env(max_steps, grid_size) - - for env in envs: - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - actions = { - policy_id: random.randint(0, env.NUM_ACTIONS - 1) - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=1) - assert not done["__all__"] - - -def test_multiple_steps(): - max_steps, grid_size = 20, 3 - n_steps = int(max_steps * 0.75) - envs = init_several_env(max_steps, grid_size) - - for env in envs: - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - for step_i in range(1, n_steps, 1): - actions = { - policy_id: random.randint(0, env.NUM_ACTIONS - 1) - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=step_i) - assert not done["__all__"] - - -def test_multiple_episodes(): - max_steps, grid_size = 20, 3 - n_steps = int(max_steps * 8.25) - envs = init_several_env(max_steps, grid_size) - - for env in envs: - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - step_i = 0 - for _ in range(n_steps): - step_i += 1 - actions = { - policy_id: random.randint(0, env.NUM_ACTIONS - 1) - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=step_i) - assert not done["__all__"] or (step_i == max_steps and done["__all__"]) - if done["__all__"]: - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - step_i = 0 - - -def overwrite_pos(env, p_red_pos, p_blue_pos, c_red_pos, c_blue_pos): - assert c_red_pos is None or c_blue_pos is None - if c_red_pos is None: - env.red_coin = 0 - coin_pos = c_blue_pos - if c_blue_pos is None: - env.red_coin = 1 - coin_pos = c_red_pos - - env.red_pos = p_red_pos - env.blue_pos = p_blue_pos - env.coin_pos = coin_pos - - env.red_pos = np.array(env.red_pos) - env.blue_pos = np.array(env.blue_pos) - env.coin_pos = np.array(env.coin_pos) - env.red_coin = np.array(env.red_coin) - - -def 
assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed, - blue_speed, - red_own, - blue_own, -): - step_i = 0 - for _ in range(n_steps): - step_i += 1 - actions = { - "player_red": p_red_act[step_i - 1], - "player_blue": p_blue_act[step_i - 1], - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=step_i) - assert not done["__all__"] or (step_i == max_steps and done["__all__"]) - - if done["__all__"]: - assert info["player_red"]["pick_speed"] == red_speed - assert info["player_blue"]["pick_speed"] == blue_speed - - if red_own is None: - assert "pick_own_color" not in info["player_red"] - else: - assert info["player_red"]["pick_own_color"] == red_own - if blue_own is None: - assert "pick_own_color" not in info["player_blue"] - else: - assert info["player_blue"]["pick_own_color"] == blue_own - - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - step_i = 0 - - overwrite_pos( - env, - p_red_pos[step_i], - p_blue_pos[step_i], - c_red_pos[step_i], - c_blue_pos[step_i], - ) - - -def test_logged_info_no_picking(): - p_red_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_blue_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - c_blue_pos = [None, None, None, None] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env in envs: - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.0, - blue_speed=0.0, - red_own=None, - blue_own=None, - ) - - envs = init_several_env(max_steps, grid_size, players_can_pick_same_coin=False) - - for env in envs: - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.0, - blue_speed=0.0, - red_own=None, - blue_own=None, - ) - - -def test_logged_info__red_pick_red_all_the_time(): - p_red_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_blue_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - c_blue_pos = [None, None, None, None] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=0.0, - red_own=1.0, - blue_own=None, - ) - - envs = init_several_env(max_steps, grid_size, players_can_pick_same_coin=False) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, 
p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=0.0, - red_own=1.0, - blue_own=None, - ) - - -def test_logged_info__blue_pick_red_all_the_time(): - p_red_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_blue_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - c_blue_pos = [None, None, None, None] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.0, - blue_speed=1.0, - red_own=None, - blue_own=0.0, - ) - - envs = init_several_env(max_steps, grid_size, players_can_pick_same_coin=False) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.0, - blue_speed=1.0, - red_own=None, - blue_own=0.0, - ) - - -def test_logged_info__blue_pick_blue_all_the_time(): - p_red_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_blue_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [None, None, None, None] - c_blue_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.0, - blue_speed=1.0, - red_own=None, - blue_own=1.0, - ) - - envs = init_several_env(max_steps, grid_size, players_can_pick_same_coin=False) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.0, - blue_speed=1.0, - red_own=None, - blue_own=1.0, - ) - - -def test_logged_info__red_pick_blue_all_the_time(): - p_red_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_blue_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [None, None, None, None] - c_blue_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) 
- - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=0.0, - red_own=0.0, - blue_own=None, - ) - - envs = init_several_env(max_steps, grid_size, players_can_pick_same_coin=False) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=0.0, - red_own=0.0, - blue_own=None, - ) - - -def test_logged_info__both_pick_blue_all_the_time(): - p_red_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [None, None, None, None] - c_blue_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=1.0, - red_own=0.0, - blue_own=1.0, - ) - - -def test_logged_info__both_pick_red_all_the_time(): - p_red_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - c_blue_pos = [None, None, None, None] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - print( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - ) - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=1.0, - red_own=1.0, - blue_own=0.0, - ) - - -def test_logged_info__both_pick_red_half_the_time(): - p_red_pos = [[0, 0], [0, 0], [1, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - c_blue_pos = [None, None, None, None] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.5, - blue_speed=0.5, - red_own=1.0, - blue_own=0.0, - ) - - -def test_logged_info__both_pick_blue_half_the_time(): - p_red_pos = [[0, 0], [0, 0], [1, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = 
[None, None, None, None] - c_blue_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.5, - blue_speed=0.5, - red_own=0.0, - blue_own=1.0, - ) - - -def test_logged_info__both_pick_blue(): - p_red_pos = [[0, 0], [0, 0], [0, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [None, None, None, None] - c_blue_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.25, - blue_speed=0.5, - red_own=0.0, - blue_own=1.0, - ) - - -def test_logged_info__pick_half_the_time_half_blue_half_red(): - p_red_pos = [[0, 0], [0, 0], [1, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], None, [1, 1], None] - c_blue_pos = [None, [1, 1], None, [1, 1]] - max_steps, grid_size = 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, grid_size) - assert_logger_buffer_size(env, n_steps=0) - overwrite_pos(env, p_red_pos[0], p_blue_pos[0], c_red_pos[0], c_blue_pos[0]) - - assert_info( - n_steps, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.5, - blue_speed=0.5, - red_own=0.5, - blue_own=0.5, - ) - - -def test_observations_are_invariant_to_the_player_trained_in_reset(): - p_red_pos = [ - [0, 0], - [0, 0], - [1, 1], - [1, 1], - [0, 0], - [1, 1], - [2, 0], - [0, 1], - [2, 2], - [1, 2], - ] - p_blue_pos = [ - [0, 0], - [0, 0], - [1, 1], - [1, 1], - [1, 1], - [0, 0], - [0, 1], - [2, 0], - [1, 2], - [2, 2], - ] - p_red_act = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - c_red_pos = [[1, 1], None, [0, 1], None, None, [2, 2], [0, 0], None, None, [2, 1]] - c_blue_pos = [None, [1, 1], None, [0, 1], [2, 2], None, None, [0, 0], [2, 1], None] - max_steps, grid_size = 10, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - _ = env.reset() - - step_i = 0 - overwrite_pos( - env, - p_red_pos[step_i], - p_blue_pos[step_i], - c_red_pos[step_i], - c_blue_pos[step_i], - ) - - for _ in range(n_steps): - step_i += 1 - actions = { - "player_red": p_red_act[step_i - 1], - "player_blue": p_blue_act[step_i - 1], - } - _, _, _, _, _ = env.step(actions) - - if step_i == max_steps: - break - - overwrite_pos( - env, - p_red_pos[step_i], - p_blue_pos[step_i], - c_red_pos[step_i], - c_blue_pos[step_i], - ) - - -def assert_obs_is_symmetrical(obs, env): - assert np.all(obs[env.players_ids[0]][..., 0] == 
obs[env.players_ids[1]][..., 1]) - assert np.all(obs[env.players_ids[1]][..., 0] == obs[env.players_ids[0]][..., 1]) - assert np.all(obs[env.players_ids[0]][..., 2] == obs[env.players_ids[1]][..., 3]) - assert np.all(obs[env.players_ids[1]][..., 2] == obs[env.players_ids[0]][..., 3]) - - -def test_observations_are_invariant_to_the_player_trained_in_step(): - p_red_pos = [ - [0, 0], - [0, 0], - [1, 1], - [1, 1], - [0, 0], - [1, 1], - [2, 0], - [0, 1], - [2, 2], - [1, 2], - ] - p_blue_pos = [ - [0, 0], - [0, 0], - [1, 1], - [1, 1], - [1, 1], - [0, 0], - [0, 1], - [2, 0], - [1, 2], - [2, 2], - ] - p_red_act = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - c_red_pos = [[1, 1], None, [0, 1], None, None, [2, 2], [0, 0], None, None, [2, 1]] - c_blue_pos = [None, [1, 1], None, [0, 1], [2, 2], None, None, [0, 0], [2, 1], None] - max_steps, grid_size = 10, 3 - n_steps = max_steps - envs = init_several_env(max_steps, grid_size) - - for env_i, env in enumerate(envs): - _ = env.reset() - step_i = 0 - overwrite_pos( - env, - p_red_pos[step_i], - p_blue_pos[step_i], - c_red_pos[step_i], - c_blue_pos[step_i], - ) - - for _ in range(n_steps): - step_i += 1 - actions = { - "player_red": p_red_act[step_i - 1], - "player_blue": p_blue_act[step_i - 1], - } - obs, reward, done, truncated, info = env.step(actions) - - # assert observations are symmetrical respective to the actions - if step_i % 2 == 1: - obs_step_odd = obs - elif step_i % 2 == 0: - assert np.all( - obs[env.players_ids[0]] == obs_step_odd[env.players_ids[1]] - ) - assert np.all( - obs[env.players_ids[1]] == obs_step_odd[env.players_ids[0]] - ) - - if step_i == max_steps: - break - - overwrite_pos( - env, - p_red_pos[step_i], - p_blue_pos[step_i], - c_red_pos[step_i], - c_blue_pos[step_i], - ) diff --git a/rllib/examples/env/tests/test_coin_game_vectorized_env.py b/rllib/examples/env/tests/test_coin_game_vectorized_env.py deleted file mode 100644 index 5ce2acd8b8102..0000000000000 --- a/rllib/examples/env/tests/test_coin_game_vectorized_env.py +++ /dev/null @@ -1,906 +0,0 @@ -########## -# Contribution by the Center on Long-Term Risk: -# https://github.com/longtermrisk/marltoolbox -########## -import copy -import random - -import numpy as np -from ray.rllib.examples.env.coin_game_vectorized_env import ( - VectorizedCoinGame, - AsymVectorizedCoinGame, -) -from ray.rllib.examples.env.tests.test_coin_game_non_vectorized_env import ( - assert_obs_is_symmetrical, -) - -# TODO add tests for grid_size != 3 - - -def test_reset(): - max_steps, batch_size, grid_size = 20, 5, 3 - envs = init_several_env(max_steps, batch_size, grid_size) - - for env in envs: - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - -def init_several_env(max_steps, batch_size, grid_size): - coin_game = init_env(max_steps, batch_size, VectorizedCoinGame, grid_size) - asymm_coin_game = init_env(max_steps, batch_size, AsymVectorizedCoinGame, grid_size) - return [coin_game, asymm_coin_game] - - -def init_env(max_steps, batch_size, env_class, seed=None, grid_size=3): - config = { - "max_steps": max_steps, - "batch_size": batch_size, - "grid_size": grid_size, - } - env = env_class(config) - env.seed(seed) - return env - - -def check_obs(obs, batch_size, grid_size): - assert len(obs) == 2, "two players" - for i in range(batch_size): - for key, player_obs in obs.items(): - assert player_obs.shape == (batch_size, grid_size, grid_size, 4) - assert ( - player_obs[i, ..., 0].sum() == 1.0 - ), f"observe 
1 player red in grid: {player_obs[i, ..., 0]}" - assert ( - player_obs[i, ..., 1].sum() == 1.0 - ), f"observe 1 player blue in grid: {player_obs[i, ..., 1]}" - assert ( - player_obs[i, ..., 2:].sum() == 1.0 - ), f"observe 1 coin in grid: {player_obs[i, ..., 0]}" - - -def assert_logger_buffer_size(env, n_steps): - assert len(env.red_pick) == n_steps - assert len(env.red_pick_own) == n_steps - assert len(env.blue_pick) == n_steps - assert len(env.blue_pick_own) == n_steps - - -def test_step(): - max_steps, batch_size, grid_size = 20, 5, 3 - envs = init_several_env(max_steps, batch_size, grid_size) - - for env in envs: - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - actions = { - policy_id: [ - random.randint(0, env.NUM_ACTIONS - 1) for _ in range(batch_size) - ] - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=1) - assert not done["__all__"] - - -def test_multiple_steps(): - max_steps, batch_size, grid_size = 20, 5, 3 - n_steps = int(max_steps * 0.75) - envs = init_several_env(max_steps, batch_size, grid_size) - - for env in envs: - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - for step_i in range(1, n_steps, 1): - actions = { - policy_id: [ - random.randint(0, env.NUM_ACTIONS - 1) for _ in range(batch_size) - ] - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=step_i) - assert not done["__all__"] - - -def test_multiple_episodes(): - max_steps, batch_size, grid_size = 20, 100, 3 - n_steps = int(max_steps * 8.25) - envs = init_several_env(max_steps, batch_size, grid_size) - - for env in envs: - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - step_i = 0 - for _ in range(n_steps): - step_i += 1 - actions = { - policy_id: [ - random.randint(0, env.NUM_ACTIONS - 1) for _ in range(batch_size) - ] - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=step_i) - assert not done["__all__"] or (step_i == max_steps and done["__all__"]) - if done["__all__"]: - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - step_i = 0 - - -def overwrite_pos( - step_i, - batch_deltas, - n_steps_in_epi, - env, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, -): - assert len(p_red_pos) == n_steps_in_epi - assert len(p_blue_pos) == n_steps_in_epi - assert len(c_red_pos) == n_steps_in_epi - assert len(c_blue_pos) == n_steps_in_epi - env.red_coin = [ - 0 if c_red_pos[(step_i + delta) % n_steps_in_epi] is None else 1 - for delta in batch_deltas - ] - coin_pos = [ - c_blue_pos[(step_i + delta) % n_steps_in_epi] - if c_red_pos[(step_i + delta) % n_steps_in_epi] is None - else c_red_pos[(step_i + delta) % n_steps_in_epi] - for delta in batch_deltas - ] - - env.red_pos = [ - p_red_pos[(step_i + delta) % n_steps_in_epi] for delta in batch_deltas - ] - env.blue_pos = [ - p_blue_pos[(step_i + delta) % n_steps_in_epi] for delta in batch_deltas - ] - env.coin_pos = coin_pos - - env.red_pos = np.array(env.red_pos) - env.blue_pos = np.array(env.blue_pos) - env.coin_pos = np.array(env.coin_pos) - env.red_coin 
= np.array(env.red_coin) - - -def assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - n_steps_in_epi, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed, - blue_speed, - red_own, - blue_own, -): - step_i = 0 - - for _ in range(n_steps): - overwrite_pos( - step_i, - batch_deltas, - n_steps_in_epi, - env, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - ) - actions = { - "player_red": [ - p_red_act[(step_i + delta) % n_steps_in_epi] for delta in batch_deltas - ], - "player_blue": [ - p_blue_act[(step_i + delta) % n_steps_in_epi] for delta in batch_deltas - ], - } - step_i += 1 - - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=step_i) - assert not done["__all__"] or (step_i == n_steps_in_epi and done["__all__"]) - - if done["__all__"]: - assert info["player_red"]["pick_speed"] == red_speed - assert info["player_blue"]["pick_speed"] == blue_speed - - if red_own is None: - assert "pick_own_color" not in info["player_red"] - else: - assert info["player_red"]["pick_own_color"] == red_own - if blue_own is None: - assert "pick_own_color" not in info["player_blue"] - else: - assert info["player_blue"]["pick_own_color"] == blue_own - - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - step_i = 0 - - -def test_logged_info_no_picking(): - p_red_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_blue_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - c_blue_pos = [None, None, None, None] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env in envs: - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.0, - blue_speed=0.0, - red_own=None, - blue_own=None, - ) - - -def test_logged_info__red_pick_red_all_the_time(): - p_red_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_blue_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - c_blue_pos = [None, None, None, None] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=0.0, - red_own=1.0, - blue_own=None, - ) - - -def test_logged_info__blue_pick_red_all_the_time(): - p_red_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_blue_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - c_blue_pos = [None, None, None, None] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = max_steps - envs = 
init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.0, - blue_speed=1.0, - red_own=None, - blue_own=0.0, - ) - - -def test_logged_info__blue_pick_blue_all_the_time(): - p_red_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_blue_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [None, None, None, None] - c_blue_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.0, - blue_speed=1.0, - red_own=None, - blue_own=1.0, - ) - - -def test_logged_info__red_pick_blue_all_the_time(): - p_red_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_blue_pos = [[0, 0], [0, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [None, None, None, None] - c_blue_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=0.0, - red_own=0.0, - blue_own=None, - ) - - -def test_logged_info__red_pick_blue_all_the_time_wt_difference_in_actions(): - p_red_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_blue_pos = [[0, 0], [0, 0], [0, 1], [0, 1]] - p_red_act = [0, 1, 2, 3] - p_blue_act = [0, 1, 2, 3] - c_red_pos = [None, None, None, None] - c_blue_pos = [[1, 1], [1, 2], [2, 0], [0, 0]] - max_steps, batch_size, grid_size = 4, 4, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=0.0, - red_own=0.0, - blue_own=None, - ) - - -def test_logged_info__both_pick_blue_all_the_time(): - p_red_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [None, None, None, None] - c_blue_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = 
max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=1.0, - red_own=0.0, - blue_own=1.0, - ) - - -def test_logged_info__both_pick_red_all_the_time(): - p_red_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [1, 0], [1, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - c_blue_pos = [None, None, None, None] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=1.0, - blue_speed=1.0, - red_own=1.0, - blue_own=0.0, - ) - - -def test_logged_info__both_pick_red_half_the_time(): - p_red_pos = [[0, 0], [0, 0], [1, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - c_blue_pos = [None, None, None, None] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.5, - blue_speed=0.5, - red_own=1.0, - blue_own=0.0, - ) - - -def test_logged_info__both_pick_blue_half_the_time(): - p_red_pos = [[0, 0], [0, 0], [1, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [None, None, None, None] - c_blue_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.5, - blue_speed=0.5, - red_own=0.0, - blue_own=1.0, - ) - - -def test_logged_info__both_pick_blue(): - p_red_pos = [[0, 0], [0, 0], [0, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [None, None, None, None] - c_blue_pos = [[1, 1], [1, 1], [1, 1], [1, 1]] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = max_steps - envs = 
init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.25, - blue_speed=0.5, - red_own=0.0, - blue_own=1.0, - ) - - -def test_logged_info__pick_half_the_time_half_blue_half_red(): - p_red_pos = [[0, 0], [0, 0], [1, 0], [1, 0]] - p_blue_pos = [[1, 0], [1, 0], [0, 0], [0, 0]] - p_red_act = [0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0] - c_red_pos = [[1, 1], None, [1, 1], None] - c_blue_pos = [None, [1, 1], None, [1, 1]] - max_steps, batch_size, grid_size = 4, 28, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = np.random.randint(0, max_steps - 1, size=batch_size) - - for env_i, env in enumerate(envs): - obs, info = env.reset() - check_obs(obs, batch_size, grid_size) - assert_logger_buffer_size(env, n_steps=0) - - assert_info( - batch_deltas, - n_steps, - batch_size, - p_red_act, - p_blue_act, - env, - grid_size, - max_steps, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - red_speed=0.5, - blue_speed=0.5, - red_own=0.5, - blue_own=0.5, - ) - - -def test_get_and_set_env_state(): - max_steps, batch_size, grid_size = 20, 100, 3 - n_steps = int(max_steps * 8.25) - envs = init_several_env(max_steps, batch_size, grid_size) - - for env in envs: - obs, info = env.reset() - initial_env_state = env._save_env() - initial_env_state_saved = copy.deepcopy(initial_env_state) - env_initial = copy.deepcopy(env) - - step_i = 0 - for _ in range(n_steps): - step_i += 1 - actions = { - policy_id: [ - random.randint(0, env.NUM_ACTIONS - 1) for _ in range(batch_size) - ] - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - - assert all( - v == initial_env_state_saved[k] - if not isinstance(v, np.ndarray) - else (v == initial_env_state_saved[k]).all() - for k, v in initial_env_state.items() - ) - env_state_after_step = env._save_env() - env_after_step = copy.deepcopy(env) - - env._load_env(initial_env_state) - env_vars, env_initial_vars = vars(env), vars(env_initial) - env_vars.pop("np_random", None) - env_initial_vars.pop("np_random", None) - assert all( - v == env_initial_vars[k] - if not isinstance(v, np.ndarray) - else (v == env_initial_vars[k]).all() - for k, v in env_vars.items() - ) - - env._load_env(env_state_after_step) - env_vars, env_after_step_vars = vars(env), vars(env_after_step) - env_vars.pop("np_random", None) - env_after_step_vars.pop("np_random", None) - assert all( - v == env_after_step_vars[k] - if not isinstance(v, np.ndarray) - else (v == env_after_step_vars[k]).all() - for k, v in env_vars.items() - ) - - if done["__all__"]: - _, _ = env.reset() - step_i = 0 - - -def test_observations_are_invariant_to_the_player_trained_wt_step(): - p_red_pos = [ - [0, 0], - [0, 0], - [1, 1], - [1, 1], - [0, 0], - [1, 1], - [2, 0], - [0, 1], - [2, 2], - [1, 2], - ] - p_blue_pos = [ - [0, 0], - [0, 0], - [1, 1], - [1, 1], - [1, 1], - [0, 0], - [0, 1], - [2, 0], - [1, 2], - [2, 2], - ] - p_red_act = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - c_red_pos = [[1, 1], None, [0, 1], None, None, [2, 2], [0, 0], None, None, [2, 1]] - c_blue_pos = [None, [1, 1], None, [0, 1], [2, 2], None, 
None, [0, 0], [2, 1], None] - max_steps, batch_size, grid_size = 10, 52, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = [ - i % max_steps if i % 2 == 0 else i % max_steps - 1 for i in range(batch_size) - ] - - for env_i, env in enumerate(envs): - _, _ = env.reset() - step_i = 0 - - for _ in range(n_steps): - overwrite_pos( - step_i, - batch_deltas, - max_steps, - env, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - ) - actions = { - "player_red": [ - p_red_act[(step_i + delta) % max_steps] for delta in batch_deltas - ], - "player_blue": [ - p_blue_act[(step_i + delta) % max_steps] for delta in batch_deltas - ], - } - obs, reward, done, truncated, info = env.step(actions) - - step_i += 1 - # assert observations are symmetrical respective to the actions - if step_i % 2 == 1: - obs_step_odd = obs - elif step_i % 2 == 0: - assert np.all( - obs[env.players_ids[0]] == obs_step_odd[env.players_ids[1]] - ) - assert np.all( - obs[env.players_ids[1]] == obs_step_odd[env.players_ids[0]] - ) - - if step_i == max_steps: - break - - -def test_observations_are_invariant_to_the_player_trained_wt_reset(): - p_red_pos = [ - [0, 0], - [0, 0], - [1, 1], - [1, 1], - [0, 0], - [1, 1], - [2, 0], - [0, 1], - [2, 2], - [1, 2], - ] - p_blue_pos = [ - [0, 0], - [0, 0], - [1, 1], - [1, 1], - [1, 1], - [0, 0], - [0, 1], - [2, 0], - [1, 2], - [2, 2], - ] - p_red_act = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - p_blue_act = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - c_red_pos = [[1, 1], None, [0, 1], None, None, [2, 2], [0, 0], None, None, [2, 1]] - c_blue_pos = [None, [1, 1], None, [0, 1], [2, 2], None, None, [0, 0], [2, 1], None] - max_steps, batch_size, grid_size = 10, 52, 3 - n_steps = max_steps - envs = init_several_env(max_steps, batch_size, grid_size) - - batch_deltas = [ - i % max_steps if i % 2 == 0 else i % max_steps - 1 for i in range(batch_size) - ] - - for env_i, env in enumerate(envs): - obs, info = env.reset() - assert_obs_is_symmetrical(obs, env) - step_i = 0 - - for _ in range(n_steps): - overwrite_pos( - step_i, - batch_deltas, - max_steps, - env, - p_red_pos, - p_blue_pos, - c_red_pos, - c_blue_pos, - ) - actions = { - "player_red": [ - p_red_act[(step_i + delta) % max_steps] for delta in batch_deltas - ], - "player_blue": [ - p_blue_act[(step_i + delta) % max_steps] for delta in batch_deltas - ], - } - _, _, _, _, _ = env.step(actions) - - step_i += 1 - - if step_i == max_steps: - break diff --git a/rllib/examples/env/tests/test_matrix_sequential_social_dilemma.py b/rllib/examples/env/tests/test_matrix_sequential_social_dilemma.py deleted file mode 100644 index 1ed5a61299306..0000000000000 --- a/rllib/examples/env/tests/test_matrix_sequential_social_dilemma.py +++ /dev/null @@ -1,322 +0,0 @@ -########## -# Contribution by the Center on Long-Term Risk: -# https://github.com/longtermrisk/marltoolbox -########## -import random - -from ray.rllib.examples.env.matrix_sequential_social_dilemma import ( - IteratedPrisonersDilemma, - IteratedChicken, - IteratedStagHunt, - IteratedBoS, -) - -ENVS = [IteratedPrisonersDilemma, IteratedChicken, IteratedStagHunt, IteratedBoS] - - -def test_reset(): - max_steps = 20 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - - for env in env_all: - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - - -def init_env(max_steps, env_class, seed=None): - config = { - "max_steps": max_steps, - } - env = env_class(config) - env.seed(seed) - return env - - -def 
check_obs(obs, env): - assert len(obs) == 2, "two players" - for key, player_obs in obs.items(): - assert isinstance(player_obs, int) # .shape == (env.NUM_STATES) - assert player_obs < env.NUM_STATES - - -def assert_logger_buffer_size_two_players(env, n_steps): - assert len(env.cc_count) == n_steps - assert len(env.dd_count) == n_steps - assert len(env.cd_count) == n_steps - assert len(env.dc_count) == n_steps - - -def test_step(): - max_steps = 20 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - - for env in env_all: - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - - actions = { - policy_id: random.randint(0, env.NUM_ACTIONS - 1) - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=1) - assert not done["__all__"] - - -def test_multiple_steps(): - max_steps = 20 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - n_steps = int(max_steps * 0.75) - - for env in env_all: - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - - for step_i in range(1, n_steps, 1): - actions = { - policy_id: random.randint(0, env.NUM_ACTIONS - 1) - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=step_i) - assert not done["__all__"] - - -def test_multiple_episodes(): - max_steps = 20 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - n_steps = int(max_steps * 8.25) - - for env in env_all: - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - - step_i = 0 - for _ in range(n_steps): - step_i += 1 - actions = { - policy_id: random.randint(0, env.NUM_ACTIONS - 1) - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=step_i) - assert not done["__all__"] or (step_i == max_steps and done["__all__"]) - if done["__all__"]: - obs, info = env.reset() - check_obs(obs, env) - step_i = 0 - - -def assert_info(n_steps, p_row_act, p_col_act, env, max_steps, CC, DD, CD, DC): - step_i = 0 - for _ in range(n_steps): - step_i += 1 - actions = { - "player_row": p_row_act[step_i - 1], - "player_col": p_col_act[step_i - 1], - } - obs, reward, done, truncated, info = env.step(actions) - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=step_i) - assert not done["__all__"] or (step_i == max_steps and done["__all__"]) - - if done["__all__"]: - assert info["player_row"]["CC"] == CC - assert info["player_col"]["CC"] == CC - assert info["player_row"]["DD"] == DD - assert info["player_col"]["DD"] == DD - assert info["player_row"]["CD"] == CD - assert info["player_col"]["CD"] == CD - assert info["player_row"]["DC"] == DC - assert info["player_col"]["DC"] == DC - - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - step_i = 0 - - -def test_logged_info_full_CC(): - p_row_act = [0, 0, 0, 0] - p_col_act = [0, 0, 0, 0] - max_steps = 4 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - n_steps = int(max_steps * 8.25) - - for env in env_all: - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - - assert_info( - n_steps, - p_row_act, - p_col_act, - 
env, - max_steps, - CC=1.0, - DD=0.0, - CD=0.0, - DC=0.0, - ) - - -def test_logged_info_full_DD(): - p_row_act = [1, 1, 1, 1] - p_col_act = [1, 1, 1, 1] - max_steps = 4 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - n_steps = int(max_steps * 8.25) - - for env in env_all: - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - - assert_info( - n_steps, - p_row_act, - p_col_act, - env, - max_steps, - CC=0.0, - DD=1.0, - CD=0.0, - DC=0.0, - ) - - -def test_logged_info_full_CD(): - p_row_act = [0, 0, 0, 0] - p_col_act = [1, 1, 1, 1] - max_steps = 4 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - n_steps = int(max_steps * 8.25) - - for env in env_all: - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - - assert_info( - n_steps, - p_row_act, - p_col_act, - env, - max_steps, - CC=0.0, - DD=0.0, - CD=1.0, - DC=0.0, - ) - - -def test_logged_info_full_DC(): - p_row_act = [1, 1, 1, 1] - p_col_act = [0, 0, 0, 0] - max_steps = 4 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - n_steps = int(max_steps * 8.25) - - for env in env_all: - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - - assert_info( - n_steps, - p_row_act, - p_col_act, - env, - max_steps, - CC=0.0, - DD=0.0, - CD=0.0, - DC=1.0, - ) - - -def test_logged_info_mix_CC_DD(): - p_row_act = [0, 1, 1, 1] - p_col_act = [0, 1, 1, 1] - max_steps = 4 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - n_steps = int(max_steps * 8.25) - - for env in env_all: - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - - assert_info( - n_steps, - p_row_act, - p_col_act, - env, - max_steps, - CC=0.25, - DD=0.75, - CD=0.0, - DC=0.0, - ) - - -def test_logged_info_mix_CD_CD(): - p_row_act = [1, 0, 1, 0] - p_col_act = [0, 1, 0, 1] - max_steps = 4 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - n_steps = int(max_steps * 8.25) - - for env in env_all: - obs, info = env.reset() - check_obs(obs, env) - assert_logger_buffer_size_two_players(env, n_steps=0) - - assert_info( - n_steps, - p_row_act, - p_col_act, - env, - max_steps, - CC=0.0, - DD=0.0, - CD=0.5, - DC=0.5, - ) - - -def test_observations_are_invariant_to_the_player_trained(): - p_row_act = [0, 1, 1, 0] - p_col_act = [0, 1, 0, 1] - max_steps = 4 - env_all = [init_env(max_steps, env_class) for env_class in ENVS] - n_steps = 4 - - for env in env_all: - _, _ = env.reset() - step_i = 0 - for _ in range(n_steps): - step_i += 1 - actions = { - "player_row": p_row_act[step_i - 1], - "player_col": p_col_act[step_i - 1], - } - obs, reward, done, truncated, info = env.step(actions) - # assert observations are symmetrical respective to the actions - if step_i == 1: - assert obs[env.players_ids[0]] == obs[env.players_ids[1]] - elif step_i == 2: - assert obs[env.players_ids[0]] == obs[env.players_ids[1]] - elif step_i == 3: - obs_step_3 = obs - elif step_i == 4: - assert obs[env.players_ids[0]] == obs_step_3[env.players_ids[1]] - assert obs[env.players_ids[1]] == obs_step_3[env.players_ids[0]] diff --git a/rllib/examples/env/tests/test_wrappers.py b/rllib/examples/env/tests/test_wrappers.py deleted file mode 100644 index 3ab63b93d455c..0000000000000 --- a/rllib/examples/env/tests/test_wrappers.py +++ /dev/null @@ -1,58 +0,0 @@ -import random - -import numpy as np -from 
ray.rllib.examples.env.coin_game_non_vectorized_env import CoinGame, AsymCoinGame -from ray.rllib.env.wrappers.uncertainty_wrappers import ( - add_RewardUncertaintyEnvClassWrapper, -) - - -def init_env(max_steps, env_class, seed=None, grid_size=3): - config = { - "max_steps": max_steps, - "grid_size": grid_size, - } - env = env_class(config) - env.seed(seed) - - return env - - -def test_add_RewardUncertaintyEnvClassWrapper(): - max_steps, grid_size = 20, 3 - n_steps = int(max_steps * 8.25) - reward_uncertainty_mean, reward_uncertainty_std = 10, 1 - MyCoinGame = add_RewardUncertaintyEnvClassWrapper( - CoinGame, reward_uncertainty_std, reward_uncertainty_mean - ) - MyAsymCoinGame = add_RewardUncertaintyEnvClassWrapper( - AsymCoinGame, reward_uncertainty_std, reward_uncertainty_mean - ) - coin_game = init_env(max_steps, MyCoinGame, grid_size) - asymm_coin_game = init_env(max_steps, MyAsymCoinGame, grid_size) - - all_rewards = [] - for env in [coin_game, asymm_coin_game]: - _ = env.reset() - - step_i = 0 - for _ in range(n_steps): - step_i += 1 - actions = { - policy_id: random.randint(0, env.NUM_ACTIONS - 1) - for policy_id in env.players_ids - } - obs, reward, done, truncated, info = env.step(actions) - print("reward", reward) - all_rewards.append(reward[env.player_red_id]) - all_rewards.append(reward[env.player_blue_id]) - - if done["__all__"]: - _ = env.reset() - step_i = 0 - - assert np.array(all_rewards).mean() > reward_uncertainty_mean - 1.0 - assert np.array(all_rewards).mean() < reward_uncertainty_mean + 1.0 - - assert np.array(all_rewards).std() > reward_uncertainty_std - 0.1 - assert np.array(all_rewards).std() < reward_uncertainty_mean + 0.1 diff --git a/rllib/examples/env/two_step_game.py b/rllib/examples/env/two_step_game.py index 5e7669edeacec..540b22b534b32 100644 --- a/rllib/examples/env/two_step_game.py +++ b/rllib/examples/env/two_step_game.py @@ -108,15 +108,16 @@ def __init__(self, env_config): env = TwoStepGame(env_config) tuple_obs_space = Tuple([env.observation_space, env.observation_space]) tuple_act_space = Tuple([env.action_space, env.action_space]) - + self._agent_ids = {"agents"} self.env = env.with_agent_groups( groups={"agents": [0, 1]}, obs_space=tuple_obs_space, act_space=tuple_act_space, ) - self.observation_space = self.env.observation_space - self.action_space = self.env.action_space - self._agent_ids = {"agents"} + self.observation_space = Dict({"agents": self.env.observation_space}) + self._obs_space_in_preferred_format = True + self.action_space = Dict({"agents": self.env.action_space}) + self._action_space_in_preferred_format = True self._skip_env_checking = True def reset(self, *, seed=None, options=None): diff --git a/rllib/examples/evaluation/custom_evaluation.py b/rllib/examples/evaluation/custom_evaluation.py new file mode 100644 index 0000000000000..3651c692875fd --- /dev/null +++ b/rllib/examples/evaluation/custom_evaluation.py @@ -0,0 +1,144 @@ +"""Example of customizing the evaluation procedure for an RLlib algorithm. + +Note, that you should only choose to provide a custom eval function, in case the already +built-in eval options are not sufficient. Normally, though, RLlib's eval utilities +that come with each Algorithm are enough to properly evaluate the learning progress +of your Algorithm. + +This script uses the SimpleCorridor environment, a simple 1D gridworld, in which +the agent can only walk left (action=0) or right (action=1). The goal is at the end of +the (1D) corridor. 
The env exposes an API to change the length of the corridor +on-the-fly. We use this API here to extend the size of the corridor for the evaluation +runs. + +We define a custom evaluation method that does the following: +- It changes the corridor length of all environments used on the evaluation EnvRunners. +- It runs a defined number of episodes for evaluation purposes. +- It collects the metrics from those runs, summarizes these metrics and returns them. + +""" +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.evaluation.metrics import summarize_episodes +from ray.rllib.evaluation.worker_set import WorkerSet +from ray.rllib.examples.env.simple_corridor import SimpleCorridor +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.rllib.utils.typing import ResultDict +from ray.tune.registry import get_trainable_cls + + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=0.7, default_timesteps=50000 +) +parser.add_argument("--evaluation-parallel-to-training", action="store_true") +parser.add_argument("--no-custom-eval", action="store_true") +parser.add_argument("--corridor-length-training", type=int, default=10) +parser.add_argument("--corridor-length-eval-worker-1", type=int, default=20) +parser.add_argument("--corridor-length-eval-worker-2", type=int, default=30) + + +def custom_eval_function(algorithm: Algorithm, eval_workers: WorkerSet) -> ResultDict: + """Example of a custom evaluation function. + + Args: + algorithm: Algorithm class to evaluate. + eval_workers: Evaluation WorkerSet. + + Returns: + metrics: Evaluation metrics dict. + """ + # Set different env settings for each (eval) EnvRunner. Here we use the EnvRunner's + # `worker_index` property to figure out the actual length. + # Loop through all workers and all sub-envs (gym.Env) on each worker and call the + # `set_corridor_length` method on these. + eval_workers.foreach_worker( + func=lambda worker: ( + env.set_corridor_length( + args.corridor_length_eval_worker_1 + if worker.worker_index == 1 + else args.corridor_length_eval_worker_2 + ) + for env in worker.env.envs + ) + ) + + # Collect metrics results collected by eval workers in this list for later + # processing. + rollout_metrics = [] + + # For demonstration purposes, run through some arbitrary number of evaluation + # round within this one call. Note that this function is called once per + # training iteration (`Algorithm.train()` call) OR once per `Algorithm.evaluate()` + # (which may be called manually by the user). + for i in range(3): + print(f"Training iteration {algorithm.iteration} -> evaluation round {i}") + # Sample episodes from the EnvRunners AND have them return only the thus + # collected metrics. + metrics_all_workers = eval_workers.foreach_worker( + # Return only the metrics, NOT the sampled episodes (we don't need them + # anymore). 
+ func=lambda worker: (worker.sample(), worker.get_metrics())[1], + local_worker=False, + ) + for metrics_per_worker in metrics_all_workers: + rollout_metrics.extend(metrics_per_worker) + + # You can compute metrics from the episodes manually, or use the + # convenient `summarize_episodes()` utility: + eval_results = summarize_episodes(rollout_metrics) + + return eval_results + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + # For training, we use a corridor length of n. For evaluation, we use different + # values, depending on the eval worker index (1 or 2). + .environment( + SimpleCorridor, + env_config={"corridor_length": args.corridor_length_training}, + ) + .evaluation( + # Do we use the custom eval function defined above? + custom_evaluation_function=( + None if args.no_custom_eval else custom_eval_function + ), + # Number of eval EnvRunners to use. + evaluation_num_workers=2, + # Enable evaluation, once per training iteration. + evaluation_interval=1, + # Run 10 episodes each time evaluation runs (OR "auto" if parallel to + # training). + evaluation_duration="auto" if args.evaluation_parallel_to_training else 10, + # Evaluate parallelly to training? + evaluation_parallel_to_training=args.evaluation_parallel_to_training, + # Override the env settings for the eval workers. + # Note, though, that this setting here is only used in case --no-custom-eval + # is set, b/c in case the custom eval function IS used, we override the + # length of the eval environments in that custom function, so this setting + # here is simply ignored. + evaluation_config=AlgorithmConfig.overrides( + env_config={"corridor_length": args.corridor_length_training * 2}, + ), + ) + ) + + stop = { + "training_iteration": args.stop_iters, + "evaluation/sampler_results/episode_reward_mean": args.stop_reward, + "timesteps_total": args.stop_timesteps, + } + + run_rllib_example_script_experiment( + base_config, + args, + stop=stop, + success_metric="evaluation/sampler_results/episode_reward_mean", + ) diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py index 1b1dedad82e42..8bbe752cf5475 100644 --- a/rllib/examples/evaluation/evaluation_parallel_to_training.py +++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py @@ -180,4 +180,4 @@ def on_train_result(self, *, algorithm, result, **kwargs): "timesteps_total": args.stop_timesteps, } - run_rllib_example_script_experiment(config, args, stop) + run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/examples/gpu_training/__init__.py b/rllib/examples/gpu_training/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/rllib/examples/learner/multi_agent_cartpole_ppo.py b/rllib/examples/learner/multi_agent_cartpole_ppo.py deleted file mode 100644 index b2ccc6c244aa4..0000000000000 --- a/rllib/examples/learner/multi_agent_cartpole_ppo.py +++ /dev/null @@ -1,120 +0,0 @@ -"""Simple example of setting up a multi-agent policy mapping. - -Control the number of agents and policies via --num-agents and --num-policies. - -This works with hundreds of agents and policies, but note that initializing -many TF policies will take some time. - -Also, TF evals might slow down with large numbers of policies. To debug TF -execution, set the TF_TIMELINE_DIR environment variable. 
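For reference, a minimal sketch of how such a custom evaluation function plugs into an AlgorithmConfig, using only the utilities that `custom_evaluation.py` above already imports. The function name and the CartPole env choice here are illustrative and not part of this PR:

from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.evaluation.metrics import summarize_episodes


def my_eval_fn(algorithm, eval_workers):
    # Sample one round of episodes on each remote eval EnvRunner, then pull the
    # collected episode metrics back and summarize them into a results dict.
    eval_workers.foreach_worker(func=lambda w: w.sample(), local_worker=False)
    metrics_per_worker = eval_workers.foreach_worker(
        func=lambda w: w.get_metrics(), local_worker=False
    )
    return summarize_episodes([m for per_w in metrics_per_worker for m in per_w])


config = (
    PPOConfig()
    .environment("CartPole-v1")
    .evaluation(
        evaluation_interval=1,
        evaluation_num_workers=2,
        custom_evaluation_function=my_eval_fn,
    )
)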
-""" - -import argparse -import os -import random - -import ray -from ray import tune, air -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.examples.env.multi_agent import MultiAgentCartPole -from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.test_utils import check_learning_achieved - - -tf1, tf, tfv = try_import_tf() - -parser = argparse.ArgumentParser() - -parser.add_argument("--num-agents", type=int, default=4) -parser.add_argument("--num-policies", type=int, default=2) -parser.add_argument( - "--framework", - choices=["tf2", "torch"], # tf will be deprecated with the new Learner stack - default="torch", - help="The DL framework specifier.", -) - -parser.add_argument( - "--num-gpus", - type=int, - default=int(os.environ.get("RLLIB_NUM_GPUS", "0")), - help="Number of GPUs to use for training.", -) - -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) - -parser.add_argument( - "--stop-iters", type=int, default=20, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=50000, help="Number of timesteps to train." -) - -parser.add_argument( - "--stop-reward-per-agent", - type=float, - default=150.0, - help="Min. reward per agent at which we stop training.", -) - -if __name__ == "__main__": - args = parser.parse_args() - - ray.init() - - # Each policy can have a different configuration (including custom model). - def gen_policy(i): - gammas = [0.95, 0.99] - # just change the gammas between the two policies. - # changing the module is not a critical part of this example. - # the important part is that the policies are different. - config = { - "gamma": gammas[i % len(gammas)], - } - - return PolicySpec(config=config) - - # Setup PPO with an ensemble of `num_policies` different policies. 
- policies = {"policy_{}".format(i): gen_policy(i) for i in range(args.num_policies)} - policy_ids = list(policies.keys()) - - def policy_mapping_fn(agent_id, episode, worker, **kwargs): - pol_id = random.choice(policy_ids) - return pol_id - - config = ( - PPOConfig() - .experimental(_enable_new_api_stack=True) - .rollouts(rollout_fragment_length="auto", num_rollout_workers=3) - .environment(MultiAgentCartPole, env_config={"num_agents": args.num_agents}) - .framework(args.framework) - .training(num_sgd_iter=10, sgd_minibatch_size=2**9, train_batch_size=2**12) - .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - .resources( - num_learner_workers=args.num_gpus, - num_gpus_per_learner_worker=int(args.num_gpus > 0), - ) - ) - - stop_reward = args.stop_reward_per_agent * args.num_agents - stop = { - "episode_reward_mean": stop_reward, - "timesteps_total": args.stop_timesteps, - "training_iteration": args.stop_iters, - } - - results = tune.Tuner( - "PPO", - param_space=config.to_dict(), - run_config=air.RunConfig(stop=stop, verbose=3), - ).fit() - - if args.as_test: - check_learning_achieved(results, stop_reward) - ray.shutdown() diff --git a/rllib/examples/multi_agent_and_self_play/__init__.py b/rllib/examples/multi_agent_and_self_play/__init__.py index d24ddcc9d4fa3..e69de29bb2d1d 100644 --- a/rllib/examples/multi_agent_and_self_play/__init__.py +++ b/rllib/examples/multi_agent_and_self_play/__init__.py @@ -1,22 +0,0 @@ -from ray.rllib.examples.multi_agent_and_self_play.self_play_callback import ( - SelfPlayCallback, -) -from ray.rllib.examples.multi_agent_and_self_play.self_play_league_based_callback import ( # noqa - SelfPlayLeagueBasedCallback, -) -from ray.rllib.examples.multi_agent_and_self_play.self_play_callback_old_api_stack import ( # noqa - SelfPlayCallbackOldAPIStack, -) -from ray.rllib.examples.multi_agent_and_self_play.self_play_league_based_callback_old_api_stack import ( # noqa - SelfPlayLeagueBasedCallbackOldAPIStack, -) -from ray.rllib.examples.multi_agent_and_self_play.utils import ask_user_for_action - - -__all__ = [ - "ask_user_for_action", - "SelfPlayCallback", - "SelfPlayLeagueBasedCallback", - "SelfPlayCallbackOldAPIStack", - "SelfPlayLeagueBasedCallbackOldAPIStack", -] diff --git a/rllib/examples/multi_agent_and_self_play/custom_heuristic_policy.py b/rllib/examples/multi_agent_and_self_play/custom_heuristic_policy.py new file mode 100644 index 0000000000000..6c79af01cc3d3 --- /dev/null +++ b/rllib/examples/multi_agent_and_self_play/custom_heuristic_policy.py @@ -0,0 +1,100 @@ +"""Example of running a custom heuristic (hand-coded) policy alongside trainable ones. + +This example has two RLModules (as action computing policies): + (1) one trained by a PPOLearner + (2) one hand-coded policy that acts at random in the env (doesn't learn). + +The environment is MultiAgentCartPole, in which there are n agents both policies + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see the PPO policy ("learnable_policy") does much +better than "random": + ++-------------------+------------+----------+------+----------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|-------------------+------------+----------+------+----------------+ +| PPO_multi_agen... | TERMINATED | 127. ... | 20 | 58.646 | ++-------------------+------------+----------+------+----------------+ + ++--------+-------------------+-----------------+--------------------+ +| ts | combined reward | reward random | reward | +| | | | learnable_policy | ++--------+-------------------+-----------------+--------------------| +| 80000 | 481.26 | 78.41 | 464.41 | ++--------+-------------------+-----------------+--------------------+ +""" + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from ray.rllib.examples.env.multi_agent import MultiAgentCartPole +from ray.rllib.examples.rl_module.random_rl_module import RandomRLModule +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import register_env + + +parser = add_rllib_example_script_args( + default_iters=20, default_reward=500.0, default_timesteps=100000 +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Simple environment with n independent cartpole entities. + register_env( + "multi_agent_cartpole", + lambda _: MultiAgentCartPole({"num_agents": args.num_agents}), + ) + + base_config = ( + PPOConfig() + .environment("multi_agent_cartpole") + .multi_agent( + policies={"learnable_policy", "random"}, + # Map to either random behavior or PPO learning behavior based on + # the agent's ID. + policy_mapping_fn=lambda agent_id, *args, **kwargs: [ + "learnable_policy", + "random", + ][agent_id % 2], + # We need to specify this here, b/c the `forward_train` method of + # `RandomRLModule` (ModuleID="random") throws a not-implemented error. + policies_to_train=["learnable_policy"], + ) + .rl_module( + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={ + "learnable_policy": SingleAgentRLModuleSpec(), + "random": SingleAgentRLModuleSpec(module_class=RandomRLModule), + } + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/multi_agent_and_self_play/different_spaces_for_agents.py b/rllib/examples/multi_agent_and_self_play/different_spaces_for_agents.py new file mode 100644 index 0000000000000..ffbc45255380f --- /dev/null +++ b/rllib/examples/multi_agent_and_self_play/different_spaces_for_agents.py @@ -0,0 +1,116 @@ +""" +Example showing how to create a multi-agent env, in which the different agents +have different observation and action spaces. + +These spaces do NOT necessarily have to be specified manually by the user. Instead, +RLlib tries to automatically infer them from the env provided spaces dicts +(agentID -> obs/act space) and the policy mapping fn (mapping agent IDs to policy IDs). 
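A small, self-contained sketch (plain gymnasium, no RLlib required) of the per-agent space dicts that the BasicMultiAgentMultiSpaces env below defines:

import gymnasium as gym

# Each agent gets its own observation- and action-space, keyed by agent ID.
observation_space = gym.spaces.Dict(
    {
        "agent0": gym.spaces.Box(low=-1.0, high=1.0, shape=(10,)),
        "agent1": gym.spaces.Box(low=-1.0, high=1.0, shape=(20,)),
    }
)
action_space = gym.spaces.Dict(
    {"agent0": gym.spaces.Discrete(2), "agent1": gym.spaces.Discrete(3)}
)

print(observation_space["agent0"].sample().shape)  # -> (10,)
print(action_space["agent1"].n)  # -> 3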
+ +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + +import gymnasium as gym + +from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +class BasicMultiAgentMultiSpaces(MultiAgentEnv): + """A simple multi-agent example environment where agents have different spaces. + + agent0: obs=(10,), act=Discrete(2) + agent1: obs=(20,), act=Discrete(3) + + The logic of the env doesn't really matter for this example. The point of this env + is to show how to use multi-agent envs, in which the different agents utilize + different obs- and action spaces. + """ + + def __init__(self, config=None): + self.agents = {"agent0", "agent1"} + self._agent_ids = set(self.agents) + + self.terminateds = set() + self.truncateds = set() + + # Provide full (preferred format) observation- and action-spaces as Dicts + # mapping agent IDs to the individual agents' spaces. + self._obs_space_in_preferred_format = True + self.observation_space = gym.spaces.Dict( + { + "agent0": gym.spaces.Box(low=-1.0, high=1.0, shape=(10,)), + "agent1": gym.spaces.Box(low=-1.0, high=1.0, shape=(20,)), + } + ) + self._action_space_in_preferred_format = True + self.action_space = gym.spaces.Dict( + {"agent0": gym.spaces.Discrete(2), "agent1": gym.spaces.Discrete(3)} + ) + + super().__init__() + + def reset(self, *, seed=None, options=None): + self.terminateds = set() + self.truncateds = set() + return {i: self.observation_space[i].sample() for i in self.agents}, {} + + def step(self, action_dict): + obs, rew, terminated, truncated, info = {}, {}, {}, {}, {} + for i, action in action_dict.items(): + obs[i] = self.observation_space[i].sample() + rew[i] = 0.0 + terminated[i] = False + truncated[i] = False + info[i] = {} + terminated["__all__"] = len(self.terminateds) == len(self.agents) + truncated["__all__"] = len(self.truncateds) == len(self.agents) + return obs, rew, terminated, truncated, info + + +parser = add_rllib_example_script_args( + default_iters=10, default_reward=80.0, default_timesteps=10000 +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment(env=BasicMultiAgentMultiSpaces) + .training(train_batch_size=1024) + .multi_agent( + # Use a simple set of policy IDs. Spaces for the individual policies + # are inferred automatically using reverse lookup via the + # `policy_mapping_fn` and the env provided spaces for the different + # agents. Alternatively, you could use: + # policies: {main0: PolicySpec(...), main1: PolicySpec} + policies={"main0", "main1"}, + # Simple mapping fn, mapping agent0 to main0 and agent1 to main1. + policy_mapping_fn=(lambda aid, episode, **kw: f"main{aid[-1]}"), + # Only train main0. 
+ policies_to_train=["main0"], + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/multi_agent_and_self_play/multi_agent_cartpole.py b/rllib/examples/multi_agent_and_self_play/multi_agent_cartpole.py new file mode 100644 index 0000000000000..fa2a1d7e2e27e --- /dev/null +++ b/rllib/examples/multi_agent_and_self_play/multi_agent_cartpole.py @@ -0,0 +1,66 @@ +"""Simple example of setting up an agent-to-module mapping function. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + +from ray.rllib.examples.env.multi_agent import MultiAgentCartPole +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=600.0, +) +parser.add_argument("--num-policies", type=int, default=2) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda _: MultiAgentCartPole(config={"num_agents": args.num_agents}), + ) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env" if args.num_agents > 0 else "CartPole-v1") + .rollouts( + # TODO (sven): MAEnvRunner does not support vectorized envs yet + # due to gym's env checkers and non-compatability with RLlib's + # MultiAgentEnv API. + num_envs_per_worker=1 + if args.num_agents > 0 + else 20, + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/multi_agent_and_self_play/pettingzoo_independent_learning.py b/rllib/examples/multi_agent_and_self_play/pettingzoo_independent_learning.py new file mode 100644 index 0000000000000..2b306176b0a69 --- /dev/null +++ b/rllib/examples/multi_agent_and_self_play/pettingzoo_independent_learning.py @@ -0,0 +1,108 @@ +"""Runs the PettingZoo Waterworld env in RLlib using independent multi-agent learning. + +See: https://pettingzoo.farama.org/environments/sisl/waterworld/ +for more details on the environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +This works with hundreds of agents and policies, but note that initializing +many policies might take some time. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +The above options can reach a combined reward of 0.0 or more after about 500k env +timesteps. Keep in mind, though, that due to the separate value functions (and +learned policies in general), one agent's gain (in per-agent reward) might cause the +other agent's reward to decrease at the same time. However, over time, both agents +should simply improve. + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_a82fc_00000 | TERMINATED | 127.0.0.1:28346 | 124 | 363.599 | ++---------------------+------------+-----------------+--------+------------------+ + ++--------+-------------------+--------------------+--------------------+ +| ts | combined reward | reward pursuer_1 | reward pursuer_0 | ++--------+-------------------+--------------------+--------------------| +| 496000 | 2.24542 | -34.6869 | 36.9324 | ++--------+-------------------+--------------------+--------------------+ + +Note that the two agents (`pursuer_0` and `pursuer_1`) are optimized on the exact same +objective and thus differences in the rewards can be attributed to weight initialization +(and sampling randomness) only. +""" +from pettingzoo.sisl import waterworld_v4 + +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=1000000, + default_reward=0.0, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents > 0, "Must set --num-agents > 0 when running this script!" + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Here, we use the "Agent Environment Cycle" (AEC) PettingZoo environment type. + # For a "Parallel" environment example, see the rock paper scissors examples + # in this same repository folder. + register_env("env", lambda _: PettingZooEnv(waterworld_v4.env())) + + # Policies are called just like the agents (exact 1:1 mapping). + policies = {f"pursuer_{i}" for i in range(args.num_agents)} + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .multi_agent( + policies=policies, + # Exact 1:1 mapping from AgentID to ModuleID. 
+ policy_mapping_fn=(lambda aid, *args, **kwargs: aid), + ) + .training( + model={ + "vf_share_layers": True, + }, + vf_loss_coeff=0.005, + ) + .rl_module( + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={p: SingleAgentRLModuleSpec() for p in policies}, + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/multi_agent_and_self_play/pettingzoo_parameter_sharing.py b/rllib/examples/multi_agent_and_self_play/pettingzoo_parameter_sharing.py new file mode 100644 index 0000000000000..55a1c173b3154 --- /dev/null +++ b/rllib/examples/multi_agent_and_self_play/pettingzoo_parameter_sharing.py @@ -0,0 +1,105 @@ +"""Runs the PettingZoo Waterworld multi-agent env in RLlib using single policy learning. + +Other than the `pettingzoo_independent_learning.py` example (in this same folder), +this example simply trains a single policy (shared by all agents). + +See: https://pettingzoo.farama.org/environments/sisl/waterworld/ +for more details on the environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +This works with hundreds of agents and policies, but note that initializing +many policies might take some time. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +The above options can reach a combined reward of roughly ~0.0 after about 500k-1M env +timesteps. Keep in mind, though, that in this setup, the agents do not have the +opportunity to benefit from or even out other agents' mistakes (and behavior in general) +as everyone is using the same policy. Hence, this example learns a more generic policy, +which might be less specialized to certain "niche exploitation opportunities" inside +the env: + ++---------------------+----------+-----------------+--------+-----------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+----------+-----------------+--------+-----------------+ +| PPO_env_91f49_00000 | RUNNING | 127.0.0.1:63676 | 200 | 605.176 | ++---------------------+----------+-----------------+--------+-----------------+ + ++--------+-------------------+-------------+ +| ts | combined reward | reward p0 | ++--------+-------------------+-------------| +| 800000 | 0.323752 | 0.161876 | ++--------+-------------------+-------------+ +""" +from pettingzoo.sisl import waterworld_v4 + +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=1000000, + default_reward=0.0, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents > 0, "Must set --num-agents > 0 when running this script!" 
+ assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Here, we use the "Agent Environment Cycle" (AEC) PettingZoo environment type. + # For a "Parallel" environment example, see the rock paper scissors examples + # in this same repository folder. + register_env("env", lambda _: PettingZooEnv(waterworld_v4.env())) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .multi_agent( + policies={"p0"}, + # All agents map to the exact same policy. + policy_mapping_fn=(lambda aid, *args, **kwargs: "p0"), + ) + .training( + model={ + "vf_share_layers": True, + }, + vf_loss_coeff=0.005, + ) + .rl_module( + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={"p0": SingleAgentRLModuleSpec()}, + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/multi_agent_and_self_play/pettingzoo_shared_value_function.py b/rllib/examples/multi_agent_and_self_play/pettingzoo_shared_value_function.py new file mode 100644 index 0000000000000..7285b9bedc6dc --- /dev/null +++ b/rllib/examples/multi_agent_and_self_play/pettingzoo_shared_value_function.py @@ -0,0 +1,7 @@ +msg = """ +This script is NOT yet ready, but will be available soon at this location. It will +feature a MultiAgentRLModule with one shared value function and n policy heads for +cooperative multi-agent learning. +""" + +raise NotImplementedError(msg) diff --git a/rllib/examples/multi_agent_and_self_play/rock_paper_scissors_heuristic_vs_learnt.py b/rllib/examples/multi_agent_and_self_play/rock_paper_scissors_heuristic_vs_learnt.py new file mode 100644 index 0000000000000..be6e8529fe31c --- /dev/null +++ b/rllib/examples/multi_agent_and_self_play/rock_paper_scissors_heuristic_vs_learnt.py @@ -0,0 +1,147 @@ +"""A simple multi-agent env with two agents play rock paper scissors. + +This demonstrates running the following policies in competition: + Agent 1: heuristic policy of repeating the same move + OR: heuristic policy of beating the last opponent move + Agent 2: Simple, feedforward PPO policy + OR: PPO Policy with an LSTM network + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2 [--use-lstm]?` + +Without `--use-lstm`, Agent 2 should quickly reach a reward of ~7.0, always +beating the `always_same` policy, but only 50% of the time beating the `beat_last` +policy. + +With `--use-lstm`, Agent 2 should eventually(!) reach a reward of >9.0 (always +beating both the `always_same` policy and the `beat_last` policy). + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
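Framework-free sketch of the two heuristics described above, assuming the usual 0=rock, 1=paper, 2=scissors encoding; the actual implementations live in AlwaysSameHeuristicRLM and BeatLastHeuristicRLM:

def always_same(my_last_move: int) -> int:
    # Keep repeating whatever move was chosen first.
    return my_last_move


def beat_last(opponent_last_move: int) -> int:
    # Play the move that beats the opponent's previous move.
    return (opponent_last_move + 1) % 3


assert beat_last(0) == 1  # opponent played rock -> answer with paper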
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" +import random + +import gymnasium as gym +from pettingzoo.classic import rps_v2 + +from ray.rllib.connectors.env_to_module import ( + AddObservationsFromEpisodesToBatch, + FlattenObservations, + WriteObservationsToEpisodes, +) +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.rllib.examples.rl_module.classes import ( + AlwaysSameHeuristicRLM, + BeatLastHeuristicRLM, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=50, + default_timesteps=200000, + default_reward=6.0, +) +parser.add_argument( + "--use-lstm", + action="store_true", + help="Whether to use an LSTM wrapped module instead of a simple MLP one. With LSTM " + "the reward diff can reach 7.0, without only 5.0.", +) + + +register_env( + "RockPaperScissors", + lambda _: ParallelPettingZooEnv(rps_v2.parallel_env()), +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("RockPaperScissors") + .rollouts( + env_to_module_connector=lambda env: ( + AddObservationsFromEpisodesToBatch(), + # Only flatten obs for the learning RLModul + FlattenObservations(multi_agent=True, agent_ids={"player_0"}), + WriteObservationsToEpisodes(), + ), + ) + .multi_agent( + policies={"always_same", "beat_last", "learned"}, + # Let learning Policy always play against either heuristic one: + # `always_same` or `beat_last`. + policy_mapping_fn=lambda aid, episode: ( + "learned" + if aid == "player_0" + else random.choice(["always_same", "beat_last"]) + ), + # Must define this as both heuristic RLMs will throw an error, if their + # `forward_train` is called. + policies_to_train=["learned"], + ) + .training( + model={ + "use_lstm": args.use_lstm, + # Use a simpler FCNet when we also have an LSTM. + "fcnet_hiddens": [32] if args.use_lstm else [256, 256], + "lstm_cell_size": 256, + "max_seq_len": 15, + "vf_share_layers": True, + }, + vf_loss_coeff=0.005, + ) + .rl_module( + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={ + "always_same": SingleAgentRLModuleSpec( + module_class=AlwaysSameHeuristicRLM, + observation_space=gym.spaces.Discrete(4), + action_space=gym.spaces.Discrete(3), + ), + "beat_last": SingleAgentRLModuleSpec( + module_class=BeatLastHeuristicRLM, + observation_space=gym.spaces.Discrete(4), + action_space=gym.spaces.Discrete(3), + ), + "learned": SingleAgentRLModuleSpec(), + } + ) + ) + ) + + # Make `args.stop_reward` "point" to the reward of the learned policy. 
+    stop = {
+        "training_iteration": args.stop_iters,
+        "sampler_results/policy_reward_mean/learned": args.stop_reward,
+        "timesteps_total": args.stop_timesteps,
+    }
+
+    run_rllib_example_script_experiment(
+        base_config,
+        args,
+        stop=stop,
+        success_metric="sampler_results/policy_reward_mean/learned",
+    )
diff --git a/rllib/examples/multi_agent_and_self_play/rock_paper_scissors_learnt_vs_learnt.py b/rllib/examples/multi_agent_and_self_play/rock_paper_scissors_learnt_vs_learnt.py
new file mode 100644
index 0000000000000..9469b31d3e411
--- /dev/null
+++ b/rllib/examples/multi_agent_and_self_play/rock_paper_scissors_learnt_vs_learnt.py
@@ -0,0 +1,96 @@
+"""A simple multi-agent env in which two agents play rock paper scissors.
+
+This demonstrates running two learning policies in competition, both using the same
+RLlib algorithm (PPO by default).
+
+The combined reward as well as the individual rewards should roughly remain at 0.0,
+as no policy should, in the long run, be able to learn a better strategy than
+choosing actions at random. However, for some time, one or the other policy may
+exploit a "stochastic weakness" of the opponent policy. For example, policy `A`
+learns that its opponent `B` has learnt to choose "paper" more often, which in turn
+makes `A` choose "scissors" more often as a countermeasure.
+"""
+import re
+
+from pettingzoo.classic import rps_v2
+
+from ray.rllib.connectors.env_to_module import (
+    AddObservationsFromEpisodesToBatch,
+    FlattenObservations,
+    WriteObservationsToEpisodes,
+)
+from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec
+from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
+from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls, register_env
+
+
+parser = add_rllib_example_script_args(
+    default_iters=50,
+    default_timesteps=200000,
+    default_reward=6.0,
+)
+parser.add_argument(
+    "--use-lstm",
+    action="store_true",
+    help="Whether to use an LSTM wrapped module instead of a simple MLP one. With an "
+    "LSTM, the reward diff can reach 7.0; without one, only 5.0.",
+)
+
+
+register_env(
+    "RockPaperScissors",
+    lambda _: ParallelPettingZooEnv(rps_v2.parallel_env()),
+)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    assert args.num_agents == 2, "Must set --num-agents=2 when running this script!"
+    assert (
+        args.enable_new_api_stack
+    ), "Must set --enable-new-api-stack when running this script!"
+
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment("RockPaperScissors")
+        .rollouts(
+            env_to_module_connector=lambda env: (
+                AddObservationsFromEpisodesToBatch(),
+                FlattenObservations(multi_agent=True),
+                WriteObservationsToEpisodes(),
+            ),
+        )
+        .multi_agent(
+            policies={"p0", "p1"},
+            # `player_0` uses `p0`, `player_1` uses `p1`.
+            policy_mapping_fn=lambda aid, episode: re.sub("^player_", "p", aid),
+        )
+        .training(
+            model={
+                "use_lstm": args.use_lstm,
+                # Use a simpler FCNet when we also have an LSTM.
+ "fcnet_hiddens": [32] if args.use_lstm else [256, 256], + "lstm_cell_size": 256, + "max_seq_len": 15, + "vf_share_layers": True, + }, + vf_loss_coeff=0.005, + ) + .rl_module( + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={ + "p0": SingleAgentRLModuleSpec(), + "p1": SingleAgentRLModuleSpec(), + } + ) + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/multi_agent_and_self_play/self_play_league_based_with_open_spiel.py b/rllib/examples/multi_agent_and_self_play/self_play_league_based_with_open_spiel.py new file mode 100644 index 0000000000000..eb0098f52b9c1 --- /dev/null +++ b/rllib/examples/multi_agent_and_self_play/self_play_league_based_with_open_spiel.py @@ -0,0 +1,288 @@ +"""Example showing how to implement a league-based training workflow. + +Uses the open spiel adapter of RLlib with the "markov_soccer" game and +a simplified multi-agent, league-based setup: +https://deepmind.com/blog/article/AlphaStar-Grandmaster-level-in- \ +StarCraft-II-using-multi-agent-reinforcement-learning + +Our league consists of three groups of policies: +- main policies: The current main policy plus prior versions of it. +- main exploiters: Trained by playing only against different "main policies". +- league exploiters: Trained by playing against any policy in the league. + +We start with 1 policy from each group, setting all 3 of these to an initial +PPO policy and allowing all 3 policies to be trained. +After each train update - via our custom callback - we decide for each +trainable policy, whether to make a copy and freeze it. Frozen policies +will not be altered anymore. However, they remain in the league for +future matches against trainable policies. +Matchmaking happens via a policy_mapping_fn, which needs to be altered +after every change (addition) to the league. The mapping function +randomly maps agents in a way, such that: +- Frozen main exploiters play against the one (currently trainable) main + policy. +- Trainable main exploiters play against any main policy (including already + frozen main policies). +- Frozen league exploiters play against any trainable policy in the league. +- Trainable league exploiters play against any policy in the league. + +After training for n iterations, a configurable number of episodes can +be played by the user against the "main" agent on the command line. 
+""" +import functools + +import numpy as np + +import ray +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner +from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel +from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv +from ray.rllib.examples.multi_agent_and_self_play.utils import ( + ask_user_for_action, + SelfPlayLeagueBasedCallback, + SelfPlayLeagueBasedCallbackOldAPIStack, +) +from ray.rllib.examples.policy.random_policy import RandomPolicy +from ray.rllib.examples.rl_module.random_rl_module import RandomRLModule +from ray.rllib.policy.policy import PolicySpec +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +open_spiel = try_import_open_spiel(error=True) +pyspiel = try_import_pyspiel(error=True) + +# Import after try_import_open_spiel, so we can error out with hints +from open_spiel.python.rl_environment import Environment # noqa: E402 + + +parser = add_rllib_example_script_args(default_timesteps=2000000) +parser.add_argument( + "--env", + type=str, + default="markov_soccer", + choices=["markov_soccer", "connect_four"], +) +parser.add_argument( + "--win-rate-threshold", + type=float, + default=0.85, + help="Win-rate at which we setup another opponent by freezing the " + "current main policy and playing against a uniform distribution " + "of previously frozen 'main's from here on.", +) +parser.add_argument( + "--min-league-size", + type=float, + default=8, + help="Minimum number of policies/RLModules to consider the test passed. " + "The initial league size is 2: `main` and `random`. " + "`--min-league-size=3` thus means that one new policy/RLModule has been " + "added so far (b/c the `main` one has reached the `--win-rate-threshold " + "against the `random` Policy/RLModule).", +) +parser.add_argument( + "--num-episodes-human-play", + type=int, + default=0, + help="How many episodes to play against the user on the command " + "line after training has finished.", +) +parser.add_argument( + "--from-checkpoint", + type=str, + default=None, + help="Full path to a checkpoint file for restoring a previously saved " + "Algorithm state.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + register_env( + "open_spiel_env", + lambda _: OpenSpielEnv(pyspiel.load_game(args.env)), + ) + + def policy_mapping_fn(agent_id, episode, worker=None, **kwargs): + # At first, only have main play against the random main exploiter. + return "main" if episode.episode_id % 2 == agent_id else "main_exploiter_0" + + def agent_to_module_mapping_fn(agent_id, episode, **kwargs): + # At first, only have main play against the random main exploiter. + return "main" if hash(episode.id_) % 2 == agent_id else "main_exploiter_0" + + def _get_multi_agent(): + names = { + # Our main policy, we'd like to optimize. + "main", + # First frozen version of main (after we reach n% win-rate). + "main_0", + # Initial main exploiters (one random, one trainable). + "main_exploiter_0", + "main_exploiter_1", + # Initial league exploiters (one random, one trainable). 
+ "league_exploiter_0", + "league_exploiter_1", + } + if args.enable_new_api_stack: + policies = names + spec = { + mid: SingleAgentRLModuleSpec( + module_class=( + RandomRLModule + if mid in ["main_exploiter_0", "league_exploiter_0"] + else None + ) + ) + for mid in names + } + else: + policies = { + mid: PolicySpec( + policy_class=( + RandomPolicy + if mid in ["main_exploiter_0", "league_exploiter_0"] + else None + ) + ) + for mid in names + } + spec = None + return {"policies": policies, "spec": spec} + + config = ( + get_trainable_cls(args.algo) + .get_default_config() + # Use new API stack ... + .experimental(_enable_new_api_stack=args.enable_new_api_stack) + .environment("open_spiel_env") + .framework(args.framework) + # Set up the main piece in this experiment: The league-bases self-play + # callback, which controls adding new policies/Modules to the league and + # properly matching the different policies in the league with each other. + .callbacks( + functools.partial( + SelfPlayLeagueBasedCallback + if args.enable_new_api_stack + else SelfPlayLeagueBasedCallbackOldAPIStack, + win_rate_threshold=args.win_rate_threshold, + ) + ) + .rollouts( + num_rollout_workers=args.num_env_runners, + num_envs_per_worker=1 if args.enable_new_api_stack else 5, + # Set up the correct env-runner to use depending on + # old-stack/new-stack and multi-agent settings. + env_runner_cls=( + None if not args.enable_new_api_stack else MultiAgentEnvRunner + ), + ) + .resources( + num_learner_workers=args.num_gpus, + num_gpus_per_learner_worker=1 if args.num_gpus else 0, + num_cpus_for_local_worker=1, + ) + .training( + num_sgd_iter=20, + model=dict( + **({"uses_new_env_runners": True} if args.enable_new_api_stack else {}), + ), + ) + .multi_agent( + # Initial policy map: All PPO. This will be expanded + # to more policy snapshots. This is done in the + # custom callback defined above (`LeagueBasedSelfPlayCallback`). + policies=_get_multi_agent()["policies"], + policy_mapping_fn=( + agent_to_module_mapping_fn + if args.enable_new_api_stack + else policy_mapping_fn + ), + # At first, only train main_0 (until good enough to win against + # random). + policies_to_train=["main"], + ) + .rl_module( + rl_module_spec=MultiAgentRLModuleSpec( + module_specs=_get_multi_agent()["spec"] + ), + ) + ) + + # Run everything as configured. + # Train the "main" policy to play really well using self-play. + results = None + if not args.from_checkpoint: + stop = { + "timesteps_total": args.stop_timesteps, + "training_iteration": args.stop_iters, + "league_size": args.min_league_size, + } + results = run_rllib_example_script_experiment(config, args, stop=stop) + + # Restore trained Algorithm (set to non-explore behavior) and play against + # human on command line. + if args.num_episodes_human_play > 0: + num_episodes = 0 + # Switch off exploration for better inference performance. + config.explore = False + algo = config.build() + if args.from_checkpoint: + algo.restore(args.from_checkpoint) + else: + checkpoint = results.get_best_result().checkpoint + if not checkpoint: + raise ValueError("No last checkpoint found in results!") + algo.restore(checkpoint) + + # Play from the command line against the trained agent + # in an actual (non-RLlib-wrapped) open-spiel env. 
+ human_player = 1 + env = Environment(args.env) + + while num_episodes < args.num_episodes_human_play: + print("You play as {}".format("o" if human_player else "x")) + time_step = env.reset() + while not time_step.last(): + player_id = time_step.observations["current_player"] + if player_id == human_player: + action = ask_user_for_action(time_step) + else: + obs = np.array(time_step.observations["info_state"][player_id]) + if config.uses_new_env_runners: + action = algo.workers.local_worker().module.forward_inference( + {"obs": obs} + ) + else: + action = algo.compute_single_action(obs, policy_id="main") + # In case computer chooses an invalid action, pick a + # random one. + legal = time_step.observations["legal_actions"][player_id] + if action not in legal: + action = np.random.choice(legal) + time_step = env.step([action]) + print(f"\n{env.get_state}") + + print(f"\n{env.get_state}") + + print("End of game!") + if time_step.rewards[human_player] > 0: + print("You win") + elif time_step.rewards[human_player] < 0: + print("You lose") + else: + print("Draw") + # Switch order of players + human_player = 1 - human_player + + num_episodes += 1 + + algo.stop() + + ray.shutdown() diff --git a/rllib/examples/multi_agent_and_self_play/self_play_with_open_spiel.py b/rllib/examples/multi_agent_and_self_play/self_play_with_open_spiel.py new file mode 100644 index 0000000000000..cc080082dae7d --- /dev/null +++ b/rllib/examples/multi_agent_and_self_play/self_play_with_open_spiel.py @@ -0,0 +1,241 @@ +"""Example showing how one can implement a simple self-play training workflow. + +Uses the open spiel adapter of RLlib with the "connect_four" game and +a multi-agent setup with a "main" policy and n "main_v[x]" policies +(x=version number), which are all at-some-point-frozen copies of +"main". At the very beginning, "main" plays against RandomPolicy. + +Checks for the training progress after each training update via a custom +callback. We simply measure the win rate of "main" vs the opponent +("main_v[x]" or RandomPolicy at the beginning) by looking through the +achieved rewards in the episodes in the train batch. If this win rate +reaches some configurable threshold, we add a new policy to +the policy map (a frozen copy of the current "main" one) and change the +policy_mapping_fn to make new matches of "main" vs any of the previous +versions of "main" (including the just added one). + +After training for n iterations, a configurable number of episodes can +be played by the user against the "main" agent on the command line. 
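As a rough sketch (not part of the example file) of the win-rate check described above: assuming a win shows up as a positive episode reward for "main" and a loss as a negative one (as in connect_four), the snapshot decision boils down to a simple ratio.

def should_snapshot_main(main_episode_rewards, win_rate_threshold=0.95):
    """Return True if "main" won often enough to be frozen and copied."""
    if not main_episode_rewards:
        return False
    wins = sum(1 for r in main_episode_rewards if r > 0.0)
    return wins / len(main_episode_rewards) >= win_rate_threshold


# 7 wins out of 8 episodes -> win rate 0.875 < 0.95, so keep training "main":
print(should_snapshot_main([1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0]))  # False

The actual `SelfPlayCallback` additionally adds the frozen copy to the policy map and swaps in an updated `policy_mapping_fn`, as described above.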
+""" + +import functools + +import numpy as np + +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner +from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel +from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv +from ray.rllib.examples.rl_module.random_rl_module import RandomRLModule +from ray.rllib.examples.multi_agent_and_self_play.utils import ( + ask_user_for_action, + SelfPlayCallback, + SelfPlayCallbackOldAPIStack, +) +from ray.rllib.examples.policy.random_policy import RandomPolicy +from ray.rllib.policy.policy import PolicySpec +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +open_spiel = try_import_open_spiel(error=True) +pyspiel = try_import_pyspiel(error=True) + +# Import after try_import_open_spiel, so we can error out with hints. +from open_spiel.python.rl_environment import Environment # noqa: E402 + + +parser = add_rllib_example_script_args(default_timesteps=2000000) +parser.add_argument( + "--env", + type=str, + default="connect_four", + choices=["markov_soccer", "connect_four"], +) +parser.add_argument( + "--win-rate-threshold", + type=float, + default=0.95, + help="Win-rate at which we setup another opponent by freezing the " + "current main policy and playing against a uniform distribution " + "of previously frozen 'main's from here on.", +) +parser.add_argument( + "--min-league-size", + type=float, + default=3, + help="Minimum number of policies/RLModules to consider the test passed. " + "The initial league size is 2: `main` and `random`. " + "`--min-league-size=3` thus means that one new policy/RLModule has been " + "added so far (b/c the `main` one has reached the `--win-rate-threshold " + "against the `random` Policy/RLModule).", +) +parser.add_argument( + "--num-episodes-human-play", + type=int, + default=10, + help="How many episodes to play against the user on the command " + "line after training has finished.", +) +parser.add_argument( + "--from-checkpoint", + type=str, + default=None, + help="Full path to a checkpoint file for restoring a previously saved " + "Algorithm state.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + register_env("open_spiel_env", lambda _: OpenSpielEnv(pyspiel.load_game(args.env))) + + def agent_to_module_mapping_fn(agent_id, episode, **kwargs): + # agent_id = [0|1] -> module depends on episode ID + # This way, we make sure that both modules sometimes play agent0 + # (start player) and sometimes agent1 (player to move 2nd). + return "main" if hash(episode.id_) % 2 == agent_id else "random" + + def policy_mapping_fn(agent_id, episode, worker, **kwargs): + return "main" if episode.episode_id % 2 == agent_id else "random" + + config = ( + get_trainable_cls(args.algo) + .get_default_config() + .experimental(_enable_new_api_stack=args.enable_new_api_stack) + .environment("open_spiel_env") + .framework(args.framework) + # Set up the main piece in this experiment: The league-bases self-play + # callback, which controls adding new policies/Modules to the league and + # properly matching the different policies in the league with each other. 
+ .callbacks( + functools.partial( + SelfPlayCallback + if args.enable_new_api_stack + else SelfPlayCallbackOldAPIStack, + win_rate_threshold=args.win_rate_threshold, + ) + ) + .rollouts( + num_rollout_workers=args.num_env_runners, + num_envs_per_worker=1 if args.enable_new_api_stack else 5, + # Set up the correct env-runner to use depending on + # old-stack/new-stack and multi-agent settings. + env_runner_cls=( + None if not args.enable_new_api_stack else MultiAgentEnvRunner + ), + ) + .resources( + num_learner_workers=args.num_gpus, + num_gpus_per_learner_worker=1 if args.num_gpus else 0, + num_cpus_for_local_worker=1, + ) + .training(model={"fcnet_hiddens": [512, 512]}) + .multi_agent( + # Initial policy map: Random and default algo one. This will be expanded + # to more policy snapshots taken from "main" against which "main" + # will then play (instead of "random"). This is done in the + # custom callback defined above (`SelfPlayCallback`). + policies=( + { + # Our main policy, we'd like to optimize. + "main": PolicySpec(), + # An initial random opponent to play against. + "random": PolicySpec(policy_class=RandomPolicy), + } + if not args.enable_new_api_stack + else {"main", "random"} + ), + # Assign agent 0 and 1 randomly to the "main" policy or + # to the opponent ("random" at first). Make sure (via episode_id) + # that "main" always plays against "random" (and not against + # another "main"). + policy_mapping_fn=( + agent_to_module_mapping_fn + if args.enable_new_api_stack + else policy_mapping_fn + ), + # Always just train the "main" policy. + policies_to_train=["main"], + ) + .rl_module( + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={ + "main": SingleAgentRLModuleSpec(), + "random": SingleAgentRLModuleSpec(module_class=RandomRLModule), + } + ), + ) + ) + + # Only for PPO, change the `num_sgd_iter` setting. + if args.algo == "PPO": + config.training(num_sgd_iter=20) + + stop = { + "timesteps_total": args.stop_timesteps, + "training_iteration": args.stop_iters, + "league_size": args.min_league_size, + } + + # Train the "main" policy to play really well using self-play. + results = None + if not args.from_checkpoint: + results = run_rllib_example_script_experiment(config, args, stop=stop) + + # Restore trained Algorithm (set to non-explore behavior) and play against + # human on command line. + if args.num_episodes_human_play > 0: + num_episodes = 0 + config.explore = False + algo = config.build() + if args.from_checkpoint: + algo.restore(args.from_checkpoint) + else: + checkpoint = results.get_best_result().checkpoint + if not checkpoint: + raise ValueError("No last checkpoint found in results!") + algo.restore(checkpoint) + + # Play from the command line against the trained agent + # in an actual (non-RLlib-wrapped) open-spiel env. + human_player = 1 + env = Environment(args.env) + + while num_episodes < args.num_episodes_human_play: + print("You play as {}".format("o" if human_player else "x")) + time_step = env.reset() + while not time_step.last(): + player_id = time_step.observations["current_player"] + if player_id == human_player: + action = ask_user_for_action(time_step) + else: + obs = np.array(time_step.observations["info_state"][player_id]) + action = algo.compute_single_action(obs, policy_id="main") + # In case computer chooses an invalid action, pick a + # random one. 
+                    legal = time_step.observations["legal_actions"][player_id]
+                    if action not in legal:
+                        action = np.random.choice(legal)
+                time_step = env.step([action])
+                print(f"\n{env.get_state}")
+
+            print(f"\n{env.get_state}")
+
+            print("End of game!")
+            if time_step.rewards[human_player] > 0:
+                print("You win")
+            elif time_step.rewards[human_player] < 0:
+                print("You lose")
+            else:
+                print("Draw")
+            # Switch order of players.
+            human_player = 1 - human_player
+
+            num_episodes += 1
+
+        algo.stop()
diff --git a/rllib/examples/multi_agent_and_self_play/two_step_game_with_grouped_agents.py b/rllib/examples/multi_agent_and_self_play/two_step_game_with_grouped_agents.py
new file mode 100644
index 0000000000000..1029625e161a4
--- /dev/null
+++ b/rllib/examples/multi_agent_and_self_play/two_step_game_with_grouped_agents.py
@@ -0,0 +1,98 @@
+"""The two-step game from the QMIX paper:
+https://arxiv.org/pdf/1803.11485.pdf
+
+See also: rllib/examples/centralized_critic.py for centralized critic PPO on this game.
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --num-agents=2`
+
+Note that in this script, we use a multi-agent environment in which both
+agents that normally play this game have been merged into one agent with ID
+"agents", whose observation- and action-spaces are 2-tuples (one item per
+agent). The "agents" agent is mapped to the policy with ID "p0".
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+You should expect a reward of 8.0 (the maximum reachable in this game) eventually
+being achieved by a simple PPO policy (no tuning, just using RLlib's default settings):
+
++---------------------------------+------------+-----------------+--------+
+| Trial name                      | status     | loc             |   iter |
+|---------------------------------+------------+-----------------+--------+
+| PPO_grouped_twostep_4354b_00000 | TERMINATED | 127.0.0.1:42602 |     20 |
++---------------------------------+------------+-----------------+--------+
+
++------------------+-------+-------------------+-------------+
+|   total time (s) |    ts |   combined reward |   reward p0 |
++------------------+-------+-------------------+-------------|
+|          87.5756 | 80000 |                 8 |           8 |
++------------------+-------+-------------------+-------------+
+"""
+
+from ray.rllib.connectors.env_to_module import (
+    AddObservationsFromEpisodesToBatch,
+    FlattenObservations,
+    WriteObservationsToEpisodes,
+)
+from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec
+from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
+from ray.rllib.examples.env.two_step_game import TwoStepGameWithGroupedAgents
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import register_env, get_trainable_cls
+
+
+parser = add_rllib_example_script_args(default_reward=7.0)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    assert args.num_agents == 2, "Must set --num-agents=2 when running this script!"
+    assert (
+        args.enable_new_api_stack
+    ), "Must set --enable-new-api-stack when running this script!"
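To illustrate what "merged into one agent" with 2-tupled spaces means, here is a minimal sketch (not part of this script) of how two agents of the base two-step game could be grouped into a single logical agent. The per-agent `Discrete` spaces below are assumptions for illustration only; the grouping actually used by this example is implemented in `TwoStepGameWithGroupedAgents`.

from gymnasium.spaces import Discrete, Tuple as TupleSpace

from ray.rllib.examples.env.two_step_game import TwoStepGame

# Merge agent IDs 0 and 1 into one logical agent called "agents".
grouping = {"agents": [0, 1]}
# Assumed per-agent spaces (check TwoStepGame for the authoritative ones).
obs_space = TupleSpace([Discrete(6), Discrete(6)])
act_space = TupleSpace([Discrete(2), Discrete(2)])


def make_grouped_two_step(config):
    # `with_agent_groups` wraps a MultiAgentEnv so that grouped agents step
    # together and exchange tuples of observations and actions.
    return TwoStepGame(config).with_agent_groups(
        grouping, obs_space=obs_space, act_space=act_space
    )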
+ + register_env( + "grouped_twostep", + lambda config: TwoStepGameWithGroupedAgents(config), + ) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("grouped_twostep") + .rollouts( + env_to_module_connector=lambda env: ( + AddObservationsFromEpisodesToBatch(), + FlattenObservations(multi_agent=True), + WriteObservationsToEpisodes(), + ), + ) + .multi_agent( + policies={"p0"}, + policy_mapping_fn=lambda aid, *a, **kw: "p0", + ) + .rl_module( + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={ + "p0": SingleAgentRLModuleSpec(), + }, + ) + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/multi_agent_and_self_play/utils.py b/rllib/examples/multi_agent_and_self_play/utils.py deleted file mode 100644 index 2cf1c859a21bd..0000000000000 --- a/rllib/examples/multi_agent_and_self_play/utils.py +++ /dev/null @@ -1,23 +0,0 @@ -import sys - - -def ask_user_for_action(time_step): - """Asks the user for a valid action on the command line and returns it. - - Re-queries the user until she picks a valid one. - - Args: - time_step: The open spiel Environment time-step object. - """ - pid = time_step.observations["current_player"] - legal_moves = time_step.observations["legal_actions"][pid] - choice = -1 - while choice not in legal_moves: - print("Choose an action from {}:".format(legal_moves)) - sys.stdout.flush() - choice_str = input() - try: - choice = int(choice_str) - except ValueError: - continue - return choice diff --git a/rllib/examples/multi_agent_and_self_play/utils/__init__.py b/rllib/examples/multi_agent_and_self_play/utils/__init__.py new file mode 100644 index 0000000000000..40874aa145430 --- /dev/null +++ b/rllib/examples/multi_agent_and_self_play/utils/__init__.py @@ -0,0 +1,45 @@ +import sys + +from ray.rllib.examples.multi_agent_and_self_play.utils.self_play_callback import ( + SelfPlayCallback, +) +from ray.rllib.examples.multi_agent_and_self_play.utils.self_play_league_based_callback import ( # noqa + SelfPlayLeagueBasedCallback, +) +from ray.rllib.examples.multi_agent_and_self_play.utils.self_play_callback_old_api_stack import ( # noqa + SelfPlayCallbackOldAPIStack, +) +from ray.rllib.examples.multi_agent_and_self_play.utils.self_play_league_based_callback_old_api_stack import ( # noqa + SelfPlayLeagueBasedCallbackOldAPIStack, +) + + +def ask_user_for_action(time_step): + """Asks the user for a valid action on the command line and returns it. + + Re-queries the user until she picks a valid one. + + Args: + time_step: The open spiel Environment time-step object. 
+ """ + pid = time_step.observations["current_player"] + legal_moves = time_step.observations["legal_actions"][pid] + choice = -1 + while choice not in legal_moves: + print("Choose an action from {}:".format(legal_moves)) + sys.stdout.flush() + choice_str = input() + try: + choice = int(choice_str) + except ValueError: + continue + return choice + + +__all__ = [ + "ask_user_for_action", + "SelfPlayCallback", + "SelfPlayLeagueBasedCallback", + "SelfPlayCallbackOldAPIStack", + "SelfPlayLeagueBasedCallbackOldAPIStack", +] diff --git a/rllib/examples/multi_agent_and_self_play/self_play_callback.py b/rllib/examples/multi_agent_and_self_play/utils/self_play_callback.py similarity index 100% rename from rllib/examples/multi_agent_and_self_play/self_play_callback.py rename to rllib/examples/multi_agent_and_self_play/utils/self_play_callback.py diff --git a/rllib/examples/multi_agent_and_self_play/self_play_callback_old_api_stack.py b/rllib/examples/multi_agent_and_self_play/utils/self_play_callback_old_api_stack.py similarity index 100% rename from rllib/examples/multi_agent_and_self_play/self_play_callback_old_api_stack.py rename to rllib/examples/multi_agent_and_self_play/utils/self_play_callback_old_api_stack.py diff --git a/rllib/examples/multi_agent_and_self_play/self_play_league_based_callback.py b/rllib/examples/multi_agent_and_self_play/utils/self_play_league_based_callback.py similarity index 100% rename from rllib/examples/multi_agent_and_self_play/self_play_league_based_callback.py rename to rllib/examples/multi_agent_and_self_play/utils/self_play_league_based_callback.py diff --git a/rllib/examples/multi_agent_and_self_play/self_play_league_based_callback_old_api_stack.py b/rllib/examples/multi_agent_and_self_play/utils/self_play_league_based_callback_old_api_stack.py similarity index 100% rename from rllib/examples/multi_agent_and_self_play/self_play_league_based_callback_old_api_stack.py rename to rllib/examples/multi_agent_and_self_play/utils/self_play_league_based_callback_old_api_stack.py diff --git a/rllib/examples/multi_agent_cartpole.py b/rllib/examples/multi_agent_cartpole.py index b326772e59ceb..043f5317badfd 100644 --- a/rllib/examples/multi_agent_cartpole.py +++ b/rllib/examples/multi_agent_cartpole.py @@ -1,128 +1,6 @@ -"""Simple example of setting up a multi-agent policy mapping. - -Control the number of agents and policies via --num-agents and --num-policies. - -This works with hundreds of agents and policies, but note that initializing -many TF policies will take some time. - -Also, TF evals might slow down with large numbers of policies. To debug TF -execution, set the TF_TIMELINE_DIR environment variable. 
+msg = """ +This script has been moved to +`ray.rllib.examples.multi_agent_and_self_play.multi_agent_cartpole.py` """ -import argparse -import os -import random - -import ray -from ray import air, tune -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.examples.env.multi_agent import MultiAgentCartPole -from ray.rllib.examples.models.shared_weights_model import ( - SharedWeightsModel1, - SharedWeightsModel2, - TF2SharedWeightsModel, - TorchSharedWeightsModel, -) -from ray.rllib.models import ModelCatalog -from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.test_utils import check_learning_achieved - -tf1, tf, tfv = try_import_tf() - -parser = argparse.ArgumentParser() - -parser.add_argument("--num-agents", type=int, default=4) -parser.add_argument("--num-policies", type=int, default=2) -parser.add_argument("--num-cpus", type=int, default=0) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=200, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=150.0, help="Reward at which we stop training." -) - -if __name__ == "__main__": - args = parser.parse_args() - - ray.init(num_cpus=args.num_cpus or None) - - # Register the models to use. - if args.framework == "torch": - mod1 = mod2 = TorchSharedWeightsModel - elif args.framework == "tf2": - mod1 = mod2 = TF2SharedWeightsModel - else: - mod1 = SharedWeightsModel1 - mod2 = SharedWeightsModel2 - ModelCatalog.register_custom_model("model1", mod1) - ModelCatalog.register_custom_model("model2", mod2) - - # Each policy can have a different configuration (including custom model). - def gen_policy(i): - - # TODO (sven): Uncomment this when we move this example to the new API stack. - # if bool(os.environ.get("RLLIB_ENABLE_RL_MODULE", False)): - # # just change the gammas between the two policies. - # # changing the module is not a critical part of this example. - # # the important part is that the policies are different. - # config = { - # "gamma": random.choice([0.95, 0.99]), - # } - # else: - config = PPOConfig.overrides( - model={ - "custom_model": ["model1", "model2"][i % 2], - }, - gamma=random.choice([0.95, 0.99]), - ) - return PolicySpec(config=config) - - # Setup PPO with an ensemble of `num_policies` different policies. - policies = {"policy_{}".format(i): gen_policy(i) for i in range(args.num_policies)} - policy_ids = list(policies.keys()) - - def policy_mapping_fn(agent_id, episode, worker, **kwargs): - pol_id = random.choice(policy_ids) - return pol_id - - config = ( - PPOConfig() - .environment(MultiAgentCartPole, env_config={"num_agents": args.num_agents}) - .framework(args.framework) - .training(num_sgd_iter=10) - .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - stop = { - "episode_reward_mean": args.stop_reward, - "timesteps_total": args.stop_timesteps, - "training_iteration": args.stop_iters, - } - - results = tune.Tuner( - "PPO", - param_space=config.to_dict(), - run_config=air.RunConfig(stop=stop, verbose=1), - ).fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/examples/multi_agent_custom_policy.py b/rllib/examples/multi_agent_custom_policy.py index bac808a92419a..168bb39c8130d 100644 --- a/rllib/examples/multi_agent_custom_policy.py +++ b/rllib/examples/multi_agent_custom_policy.py @@ -1,117 +1,6 @@ -"""Example of running a custom hand-coded policy alongside trainable policies. - -This example has two policies: - (1) a simple simple policy trained with PPO optimizer - (2) a hand-coded policy that acts at random in the env (doesn't learn) - -In the console output, you can see the PPO policy does much better than random: -Result for PPO_multi_cartpole_0: - ... - policy_reward_mean: - learnable_policy: 185.23 - random: 21.255 - ... +msg = """ +This script has been moved to +`ray.rllib.examples.multi_agent_and_self_play.custom_heuristic_rl_module.py` """ -import argparse -import os - -import ray -from ray import air, tune -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.examples.env.multi_agent import MultiAgentCartPole -from ray.rllib.examples.policy.random_policy import RandomPolicy -from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import register_env - -# The new RLModule / Learner API -from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec -from ray.rllib.examples.rl_module.random_rl_module import RandomRLModule - -parser = argparse.ArgumentParser() -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=20, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=150.0, help="Reward at which we stop training." -) - -if __name__ == "__main__": - args = parser.parse_args() - ray.init() - - # Simple environment with 4 independent cartpole entities - register_env( - "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 4}) - ) - - config = ( - PPOConfig() - .environment("multi_agent_cartpole") - .framework(args.framework) - .multi_agent( - # The multiagent Policy map. - policies={ - # The Policy we are actually learning. - "learnable_policy": PolicySpec( - config=PPOConfig.overrides(framework_str=args.framework) - ), - # Random policy we are playing against. - "random": PolicySpec(policy_class=RandomPolicy), - }, - # Map to either random behavior or PR learning behavior based on - # the agent's ID. 
- policy_mapping_fn=lambda agent_id, *args, **kwargs: [ - "learnable_policy", - "random", - ][agent_id % 2], - # We wouldn't have to specify this here as the RandomPolicy does - # not learn anyways (it has an empty `learn_on_batch` method), but - # it's good practice to define this list here either way. - policies_to_train=["learnable_policy"], - ) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - .rl_module( - rl_module_spec=MultiAgentRLModuleSpec( - module_specs={ - "learnable_policy": SingleAgentRLModuleSpec(), - "random": SingleAgentRLModuleSpec(module_class=RandomRLModule), - } - ), - ) - ) - - stop = { - "training_iteration": args.stop_iters, - "episode_reward_mean": args.stop_reward, - "timesteps_total": args.stop_timesteps, - } - - results = tune.Tuner( - "PPO", - param_space=config.to_dict(), - run_config=air.RunConfig(stop=stop, verbose=1), - ).fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/examples/multi_agent_different_spaces_for_agents.py b/rllib/examples/multi_agent_different_spaces_for_agents.py index da62c4f216027..3774ce60d8bf8 100644 --- a/rllib/examples/multi_agent_different_spaces_for_agents.py +++ b/rllib/examples/multi_agent_different_spaces_for_agents.py @@ -1,165 +1,6 @@ +msg = """ +This script has been moved to +`ray.rllib.examples.multi_agent_and_self_play.different_spaces_for_agents.py` """ -Example showing how one can create a multi-agent env, in which the different agents -have different observation and action spaces. -These spaces do NOT necessarily have to be specified manually by the user. Instead, -RLlib will try to automatically infer them from the env provided spaces dicts -(agentID -> obs/act space) and the policy mapping fn (mapping agent IDs to policy IDs). ---- -Run this example with defaults (using Tune): - - $ python multi_agent_different_spaces_for_agents.py -""" - -import argparse -import gymnasium as gym -import os - -import ray -from ray import air, tune -from ray.rllib.env.multi_agent_env import MultiAgentEnv -from ray.tune.registry import get_trainable_cls - - -class BasicMultiAgentMultiSpaces(MultiAgentEnv): - """A simple multi-agent example environment where agents have different spaces. - - agent0: obs=(10,), act=Discrete(2) - agent1: obs=(20,), act=Discrete(3) - - The logic of the env doesn't really matter for this example. The point of this env - is to show how one can use multi-agent envs, in which the different agents utilize - different obs- and action spaces. - """ - - def __init__(self, config=None): - self.agents = {"agent0", "agent1"} - self._agent_ids = set(self.agents) - - self.terminateds = set() - self.truncateds = set() - - # Provide full (preferred format) observation- and action-spaces as Dicts - # mapping agent IDs to the individual agents' spaces. 
- self._obs_space_in_preferred_format = True - self.observation_space = gym.spaces.Dict( - { - "agent0": gym.spaces.Box(low=-1.0, high=1.0, shape=(10,)), - "agent1": gym.spaces.Box(low=-1.0, high=1.0, shape=(20,)), - } - ) - self._action_space_in_preferred_format = True - self.action_space = gym.spaces.Dict( - {"agent0": gym.spaces.Discrete(2), "agent1": gym.spaces.Discrete(3)} - ) - - super().__init__() - - def reset(self, *, seed=None, options=None): - self.terminateds = set() - self.truncateds = set() - return {i: self.observation_space[i].sample() for i in self.agents}, {} - - def step(self, action_dict): - obs, rew, terminated, truncated, info = {}, {}, {}, {}, {} - for i, action in action_dict.items(): - obs[i] = self.observation_space[i].sample() - rew[i] = 0.0 - terminated[i] = False - truncated[i] = False - info[i] = {} - terminated["__all__"] = len(self.terminateds) == len(self.agents) - truncated["__all__"] = len(self.truncateds) == len(self.agents) - return obs, rew, terminated, truncated, info - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - - # general args - parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." - ) - parser.add_argument("--num-cpus", type=int, default=0) - parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", - ) - parser.add_argument( - "--stop-iters", type=int, default=10, help="Number of iterations to train." - ) - parser.add_argument( - "--stop-timesteps", - type=int, - default=10000, - help="Number of timesteps to train.", - ) - parser.add_argument( - "--stop-reward", - type=float, - default=80.0, - help="Reward at which we stop training.", - ) - parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", - ) - - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - config = ( - get_trainable_cls(args.run) - .get_default_config() - .environment(env=BasicMultiAgentMultiSpaces) - .resources( - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")), - ) - .training(train_batch_size=1024) - .rollouts(num_rollout_workers=1, rollout_fragment_length="auto") - .framework(args.framework) - .multi_agent( - # Use a simple set of policy IDs. Spaces for the individual policies - # will be inferred automatically using reverse lookup via the - # `policy_mapping_fn` and the env provided spaces for the different - # agents. Alternatively, you could use: - # policies: {main0: PolicySpec(...), main1: PolicySpec} - policies={"main0", "main1"}, - # Simple mapping fn, mapping agent0 to main0 and agent1 to main1. - policy_mapping_fn=(lambda aid, episode, worker, **kw: f"main{aid[-1]}"), - # Only train main0. - policies_to_train=["main0"], - ) - ) - - results = tune.Tuner( - args.run, - run_config=air.RunConfig( - stop=stop, - ), - param_space=config, - ).fit() - - if not results: - raise ValueError( - "No results returned from tune.run(). Something must have gone wrong." 
- ) - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/examples/multi_agent_independent_learning.py b/rllib/examples/multi_agent_independent_learning.py index 2388935573fdd..99cb059c92901 100644 --- a/rllib/examples/multi_agent_independent_learning.py +++ b/rllib/examples/multi_agent_independent_learning.py @@ -1,60 +1,6 @@ -import argparse +msg = """ +This script has been moved to +`ray.rllib.examples.multi_agent_and_self_play.independent_learning.py` +""" -from ray import air, tune -from ray.tune.registry import register_env -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv -from pettingzoo.sisl import waterworld_v4 - -# Based on code from github.com/parametersharingmadrl/parametersharingmadrl - -parser = argparse.ArgumentParser() -parser.add_argument( - "--num-gpus", - type=int, - default=1, - help="Number of GPUs to use for training.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: Only one episode will be " - "sampled.", -) - -if __name__ == "__main__": - args = parser.parse_args() - - def env_creator(args): - return PettingZooEnv(waterworld_v4.env()) - - env = env_creator({}) - register_env("waterworld", env_creator) - - config = ( - PPOConfig() - .environment("waterworld") - .resources(num_gpus=args.num_gpus) - .rollouts(num_rollout_workers=2) - .multi_agent( - policies=env.get_agent_ids(), - policy_mapping_fn=(lambda agent_id, *args, **kwargs: agent_id), - ) - ) - - if args.as_test: - # Only a compilation test of running waterworld / independent learning. - stop = {"training_iteration": 1} - else: - stop = {"episodes_total": 60000} - - tune.Tuner( - "PPO", - run_config=air.RunConfig( - stop=stop, - checkpoint_config=air.CheckpointConfig( - checkpoint_frequency=10, - ), - ), - param_space=config, - ).fit() +raise NotImplementedError(msg) diff --git a/rllib/examples/multi_agent_parameter_sharing.py b/rllib/examples/multi_agent_parameter_sharing.py index 676b48670dfec..cd1a50f77e12c 100644 --- a/rllib/examples/multi_agent_parameter_sharing.py +++ b/rllib/examples/multi_agent_parameter_sharing.py @@ -1,55 +1,6 @@ -from ray import air, tune -from ray.tune.registry import register_env -from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv -from pettingzoo.sisl import waterworld_v4 +msg = """ +This script has been moved to +`ray.rllib.examples.multi_agent_and_self_play.parameter_sharing.py` +""" -# TODO (Kourosh): Noticed that the env is broken and throws an error in this test. -# The error is ValueError: Input vector should be 1-D. (Could be pettingzoo version -# issue) -# Based on code from github.com/parametersharingmadrl/parametersharingmadrl - -if __name__ == "__main__": - # RDQN - Rainbow DQN - # ADQN - Apex DQN - - register_env("waterworld", lambda _: PettingZooEnv(waterworld_v4.env())) - - tune.Tuner( - "APEX_DDPG", - run_config=air.RunConfig( - stop={"episodes_total": 60000}, - checkpoint_config=air.CheckpointConfig( - checkpoint_frequency=10, - ), - ), - param_space={ - # Enviroment specific. 
- "env": "waterworld", - # General - "num_gpus": 1, - "num_workers": 2, - "num_envs_per_worker": 8, - "replay_buffer_config": { - "capacity": int(1e5), - "prioritized_replay_alpha": 0.5, - }, - "num_steps_sampled_before_learning_starts": 1000, - "compress_observations": True, - "rollout_fragment_length": 20, - "train_batch_size": 512, - "gamma": 0.99, - "n_step": 3, - "lr": 0.0001, - "target_network_update_freq": 50000, - "min_sample_timesteps_per_iteration": 25000, - # Method specific. - # We only have one policy (calling it "shared"). - # Class, obs/act-spaces, and config will be derived - # automatically. - "policies": {"shared_policy"}, - # Always use "shared" policy. - "policy_mapping_fn": ( - lambda agent_id, episode, worker, **kwargs: "shared_policy" - ), - }, - ).fit() +raise NotImplementedError(msg) diff --git a/rllib/examples/nested_action_spaces.py b/rllib/examples/nested_action_spaces.py index 92818bd460e2e..fe637ba135cbc 100644 --- a/rllib/examples/nested_action_spaces.py +++ b/rllib/examples/nested_action_spaces.py @@ -1,114 +1,6 @@ -from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete -import os +msg = """ +This script has been moved to +`ray.rllib.examples.connectors.connector_v2_nested_action_spaces.py` +""" -from ray.tune.registry import register_env -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner -from ray.rllib.examples.env.multi_agent import MultiAgentNestedSpaceRepeatAfterMeEnv -from ray.rllib.examples.env.nested_space_repeat_after_me_env import ( - NestedSpaceRepeatAfterMeEnv, -) -from ray.rllib.utils.test_utils import ( - add_rllib_example_script_args, - run_rllib_example_script_experiment, -) -from ray.tune.registry import get_trainable_cls - - -# Read in common example script command line arguments. -parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0) - - -if __name__ == "__main__": - args = parser.parse_args() - - # Define env-to-module-connector pipeline for the new stack. - def _env_to_module_pipeline(env): - return [ - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=args.num_agents > 0), - WriteObservationsToEpisodes(), - ] - - # Register our environment with tune. - if args.num_agents > 0: - register_env( - "env", - lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv( - config=dict(c, **{"num_agents": args.num_agents}) - ), - ) - else: - register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c)) - - # Define the AlgorithmConfig used. - config = ( - get_trainable_cls(args.algo) - .get_default_config() - # Use new API stack for PPO only. - .experimental(_enable_new_api_stack=args.enable_new_api_stack) - .environment( - "env", - env_config={ - "space": Dict( - { - "a": Tuple( - [Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})] - ), - "b": Box(-10.0, 10.0, (2,)), - "c": MultiDiscrete([3, 3]), - "d": Discrete(2), - } - ), - "episode_len": 100, - }, - ) - .framework(args.framework) - .rollouts( - env_to_module_connector=_env_to_module_pipeline, - num_rollout_workers=args.num_env_runners, - # Setup the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. 
- env_runner_cls=( - None - if not args.enable_new_api_stack - else SingleAgentEnvRunner - if args.num_agents == 0 - else MultiAgentEnvRunner - ), - ) - # No history in Env (bandit problem). - .training( - gamma=0.0, - lr=0.0005, - model=( - {} if not args.enable_new_api_stack else {"uses_new_env_runners": True} - ), - ) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - # Add a simple multi-agent setup. - if args.num_agents > 0: - config.multi_agent( - policies={f"p{i}" for i in range(args.num_agents)}, - policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", - ) - - # Fix some PPO-specific settings. - if args.algo == "PPO": - config.training( - # We don't want high entropy in this Env. - entropy_coeff=0.00005, - num_sgd_iter=4, - vf_loss_coeff=0.01, - ) - - # Run everything as configured. - run_rllib_example_script_experiment(config, args) +raise NotImplementedError(msg) diff --git a/rllib/examples/parametric_actions_cartpole.py b/rllib/examples/parametric_actions_cartpole.py index 2c1439a3ff3b6..1442945e96d0d 100644 --- a/rllib/examples/parametric_actions_cartpole.py +++ b/rllib/examples/parametric_actions_cartpole.py @@ -1,110 +1,6 @@ -"""Example of handling variable length and/or parametric action spaces. - -This is a toy example of the action-embedding based approach for handling large -discrete action spaces (potentially infinite in size), similar to this: - - https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/ - -This currently works with RLlib's policy gradient style algorithms -(e.g., PG, PPO, IMPALA, A2C) and also DQN. - -Note that since the model outputs now include "-inf" tf.float32.min -values, not all algorithm options are supported at the moment. For example, -algorithms might crash if they don't properly ignore the -inf action scores. -Working configurations are given below. +msg = """ +This script has been moved to +`ray.rllib.examples._old_api_stack.parametric_actions_cartpole.py` """ -import argparse -import os - -import ray -from ray import air, tune -from ray.rllib.examples.env.parametric_actions_cartpole import ParametricActionsCartPole -from ray.rllib.examples.models.parametric_actions_model import ( - ParametricActionsModel, - TorchParametricActionsModel, -) -from ray.rllib.models import ModelCatalog -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import register_env - -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=200, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=150.0, help="Reward at which we stop training." 
-) - -if __name__ == "__main__": - args = parser.parse_args() - ray.init() - - register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10)) - ModelCatalog.register_custom_model( - "pa_model", - TorchParametricActionsModel - if args.framework == "torch" - else ParametricActionsModel, - ) - - if args.run == "DQN": - cfg = { - # TODO(ekl) we need to set these to prevent the masked values - # from being further processed in DistributionalQModel, which - # would mess up the masking. It is possible to support these if we - # defined a custom DistributionalQModel that is aware of masking. - "hiddens": [], - "dueling": False, - } - else: - cfg = {} - - config = dict( - { - "env": "pa_cartpole", - "model": { - "custom_model": "pa_model", - }, - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), - "num_workers": 0, - "framework": args.framework, - }, - **cfg - ) - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - results = tune.Tuner( - args.run, - run_config=air.RunConfig(stop=stop, verbose=1), - param_space=config, - ).fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/examples/parametric_actions_cartpole_embeddings_learnt_by_model.py b/rllib/examples/parametric_actions_cartpole_embeddings_learnt_by_model.py index a2f22791813e1..d2897a0ee61c9 100644 --- a/rllib/examples/parametric_actions_cartpole_embeddings_learnt_by_model.py +++ b/rllib/examples/parametric_actions_cartpole_embeddings_learnt_by_model.py @@ -1,98 +1,7 @@ -"""Example of handling variable length and/or parametric action spaces. - -This is a toy example of the action-embedding based approach for handling large -discrete action spaces (potentially infinite in size), similar to this: - - https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/ - -This currently works with RLlib's policy gradient style algorithms -(e.g., PG, PPO, IMPALA, A2C) and also DQN. - -Note that since the model outputs now include "-inf" tf.float32.min -values, not all algorithm options are supported at the moment. For example, -algorithms might crash if they don't properly ignore the -inf action scores. -Working configurations are given below. +msg = """ +This script has been moved to +`ray.rllib.examples._old_api_stack. 
+parametric_actions_cartpole_embeddings_learnt_by_model.py` """ -import argparse -import os - -import ray -from ray import air, tune -from ray.rllib.examples.env.parametric_actions_cartpole import ( - ParametricActionsCartPoleNoEmbeddings, -) -from ray.rllib.examples.models.parametric_actions_model import ( - ParametricActionsModelThatLearnsEmbeddings, -) -from ray.rllib.models import ModelCatalog -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import register_env - -parser = argparse.ArgumentParser() -parser.add_argument("--run", type=str, default="PPO") -parser.add_argument( - "--framework", - choices=["tf", "tf2"], - default="tf", - help="The DL framework specifier (torch not supported yet " - "due to lack of model).", -) -parser.add_argument("--as-test", action="store_true") -parser.add_argument("--stop-iters", type=int, default=200) -parser.add_argument("--stop-reward", type=float, default=150.0) -parser.add_argument("--stop-timesteps", type=int, default=100000) - -if __name__ == "__main__": - args = parser.parse_args() - ray.init() - - register_env("pa_cartpole", lambda _: ParametricActionsCartPoleNoEmbeddings(10)) - - ModelCatalog.register_custom_model( - "pa_model", ParametricActionsModelThatLearnsEmbeddings - ) - - if args.run == "DQN": - cfg = { - # TODO(ekl) we need to set these to prevent the masked values - # from being further processed in DistributionalQModel, which - # would mess up the masking. It is possible to support these if we - # defined a custom DistributionalQModel that is aware of masking. - "hiddens": [], - "dueling": False, - } - else: - cfg = {} - - config = dict( - { - "env": "pa_cartpole", - "model": { - "custom_model": "pa_model", - }, - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), - "num_workers": 0, - "framework": args.framework, - "action_mask_key": "valid_avail_actions_mask", - }, - **cfg - ) - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - results = tune.Tuner( - args.run, - run_config=air.RunConfig(stop=stop, verbose=2), - param_space=config, - ).fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/examples/remote_base_env_with_custom_api.py b/rllib/examples/remote_base_env_with_custom_api.py index 59c29df1d681c..e40ccc444a041 100644 --- a/rllib/examples/remote_base_env_with_custom_api.py +++ b/rllib/examples/remote_base_env_with_custom_api.py @@ -1,147 +1,6 @@ +msg = """ +This script has been moved to +`ray.rllib.examples._old_api_stack.remote_base_env_with_custom_api.py` """ -This script demonstrates how one can specify custom env APIs in -combination with RLlib's `remote_worker_envs` setting, which -parallelizes individual sub-envs within a vector env by making each -one a ray Actor. -You can access your Env's API via a custom callback as shown below. -""" -import argparse -import gymnasium as gym -import os - -import ray -from ray import air, tune -from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.env.apis.task_settable_env import TaskSettableEnv -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import get_trainable_cls - -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
-) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument("--num-workers", type=int, default=1) - -# This should be >1, otherwise, remote envs make no sense. -parser.add_argument("--num-envs-per-worker", type=int, default=4) - -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=50, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=180.0, help="Reward at which we stop training." -) -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", -) - - -class NonVectorizedEnvToBeVectorizedIntoRemoteBaseEnv(TaskSettableEnv): - """Class for a single sub-env to be vectorized into RemoteBaseEnv. - - If you specify this class directly under the "env" config key, RLlib - will auto-wrap - - Note that you may implement your own custom APIs. Here, we demonstrate - using RLlib's TaskSettableEnv API (which is a simple sub-class - of gym.Env). - """ - - def __init__(self, config=None): - super().__init__() - self.action_space = gym.spaces.Box(0, 1, shape=(1,)) - self.observation_space = gym.spaces.Box(0, 1, shape=(2,)) - self.task = 1 - - def reset(self, *, seed=None, options=None): - self.steps = 0 - return self.observation_space.sample(), {} - - def step(self, action): - self.steps += 1 - done = truncated = self.steps > 10 - return self.observation_space.sample(), 0, done, truncated, {} - - def set_task(self, task) -> None: - """We can set the task of each sub-env (ray actor)""" - print("Task set to {}".format(task)) - self.task = task - - -class TaskSettingCallback(DefaultCallbacks): - """Custom callback to verify, we can set the task on each remote sub-env.""" - - def on_train_result(self, *, algorithm, result: dict, **kwargs) -> None: - """Curriculum learning as seen in Ray docs""" - if result["episode_reward_mean"] > 0.0: - phase = 0 - else: - phase = 1 - - # Sub-envs are now ray.actor.ActorHandles, so we have to add - # `remote()` here. - algorithm.workers.foreach_env(lambda env: env.set_task.remote(phase)) - - -if __name__ == "__main__": - args = parser.parse_args() - ray.init(num_cpus=6, local_mode=args.local_mode) - - config = ( - get_trainable_cls(args.run) - .get_default_config() - # Specify your custom (single, non-vectorized) env directly as a - # class. This way, RLlib can auto-create Actors from this class - # and handle everything correctly. - .environment(NonVectorizedEnvToBeVectorizedIntoRemoteBaseEnv) - .framework(args.framework) - # Set up our own callbacks. - .callbacks(TaskSettingCallback) - .rollouts( - # Force sub-envs to be ray.actor.ActorHandles, so we can step - # through them in parallel. - remote_worker_envs=True, - # How many RolloutWorkers (each with n environment copies: - # `num_envs_per_worker`)? - num_rollout_workers=args.num_workers, - # This setting should not really matter as it does not affect the - # number of GPUs reserved for each worker. - num_envs_per_worker=args.num_envs_per_worker, - ) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - results = tune.Tuner( - args.run, - param_space=config, - run_config=air.RunConfig(stop=stop, verbose=1), - ).fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/examples/remote_envs_with_inference_done_on_main_node.py b/rllib/examples/remote_envs_with_inference_done_on_main_node.py index 7dbfd888b77e8..fca96a7e461c3 100644 --- a/rllib/examples/remote_envs_with_inference_done_on_main_node.py +++ b/rllib/examples/remote_envs_with_inference_done_on_main_node.py @@ -1,179 +1,6 @@ +msg = """ +This script has been moved to +`ray.rllib.examples._old_api_stack.remote_base_envs_with_inference_done_on_main_node.py` """ -This script demonstrates how one can specify n (vectorized) envs -as ray remote (actors), such that stepping through these occurs in parallel. -Also, actions for each env step will be calculated on the "main" node. -This can be useful if the "main" node is a GPU machine and we would like to -speed up batched action calculations, similar to DeepMind's SEED -architecture, described here: - -https://ai.googleblog.com/2020/03/massively-scaling-reinforcement.html -""" -import argparse -import os -from typing import Union - -import ray -from ray import air, tune -from ray.rllib.algorithms.ppo import PPO, PPOConfig -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.utils.annotations import override -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.rllib.utils.typing import PartialAlgorithmConfigDict -from ray.tune import PlacementGroupFactory -from ray.tune.logger import pretty_print - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - - # example-specific args - # This should be >1, otherwise, remote envs make no sense. - parser.add_argument("--num-envs-per-worker", type=int, default=4) - - # general args - parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", - ) - parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", - ) - parser.add_argument( - "--stop-iters", type=int, default=50, help="Number of iterations to train." - ) - parser.add_argument( - "--stop-timesteps", - type=int, - default=100000, - help="Number of timesteps to train.", - ) - parser.add_argument( - "--stop-reward", - type=float, - default=150.0, - help="Reward at which we stop training.", - ) - parser.add_argument( - "--no-tune", - action="store_true", - help="Run without Tune using a manual train loop instead. Here," - "there is no TensorBoard support.", - ) - parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", - ) - - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -# The modified Algorithm class we will use: -# Subclassing from PPO, our algo will only modity `default_resource_request`, -# telling Ray Tune that it's ok (not mandatory) to place our n remote envs on a -# different node (each env using 1 CPU). 
-class PPORemoteInference(PPO): - @classmethod - @override(Algorithm) - def default_resource_request( - cls, - config: Union[AlgorithmConfig, PartialAlgorithmConfigDict], - ): - if isinstance(config, AlgorithmConfig): - cf = config - else: - cf = cls.get_default_config().update_from_dict(config) - - # Return PlacementGroupFactory containing all needed resources - # (already properly defined as device bundles). - return PlacementGroupFactory( - bundles=[ - { - # Single CPU for the local worker. This CPU will host the - # main model in this example (num_workers=0). - "CPU": 1, - # Possibly add n GPUs to this. - "GPU": cf.num_gpus, - }, - { - # Different bundle (meaning: possibly different node) - # for your n "remote" envs (set remote_worker_envs=True). - "CPU": cf.num_envs_per_worker, - }, - ], - strategy=cf.placement_strategy, - ) - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init(num_cpus=6, local_mode=args.local_mode) - - config = ( - PPOConfig() - .environment("CartPole-v1") - .framework(args.framework) - .rollouts( - # Force sub-envs to be ray.actor.ActorHandles, so we can step - # through them in parallel. - remote_worker_envs=True, - num_envs_per_worker=args.num_envs_per_worker, - # Use a single worker (however, with n parallelized remote envs, maybe - # even running on another node). - # Action computations will occur on the "main" (GPU?) node, while - # the envs run on one or more CPU node(s). - num_rollout_workers=0, - ) - .resources( - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")), - # Set the number of CPUs used by the (local) worker, aka "driver" - # to match the number of ray remote envs. - num_cpus_for_local_worker=args.num_envs_per_worker + 1, - ) - ) - - # Run as manual training loop. - if args.no_tune: - # manual training loop using PPO and manually keeping track of state - algo = PPORemoteInference(config=config) - # run manual training loop and print results after each iteration - for _ in range(args.stop_iters): - result = algo.train() - print(pretty_print(result)) - # Stop training if the target train steps or reward are reached. - if ( - result["timesteps_total"] >= args.stop_timesteps - or result["episode_reward_mean"] >= args.stop_reward - ): - break - - # Run with Tune for auto env and algorithm creation and TensorBoard. 
- else: - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - results = tune.Tuner( - PPORemoteInference, - param_space=config, - run_config=air.RunConfig(stop=stop, verbose=1), - ).fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/examples/rl_module/classes/__init__.py b/rllib/examples/rl_module/classes/__init__.py new file mode 100644 index 0000000000000..45f7c0572979b --- /dev/null +++ b/rllib/examples/rl_module/classes/__init__.py @@ -0,0 +1,10 @@ +from ray.rllib.examples.rl_module.classes.rock_paper_scissors_heuristic_rlm import ( + AlwaysSameHeuristicRLM, + BeatLastHeuristicRLM, +) + + +__all__ = [ + "AlwaysSameHeuristicRLM", + "BeatLastHeuristicRLM", +] diff --git a/rllib/examples/rl_module/classes/rock_paper_scissors_heuristic_rlm.py b/rllib/examples/rl_module/classes/rock_paper_scissors_heuristic_rlm.py new file mode 100644 index 0000000000000..f4b3d661f4de3 --- /dev/null +++ b/rllib/examples/rl_module/classes/rock_paper_scissors_heuristic_rlm.py @@ -0,0 +1,108 @@ +from collections import defaultdict + +import numpy as np + +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override + + +class AlwaysSameHeuristicRLM(RLModule): + """In rock-paper-scissors, always chooses the same action within an episode. + + The first move is random, all the following moves are the same as the first one. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._actions_per_vector_idx = defaultdict(int) + + @override(RLModule) + def _forward_inference(self, batch, **kwargs): + ret = [] + # Note that the obs is the previous move of the opponens (0-2). If it's 3, it + # means that there was no previous move and thus, the episode just started. + for i, obs in enumerate(batch[Columns.OBS]): + if obs == 3: + self._actions_per_vector_idx[i] = np.random.choice([0, 1, 2]) + ret.append(self._actions_per_vector_idx[i]) + return {Columns.ACTIONS: np.array(ret)} + + @override(RLModule) + def _forward_exploration(self, batch, **kwargs): + return self._forward_inference(batch, **kwargs) + + @override(RLModule) + def _forward_train(self, batch, **kwargs): + raise NotImplementedError( + "AlwaysSameHeuristicRLM is not trainable! Make sure you do NOT include it " + "in your `config.multi_agent(policies_to_train={...})` set." + ) + + @override(RLModule) + def output_specs_inference(self): + return [Columns.ACTIONS] + + @override(RLModule) + def output_specs_exploration(self): + return [Columns.ACTIONS] + + +class BeatLastHeuristicRLM(RLModule): + """In rock-paper-scissors, always acts such that it beats prev. move of opponent. + + The first move is random. + + For example, after opponent played `rock` (and this policy made a random + move), the next move would be `paper`(to beat `rock`). + """ + + @override(RLModule) + def _forward_inference(self, batch, **kwargs): + """Returns the exact action that would beat the previous action of the opponent. + + The opponent's previous action is the current observation for this agent. + + Both action- and observation spaces are discrete. There are 3 actions available. + (0-2) and 4 observations (0-2 plus 3, where 3 is the observation after the env + reset, when no action has been taken yet). 
Thereby: + 0=Rock + 1=Paper + 2=Scissors + 3=[after reset] (observation space only) + """ + return { + Columns.ACTIONS: np.array( + [self._pick_single_action(obs) for obs in batch[Columns.OBS]] + ), + } + + @override(RLModule) + def _forward_exploration(self, batch, **kwargs): + return self._forward_inference(batch, **kwargs) + + @override(RLModule) + def _forward_train(self, batch, **kwargs): + raise NotImplementedError( + "BeatLastHeuristicRLM is not trainable! Make sure you do NOT include it in " + "your `config.multi_agent(policies_to_train={...})` set." + ) + + @override(RLModule) + def output_specs_inference(self): + return [Columns.ACTIONS] + + @override(RLModule) + def output_specs_exploration(self): + return [Columns.ACTIONS] + + @staticmethod + def _pick_single_action(prev_opponent_obs): + if prev_opponent_obs == 0: + return 1 + elif prev_opponent_obs == 1: + return 2 + elif prev_opponent_obs == 2: + return 0 + else: + return np.random.choice([0, 1, 2]) diff --git a/rllib/examples/rock_paper_scissors_multiagent.py b/rllib/examples/rock_paper_scissors_multiagent.py index 5688d62b7f235..63b0d5c395249 100644 --- a/rllib/examples/rock_paper_scissors_multiagent.py +++ b/rllib/examples/rock_paper_scissors_multiagent.py @@ -1,218 +1,6 @@ -"""A simple multi-agent env with two agents playing rock paper scissors. - -This demonstrates running the following policies in competition: - (1) heuristic policy of repeating the same move - (2) heuristic policy of beating the last opponent move - (3) LSTM/feedforward PPO policies - (4) LSTM policy with custom entropy loss +msg = """ +This script has been moved to +`ray.rllib.examples.multi_agent_and_self_play.rock_paper_scissors.py` """ -import argparse -import os -from pettingzoo.classic import rps_v2 -import random - -import ray -from ray import air, tune -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.algorithms.ppo import ( - PPO, - PPOConfig, - PPOTF1Policy, - PPOTF2Policy, - PPOTorchPolicy, -) -from ray.rllib.env import PettingZooEnv -from ray.rllib.examples.policy.rock_paper_scissors_dummies import ( - BeatLastHeuristic, - AlwaysSameHeuristic, -) -from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import register_env - -tf1, tf, tfv = try_import_tf() -torch, _ = try_import_torch() - -parser = argparse.ArgumentParser() -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=150, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." 
-) -parser.add_argument( - "--stop-reward", - type=float, - default=1000.0, - help="Reward at which we stop training.", -) - - -def env_creator(args): - env = rps_v2.env() - return env - - -register_env("RockPaperScissors", lambda config: PettingZooEnv(env_creator(config))) - - -def run_same_policy(args, stop): - """Use the same policy for both agents (trivial case).""" - config = PPOConfig().environment("RockPaperScissors").framework(args.framework) - - results = tune.Tuner( - "PPO", param_space=config, run_config=air.RunConfig(stop=stop, verbose=1) - ).fit() - - if args.as_test: - # Check vs 0.0 as we are playing a zero-sum game. - check_learning_achieved(results, 0.0) - - -def run_heuristic_vs_learned(args, use_lstm=False, algorithm_config=None): - """Run heuristic policies vs a learned agent. - - The learned agent should eventually reach a reward of ~5 with - use_lstm=False, and ~7 with use_lstm=True. The reason the LSTM policy - can perform better is since it can distinguish between the always_same vs - beat_last heuristics. - """ - - def select_policy(agent_id, episode, **kwargs): - if agent_id == "player_0": - return "learned" - else: - return random.choice(["always_same", "beat_last"]) - - config = ( - (algorithm_config or PPOConfig()) - .environment("RockPaperScissors") - .framework(args.framework) - .rollouts( - num_rollout_workers=0, - num_envs_per_worker=4, - ) - .multi_agent( - policies={ - "always_same": PolicySpec(policy_class=AlwaysSameHeuristic), - "beat_last": PolicySpec(policy_class=BeatLastHeuristic), - "learned": PolicySpec( - config=AlgorithmConfig.overrides( - model={"use_lstm": use_lstm}, - framework_str=args.framework, - ) - ), - }, - policy_mapping_fn=select_policy, - policies_to_train=["learned"], - ) - .reporting(metrics_num_episodes_for_smoothing=200) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - algo = config.build() - - reward_diff = 0 - for _ in range(args.stop_iters): - results = algo.train() - # Timesteps reached. - if "policy_always_same_reward" not in results["hist_stats"]: - reward_diff = 0 - continue - reward_diff = sum(results["hist_stats"]["policy_learned_reward"]) - print(f"delta_r={reward_diff}") - if results["timesteps_total"] > args.stop_timesteps: - break - # Reward (difference) reached -> all good, return. - elif reward_diff > args.stop_reward: - return - - # Reward (difference) not reached: Error if `as_test`. - if args.as_test: - raise ValueError( - "Desired reward difference ({}) not reached! Only got to {}.".format( - args.stop_reward, reward_diff - ) - ) - - -def run_with_custom_entropy_loss(args, stop): - """Example of customizing the loss function of an existing policy. - - This performs about the same as the default loss does.""" - - policy_cls = { - "torch": PPOTorchPolicy, - "tf": PPOTF1Policy, - "tf2": PPOTF2Policy, - }[args.framework] - - class EntropyPolicy(policy_cls): - def loss_fn(policy, model, dist_class, train_batch): - logits, _ = model(train_batch) - action_dist = dist_class(logits, model) - if args.framework == "torch": - # Required by PPOTorchPolicy's stats fn. 
- model.tower_stats["policy_loss"] = torch.tensor([0.0]) - policy.policy_loss = torch.mean( - -0.1 * action_dist.entropy() - - ( - action_dist.logp(train_batch["actions"]) - * train_batch["advantages"] - ) - ) - else: - policy.policy_loss = -0.1 * action_dist.entropy() - tf.reduce_mean( - action_dist.logp(train_batch["actions"]) * train_batch["advantages"] - ) - return policy.policy_loss - - class EntropyLossPPO(PPO): - @classmethod - def get_default_policy_class(cls, config): - return EntropyPolicy - - run_heuristic_vs_learned( - args, - use_lstm=True, - algorithm_config=PPOConfig(algo_class=EntropyLossPPO), - ) - - -if __name__ == "__main__": - args = parser.parse_args() - - ray.init() - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - run_same_policy(args, stop=stop) - print("run_same_policy: ok.") - - run_heuristic_vs_learned(args, use_lstm=False) - print("run_heuristic_vs_learned(w/o lstm): ok.") - - run_heuristic_vs_learned(args, use_lstm=True) - print("run_heuristic_vs_learned (w/ lstm): ok.") - - run_with_custom_entropy_loss(args, stop=stop) - print("run_with_custom_entropy_loss: ok.") +raise NotImplementedError(msg) diff --git a/rllib/examples/sb2rllib_rllib_example.py b/rllib/examples/sb2rllib_rllib_example.py index 7e308ceca30fc..0edf3f0b2c686 100644 --- a/rllib/examples/sb2rllib_rllib_example.py +++ b/rllib/examples/sb2rllib_rllib_example.py @@ -1,50 +1,6 @@ +msg = """ +This script has been moved to +`ray.rllib.examples._old_api_stack.sb2rllib_rllib_example.py` """ -Example script on how to train, save, load, and test an RLlib agent. -Equivalent script with stable baselines: sb2rllib_sb_example.py. -Demonstrates transition from stable_baselines to Ray RLlib. 
-Run example: python sb2rllib_rllib_example.py -""" -import gymnasium as gym -from ray import tune, air -import ray.rllib.algorithms.ppo as ppo - -# settings used for both stable baselines and rllib -env_name = "CartPole-v1" -train_steps = 10000 -learning_rate = 1e-3 -save_dir = "saved_models" - -# training and saving -analysis = tune.Tuner( - "PPO", - run_config=air.RunConfig( - stop={"timesteps_total": train_steps}, - local_dir=save_dir, - checkpoint_config=air.CheckpointConfig( - checkpoint_at_end=True, - ), - ), - param_space={"env": env_name, "lr": learning_rate}, -).fit() -# retrieve the checkpoint path -analysis.default_metric = "episode_reward_mean" -analysis.default_mode = "max" -checkpoint_path = analysis.get_best_checkpoint(trial=analysis.get_best_trial()) -print(f"Trained model saved at {checkpoint_path}") - -# load and restore model -agent = ppo.PPO(env=env_name) -agent.restore(checkpoint_path) -print(f"Agent loaded from saved model at {checkpoint_path}") - -# inference -env = gym.make(env_name) -obs, info = env.reset() -for i in range(1000): - action = agent.compute_single_action(obs) - obs, reward, terminated, truncated, info = env.step(action) - env.render() - if terminated or truncated: - print(f"Cart pole ended after {i} steps.") - break +raise NotImplementedError(msg) diff --git a/rllib/examples/sb2rllib_sb_example.py b/rllib/examples/sb2rllib_sb_example.py index 63dece13e272b..425d31aeb179e 100644 --- a/rllib/examples/sb2rllib_sb_example.py +++ b/rllib/examples/sb2rllib_sb_example.py @@ -1,40 +1,6 @@ +msg = """ +This script has been moved to +`ray.rllib.examples._old_api_stack.sb2rllib_sb_example.py` """ -Example script on how to train, save, load, and test a stable baselines 2 agent -Code taken and adjusted from SB2 docs: -https://stable-baselines.readthedocs.io/en/master/guide/quickstart.html -Equivalent script with RLlib: sb2rllib_rllib_example.py -""" -import gymnasium as gym - -from stable_baselines.common.policies import MlpPolicy -from stable_baselines import PPO2 - -# settings used for both stable baselines and rllib -env_name = "CartPole-v1" -train_steps = 10000 -learning_rate = 1e-3 -save_dir = "saved_models" - -save_path = f"{save_dir}/sb_model_{train_steps}steps" -env = gym.make(env_name) - -# training and saving -model = PPO2(MlpPolicy, env, learning_rate=learning_rate, verbose=1) -model.learn(total_timesteps=train_steps) -model.save(save_path) -print(f"Trained model saved at {save_path}") - -# delete and load model (just for illustration) -del model -model = PPO2.load(save_path) -print(f"Agent loaded from saved model at {save_path}") -# inference -obs, info = env.reset() -for i in range(1000): - action, _states = model.predict(obs) - obs, reward, terminated, truncated, info = env.step(action) - env.render() - if terminated or truncated: - print(f"Cart pole ended after {i} steps.") - break +raise NotImplementedError(msg) diff --git a/rllib/examples/self_play_league_based_with_open_spiel.py b/rllib/examples/self_play_league_based_with_open_spiel.py index 87d019b7463f3..5d3326083550b 100644 --- a/rllib/examples/self_play_league_based_with_open_spiel.py +++ b/rllib/examples/self_play_league_based_with_open_spiel.py @@ -1,288 +1,6 @@ -"""Example showing how one can implement a league-based training workflow. 
- -Uses the open spiel adapter of RLlib with the "markov_soccer" game and -a simplified multi-agent, league-based setup: -https://deepmind.com/blog/article/AlphaStar-Grandmaster-level-in- \ -StarCraft-II-using-multi-agent-reinforcement-learning - -Our league consists of three groups of policies: -- main policies: The current main policy plus prior versions of it. -- main exploiters: Trained by playing only against different "main policies". -- league exploiters: Trained by playing against any policy in the league. - -We start with 1 policy from each group, setting all 3 of these to an initial -PPO policy and allowing all 3 policies to be trained. -After each train update - via our custom callback - we decide for each -trainable policy, whether to make a copy and freeze it. Frozen policies -will not be altered anymore. However, they remain in the league for -future matches against trainable policies. -Matchmaking happens via a policy_mapping_fn, which needs to be altered -after every change (addition) to the league. The mapping function -randomly maps agents in a way, such that: -- Frozen main exploiters play against the one (currently trainable) main - policy. -- Trainable main exploiters play against any main policy (including already - frozen main policies). -- Frozen league exploiters play against any trainable policy in the league. -- Trainable league exploiters play against any policy in the league. - -After training for n iterations, a configurable number of episodes can -be played by the user against the "main" agent on the command line. +msg = """ +This script has been moved to +`ray.rllib.examples.multi_agent_and_self_play.self_play_league_based_with_open_spiel.py` """ -import functools - -import numpy as np - -import ray -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec -from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner -from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel -from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv -from ray.rllib.examples.multi_agent_and_self_play import ( - SelfPlayLeagueBasedCallback, - SelfPlayLeagueBasedCallbackOldAPIStack, -) -from ray.rllib.examples.policy.random_policy import RandomPolicy -from ray.rllib.examples.rl_module.random_rl_module import RandomRLModule -from ray.rllib.examples.self_play_with_open_spiel import ask_user_for_action -from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.test_utils import ( - add_rllib_example_script_args, - run_rllib_example_script_experiment, -) -from ray.tune.registry import get_trainable_cls, register_env - -open_spiel = try_import_open_spiel(error=True) -pyspiel = try_import_pyspiel(error=True) - -# Import after try_import_open_spiel, so we can error out with hints -from open_spiel.python.rl_environment import Environment # noqa: E402 - - -parser = add_rllib_example_script_args(default_timesteps=2000000) -parser.add_argument( - "--env", - type=str, - default="markov_soccer", - choices=["markov_soccer", "connect_four"], -) -parser.add_argument( - "--win-rate-threshold", - type=float, - default=0.85, - help="Win-rate at which we setup another opponent by freezing the " - "current main policy and playing against a uniform distribution " - "of previously frozen 'main's from here on.", -) -parser.add_argument( - "--min-league-size", - type=float, - default=8, - help="Minimum number of policies/RLModules to consider the test passed. 
" - "The initial league size is 2: `main` and `random`. " - "`--min-league-size=3` thus means that one new policy/RLModule has been " - "added so far (b/c the `main` one has reached the `--win-rate-threshold " - "against the `random` Policy/RLModule).", -) -parser.add_argument( - "--num-episodes-human-play", - type=int, - default=0, - help="How many episodes to play against the user on the command " - "line after training has finished.", -) -parser.add_argument( - "--from-checkpoint", - type=str, - default=None, - help="Full path to a checkpoint file for restoring a previously saved " - "Algorithm state.", -) - - -if __name__ == "__main__": - args = parser.parse_args() - - register_env( - "open_spiel_env", - lambda _: OpenSpielEnv(pyspiel.load_game(args.env)), - ) - - def policy_mapping_fn(agent_id, episode, worker=None, **kwargs): - # At first, only have main play against the random main exploiter. - return "main" if episode.episode_id % 2 == agent_id else "main_exploiter_0" - - def agent_to_module_mapping_fn(agent_id, episode, **kwargs): - # At first, only have main play against the random main exploiter. - return "main" if hash(episode.id_) % 2 == agent_id else "main_exploiter_0" - - def _get_multi_agent(): - names = { - # Our main policy, we'd like to optimize. - "main", - # First frozen version of main (after we reach n% win-rate). - "main_0", - # Initial main exploiters (one random, one trainable). - "main_exploiter_0", - "main_exploiter_1", - # Initial league exploiters (one random, one trainable). - "league_exploiter_0", - "league_exploiter_1", - } - if args.enable_new_api_stack: - policies = names - spec = { - mid: SingleAgentRLModuleSpec( - module_class=( - RandomRLModule - if mid in ["main_exploiter_0", "league_exploiter_0"] - else None - ) - ) - for mid in names - } - else: - policies = { - mid: PolicySpec( - policy_class=( - RandomPolicy - if mid in ["main_exploiter_0", "league_exploiter_0"] - else None - ) - ) - for mid in names - } - spec = None - return {"policies": policies, "spec": spec} - - config = ( - get_trainable_cls(args.algo) - .get_default_config() - # Use new API stack ... - .experimental(_enable_new_api_stack=args.enable_new_api_stack) - .environment("open_spiel_env") - .framework(args.framework) - # Set up the main piece in this experiment: The league-bases self-play - # callback, which controls adding new policies/Modules to the league and - # properly matching the different policies in the league with each other. - .callbacks( - functools.partial( - SelfPlayLeagueBasedCallback - if args.enable_new_api_stack - else SelfPlayLeagueBasedCallbackOldAPIStack, - win_rate_threshold=args.win_rate_threshold, - ) - ) - .rollouts( - num_rollout_workers=args.num_env_runners, - num_envs_per_worker=1 if args.enable_new_api_stack else 5, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None if not args.enable_new_api_stack else MultiAgentEnvRunner - ), - ) - .resources( - num_learner_workers=args.num_gpus, - num_gpus_per_learner_worker=1 if args.num_gpus else 0, - num_cpus_for_local_worker=1, - ) - .training( - num_sgd_iter=20, - model=dict( - **({"uses_new_env_runners": True} if args.enable_new_api_stack else {}), - ), - ) - .multi_agent( - # Initial policy map: All PPO. This will be expanded - # to more policy snapshots. This is done in the - # custom callback defined above (`LeagueBasedSelfPlayCallback`). 
- policies=_get_multi_agent()["policies"], - policy_mapping_fn=( - agent_to_module_mapping_fn - if args.enable_new_api_stack - else policy_mapping_fn - ), - # At first, only train main_0 (until good enough to win against - # random). - policies_to_train=["main"], - ) - .rl_module( - rl_module_spec=MultiAgentRLModuleSpec( - module_specs=_get_multi_agent()["spec"] - ), - ) - ) - - # Run everything as configured. - # Train the "main" policy to play really well using self-play. - results = None - if not args.from_checkpoint: - stop = { - "timesteps_total": args.stop_timesteps, - "training_iteration": args.stop_iters, - "league_size": args.min_league_size, - } - results = run_rllib_example_script_experiment(config, args, stop) - - # Restore trained Algorithm (set to non-explore behavior) and play against - # human on command line. - if args.num_episodes_human_play > 0: - num_episodes = 0 - # Switch off exploration for better inference performance. - config.explore = False - algo = config.build() - if args.from_checkpoint: - algo.restore(args.from_checkpoint) - else: - checkpoint = results.get_best_result().checkpoint - if not checkpoint: - raise ValueError("No last checkpoint found in results!") - algo.restore(checkpoint) - - # Play from the command line against the trained agent - # in an actual (non-RLlib-wrapped) open-spiel env. - human_player = 1 - env = Environment(args.env) - - while num_episodes < args.num_episodes_human_play: - print("You play as {}".format("o" if human_player else "x")) - time_step = env.reset() - while not time_step.last(): - player_id = time_step.observations["current_player"] - if player_id == human_player: - action = ask_user_for_action(time_step) - else: - obs = np.array(time_step.observations["info_state"][player_id]) - if config.uses_new_env_runners: - action = algo.workers.local_worker().module.forward_inference( - {"obs": obs} - ) - else: - action = algo.compute_single_action(obs, policy_id="main") - # In case computer chooses an invalid action, pick a - # random one. - legal = time_step.observations["legal_actions"][player_id] - if action not in legal: - action = np.random.choice(legal) - time_step = env.step([action]) - print(f"\n{env.get_state}") - - print(f"\n{env.get_state}") - - print("End of game!") - if time_step.rewards[human_player] > 0: - print("You win") - elif time_step.rewards[human_player] < 0: - print("You lose") - else: - print("Draw") - # Switch order of players - human_player = 1 - human_player - - num_episodes += 1 - - algo.stop() - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/examples/self_play_with_open_spiel.py b/rllib/examples/self_play_with_open_spiel.py index 9d0c4844cb609..0f060c8fcbf42 100644 --- a/rllib/examples/self_play_with_open_spiel.py +++ b/rllib/examples/self_play_with_open_spiel.py @@ -1,241 +1,6 @@ -"""Example showing how one can implement a simple self-play training workflow. - -Uses the open spiel adapter of RLlib with the "connect_four" game and -a multi-agent setup with a "main" policy and n "main_v[x]" policies -(x=version number), which are all at-some-point-frozen copies of -"main". At the very beginning, "main" plays against RandomPolicy. - -Checks for the training progress after each training update via a custom -callback. We simply measure the win rate of "main" vs the opponent -("main_v[x]" or RandomPolicy at the beginning) by looking through the -achieved rewards in the episodes in the train batch. 
If this win rate -reaches some configurable threshold, we add a new policy to -the policy map (a frozen copy of the current "main" one) and change the -policy_mapping_fn to make new matches of "main" vs any of the previous -versions of "main" (including the just added one). - -After training for n iterations, a configurable number of episodes can -be played by the user against the "main" agent on the command line. +msg = """ +This script has been moved to +`ray.rllib.examples.multi_agent_and_self_play.self_play_with_open_spiel.py` """ -import functools - -import numpy as np - -from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner -from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel -from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv -from ray.rllib.examples.rl_module.random_rl_module import RandomRLModule -from ray.rllib.examples.multi_agent_and_self_play import ( - ask_user_for_action, - SelfPlayCallback, - SelfPlayCallbackOldAPIStack, -) -from ray.rllib.examples.policy.random_policy import RandomPolicy -from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.test_utils import ( - add_rllib_example_script_args, - run_rllib_example_script_experiment, -) -from ray.tune.registry import get_trainable_cls, register_env - -open_spiel = try_import_open_spiel(error=True) -pyspiel = try_import_pyspiel(error=True) - -# Import after try_import_open_spiel, so we can error out with hints -from open_spiel.python.rl_environment import Environment # noqa: E402 - - -parser = add_rllib_example_script_args(default_timesteps=2000000) -parser.add_argument( - "--env", - type=str, - default="connect_four", - choices=["markov_soccer", "connect_four"], -) -parser.add_argument( - "--win-rate-threshold", - type=float, - default=0.95, - help="Win-rate at which we setup another opponent by freezing the " - "current main policy and playing against a uniform distribution " - "of previously frozen 'main's from here on.", -) -parser.add_argument( - "--min-league-size", - type=float, - default=3, - help="Minimum number of policies/RLModules to consider the test passed. " - "The initial league size is 2: `main` and `random`. " - "`--min-league-size=3` thus means that one new policy/RLModule has been " - "added so far (b/c the `main` one has reached the `--win-rate-threshold " - "against the `random` Policy/RLModule).", -) -parser.add_argument( - "--num-episodes-human-play", - type=int, - default=10, - help="How many episodes to play against the user on the command " - "line after training has finished.", -) -parser.add_argument( - "--from-checkpoint", - type=str, - default=None, - help="Full path to a checkpoint file for restoring a previously saved " - "Algorithm state.", -) - - -if __name__ == "__main__": - args = parser.parse_args() - - register_env("open_spiel_env", lambda _: OpenSpielEnv(pyspiel.load_game(args.env))) - - def agent_to_module_mapping_fn(agent_id, episode, **kwargs): - # agent_id = [0|1] -> module depends on episode ID - # This way, we make sure that both modules sometimes play agent0 - # (start player) and sometimes agent1 (player to move 2nd). 
- return "main" if hash(episode.id_) % 2 == agent_id else "random" - - def policy_mapping_fn(agent_id, episode, worker, **kwargs): - return "main" if episode.episode_id % 2 == agent_id else "random" - - config = ( - get_trainable_cls(args.algo) - .get_default_config() - .experimental(_enable_new_api_stack=args.enable_new_api_stack) - .environment("open_spiel_env") - .framework(args.framework) - # Set up the main piece in this experiment: The league-bases self-play - # callback, which controls adding new policies/Modules to the league and - # properly matching the different policies in the league with each other. - .callbacks( - functools.partial( - SelfPlayCallback - if args.enable_new_api_stack - else SelfPlayCallbackOldAPIStack, - win_rate_threshold=args.win_rate_threshold, - ) - ) - .rollouts( - num_rollout_workers=args.num_env_runners, - num_envs_per_worker=1 if args.enable_new_api_stack else 5, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None if not args.enable_new_api_stack else MultiAgentEnvRunner - ), - ) - .resources( - num_learner_workers=args.num_gpus, - num_gpus_per_learner_worker=1 if args.num_gpus else 0, - num_cpus_for_local_worker=1, - ) - .training(model={"fcnet_hiddens": [512, 512]}) - .multi_agent( - # Initial policy map: Random and default algo one. This will be expanded - # to more policy snapshots taken from "main" against which "main" - # will then play (instead of "random"). This is done in the - # custom callback defined above (`SelfPlayCallback`). - policies=( - { - # Our main policy, we'd like to optimize. - "main": PolicySpec(), - # An initial random opponent to play against. - "random": PolicySpec(policy_class=RandomPolicy), - } - if not args.enable_new_api_stack - else {"main", "random"} - ), - # Assign agent 0 and 1 randomly to the "main" policy or - # to the opponent ("random" at first). Make sure (via episode_id) - # that "main" always plays against "random" (and not against - # another "main"). - policy_mapping_fn=( - agent_to_module_mapping_fn - if args.enable_new_api_stack - else policy_mapping_fn - ), - # Always just train the "main" policy. - policies_to_train=["main"], - ) - .rl_module( - rl_module_spec=MultiAgentRLModuleSpec( - module_specs={ - "main": SingleAgentRLModuleSpec(), - "random": SingleAgentRLModuleSpec(module_class=RandomRLModule), - } - ), - ) - ) - - # Only for PPO, change the `num_sgd_iter` setting. - if args.algo == "PPO": - config.training(num_sgd_iter=20) - - stop = { - "timesteps_total": args.stop_timesteps, - "training_iteration": args.stop_iters, - "league_size": args.min_league_size, - } - - # Train the "main" policy to play really well using self-play. - results = None - if not args.from_checkpoint: - results = run_rllib_example_script_experiment(config, args, stop) - - # Restore trained Algorithm (set to non-explore behavior) and play against - # human on command line. - if args.num_episodes_human_play > 0: - num_episodes = 0 - config.explore = False - algo = config.build() - if args.from_checkpoint: - algo.restore(args.from_checkpoint) - else: - checkpoint = results.get_best_result().checkpoint - if not checkpoint: - raise ValueError("No last checkpoint found in results!") - algo.restore(checkpoint) - - # Play from the command line against the trained agent - # in an actual (non-RLlib-wrapped) open-spiel env. 
- human_player = 1 - env = Environment(args.env) - - while num_episodes < args.num_episodes_human_play: - print("You play as {}".format("o" if human_player else "x")) - time_step = env.reset() - while not time_step.last(): - player_id = time_step.observations["current_player"] - if player_id == human_player: - action = ask_user_for_action(time_step) - else: - obs = np.array(time_step.observations["info_state"][player_id]) - action = algo.compute_single_action(obs, policy_id="main") - # In case computer chooses an invalid action, pick a - # random one. - legal = time_step.observations["legal_actions"][player_id] - if action not in legal: - action = np.random.choice(legal) - time_step = env.step([action]) - print(f"\n{env.get_state}") - - print(f"\n{env.get_state}") - - print("End of game!") - if time_step.rewards[human_player] > 0: - print("You win") - elif time_step.rewards[human_player] < 0: - print("You lose") - else: - print("Draw") - # Switch order of players - human_player = 1 - human_player - - num_episodes += 1 - - algo.stop() +raise NotImplementedError(msg) diff --git a/rllib/examples/two_step_game.py b/rllib/examples/two_step_game.py index e783c5a241d5a..175d6ed23c67d 100644 --- a/rllib/examples/two_step_game.py +++ b/rllib/examples/two_step_game.py @@ -1,126 +1,6 @@ -"""The two-step game from QMIX: https://arxiv.org/pdf/1803.11485.pdf - -Configurations you can try: - - normal policy gradients (PG) - - MADDPG - - QMIX - -See also: centralized_critic.py for centralized critic PPO on this game. +msg = """ +This script has been moved to +`ray.rllib.examples.multi_agent_and_self_play.two_step_game.py` """ -import argparse -from gymnasium.spaces import Dict, Tuple, MultiDiscrete -import logging -import os - -import ray -from ray import air, tune -from ray.tune import register_env -from ray.rllib.env.multi_agent_env import ENV_STATE -from ray.rllib.examples.env.two_step_game import TwoStepGame -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import get_trainable_cls - -logger = logging.getLogger(__name__) - -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument("--num-cpus", type=int, default=0) -parser.add_argument( - "--mixer", - type=str, - default="qmix", - choices=["qmix", "vdn", "none"], - help="The mixer model to use.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=200, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=70000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=8.0, help="Reward at which we stop training." 
-) -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", -) - -if __name__ == "__main__": - args = parser.parse_args() - - ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) - - grouping = { - "group_1": [0, 1], - } - obs_space = Tuple( - [ - Dict( - { - "obs": MultiDiscrete([2, 2, 2, 3]), - ENV_STATE: MultiDiscrete([2, 2, 2]), - } - ), - Dict( - { - "obs": MultiDiscrete([2, 2, 2, 3]), - ENV_STATE: MultiDiscrete([2, 2, 2]), - } - ), - ] - ) - act_space = Tuple( - [ - TwoStepGame.action_space, - TwoStepGame.action_space, - ] - ) - register_env( - "grouped_twostep", - lambda config: TwoStepGame(config).with_agent_groups( - grouping, obs_space=obs_space, act_space=act_space - ), - ) - - config = ( - get_trainable_cls(args.run) - .get_default_config() - .environment(TwoStepGame) - .framework(args.framework) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - stop = { - "episode_reward_mean": args.stop_reward, - "timesteps_total": args.stop_timesteps, - "training_iteration": args.stop_iters, - } - - results = tune.Tuner( - args.run, - run_config=air.RunConfig(stop=stop, verbose=2), - param_space=config, - ).fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/examples/two_trainer_workflow.py b/rllib/examples/two_trainer_workflow.py index a3207fc9e071d..fb1a8c4dff1f5 100644 --- a/rllib/examples/two_trainer_workflow.py +++ b/rllib/examples/two_trainer_workflow.py @@ -1,219 +1,6 @@ -"""Example of using a custom training workflow. - -Here we create a number of CartPole agents, some of which are trained with -DQN, and some of which are trained with PPO. Both are executed concurrently -via a custom training workflow. 
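Editorial note: every example file relocated by this patch is reduced to the same two-statement stub (one instance is the added `msg = ...` / `raise NotImplementedError(msg)` pair just below). As a hedged illustration only (this snippet is not part of the patch), code that still imports one of the old module paths now fails at import time with a pointer to the new location:

# Hypothetical illustration of the relocation stubs' effect; not part of this patch.
# Importing a moved example via its old path raises NotImplementedError at import
# time, and the exception message names the new location (here, the
# `_old_api_stack` copy of the two-trainer workflow example).
try:
    import ray.rllib.examples.two_trainer_workflow  # noqa: F401  (old path, now a stub)
except NotImplementedError as err:
    print(err)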
+msg = """ +This script has been moved to +`ray.rllib.examples._old_api_stack.two_trainer_workflow.py` """ -import argparse -import os - -import ray -from ray import air, tune -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.algorithms.dqn.dqn import DQNConfig -from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy -from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy -from ray.rllib.algorithms.ppo.ppo import PPOConfig -from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy -from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy -from ray.rllib.evaluation.postprocessing import Postprocessing -from ray.rllib.execution.rollout_ops import synchronous_parallel_sample -from ray.rllib.execution.train_ops import train_one_step -from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ( - MultiAgentReplayBuffer, -) -from ray.rllib.examples.env.multi_agent import MultiAgentCartPole -from ray.rllib.policy.sample_batch import MultiAgentBatch, concat_samples -from ray.rllib.utils.annotations import override -from ray.rllib.utils.metrics import ( - NUM_AGENT_STEPS_SAMPLED, - NUM_ENV_STEPS_SAMPLED, - NUM_TARGET_UPDATES, - LAST_TARGET_UPDATE_TS, -) -from ray.rllib.utils.sgd import standardized -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.rllib.utils.typing import ResultDict -from ray.tune.registry import register_env - -parser = argparse.ArgumentParser() -parser.add_argument("--torch", action="store_true") -parser.add_argument("--mixed-torch-tf", action="store_true") -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=600, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=200000, help="Number of timesteps to train." -) -# 600.0 = 4 (num_agents) x 150.0 -parser.add_argument( - "--stop-reward", type=float, default=600.0, help="Reward at which we stop training." -) - - -# Define new Algorithm with custom `training_step()` method (training workflow). -class MyAlgo(Algorithm): - @override(Algorithm) - def setup(self, config): - # Call super's `setup` to create rollout workers. - super().setup(config) - # Create local replay buffer. - self.local_replay_buffer = MultiAgentReplayBuffer(num_shards=1, capacity=50000) - - @override(Algorithm) - def training_step(self) -> ResultDict: - # Generate common experiences, collect batch for PPO, store every (DQN) batch - # into replay buffer. - ppo_batches = [] - num_env_steps = 0 - - # PPO batch size fixed at 200. - # TODO: Use `max_env_steps=200` option of synchronous_parallel_sample instead. - while num_env_steps < 200: - ma_batches = synchronous_parallel_sample( - worker_set=self.workers, concat=False - ) - # Loop through ma-batches (which were collected in parallel). - for ma_batch in ma_batches: - # Update sampled counters. - self._counters[NUM_ENV_STEPS_SAMPLED] += ma_batch.count - self._counters[NUM_AGENT_STEPS_SAMPLED] += ma_batch.agent_steps() - ppo_batch = ma_batch.policy_batches.pop("ppo_policy") - # Add collected batches (only for DQN policy) to replay buffer. 
- self.local_replay_buffer.add(ma_batch) - - ppo_batches.append(ppo_batch) - num_env_steps += ppo_batch.count - - # DQN sub-flow. - dqn_train_results = {} - # Start updating DQN policy once we have some samples in the buffer. - if self._counters[NUM_ENV_STEPS_SAMPLED] > 1000: - # Update DQN policy n times while updating PPO policy once. - for _ in range(10): - dqn_train_batch = self.local_replay_buffer.sample(num_items=64) - dqn_train_results = train_one_step( - self, dqn_train_batch, ["dqn_policy"] - ) - self._counters[ - "agent_steps_trained_DQN" - ] += dqn_train_batch.agent_steps() - print( - "DQN policy learning on samples from", - "agent steps trained", - dqn_train_batch.agent_steps(), - ) - # Update DQN's target net every n train steps (determined by the DQN config). - if ( - self._counters["agent_steps_trained_DQN"] - - self._counters[LAST_TARGET_UPDATE_TS] - >= self.get_policy("dqn_policy").config["target_network_update_freq"] - ): - self.workers.local_worker().get_policy("dqn_policy").update_target() - self._counters[NUM_TARGET_UPDATES] += 1 - self._counters[LAST_TARGET_UPDATE_TS] = self._counters[ - "agent_steps_trained_DQN" - ] - - # PPO sub-flow. - ppo_train_batch = concat_samples(ppo_batches) - self._counters["agent_steps_trained_PPO"] += ppo_train_batch.agent_steps() - # Standardize advantages. - ppo_train_batch[Postprocessing.ADVANTAGES] = standardized( - ppo_train_batch[Postprocessing.ADVANTAGES] - ) - print( - "PPO policy learning on samples from", - "agent steps trained", - ppo_train_batch.agent_steps(), - ) - ppo_train_batch = MultiAgentBatch( - {"ppo_policy": ppo_train_batch}, ppo_train_batch.count - ) - ppo_train_results = train_one_step(self, ppo_train_batch, ["ppo_policy"]) - - # Combine results for PPO and DQN into one results dict. - results = dict(ppo_train_results, **dqn_train_results) - return results - - -if __name__ == "__main__": - args = parser.parse_args() - assert not ( - args.torch and args.mixed_torch_tf - ), "Use either --torch or --mixed-torch-tf, not both!" - - ray.init(local_mode=args.local_mode) - - # Simple environment with 4 independent cartpole entities - register_env( - "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 4}) - ) - - # Note that since the algorithm below does not include a default policy or - # policy configs, we have to explicitly set it in the multiagent config: - policies = { - "ppo_policy": ( - PPOTorchPolicy if args.torch or args.mixed_torch_tf else PPOTF1Policy, - None, - None, - # Provide entire AlgorithmConfig object, not just an override. - PPOConfig() - .training(num_sgd_iter=10, sgd_minibatch_size=128) - .framework("torch" if args.torch or args.mixed_torch_tf else "tf"), - ), - "dqn_policy": ( - DQNTorchPolicy if args.torch else DQNTFPolicy, - None, - None, - # Provide entire AlgorithmConfig object, not just an override. - DQNConfig().training(target_network_update_freq=500).framework("tf"), - ), - } - - def policy_mapping_fn(agent_id, episode, worker, **kwargs): - if agent_id % 2 == 0: - return "ppo_policy" - else: - return "dqn_policy" - - config = ( - AlgorithmConfig() - # TODO (Kourosh): Migrate this to the new RLModule / Learner API. - .experimental(_enable_new_api_stack=False) - .environment("multi_agent_cartpole") - .framework("torch" if args.torch else "tf") - .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - .rollouts(num_rollout_workers=0, rollout_fragment_length=50) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - .reporting(metrics_num_episodes_for_smoothing=30) - ) - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - results = tune.Tuner( - MyAlgo, param_space=config.to_dict(), run_config=air.RunConfig(stop=stop) - ).fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - - ray.shutdown() +raise NotImplementedError(msg) diff --git a/rllib/tests/test_dnc.py b/rllib/tests/test_dnc.py deleted file mode 100644 index 7319bfc13588d..0000000000000 --- a/rllib/tests/test_dnc.py +++ /dev/null @@ -1,83 +0,0 @@ -import gymnasium as gym -import unittest - -import ray -from ray import air -from ray import tune -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.examples.models.neural_computer import DNCMemory -from ray.rllib.utils.framework import try_import_torch - -torch, _ = try_import_torch() - - -class TestDNC(unittest.TestCase): - stop = { - "episode_reward_mean": 100.0, - "timesteps_total": 10000000, - } - - @classmethod - def setUpClass(cls) -> None: - ray.init(num_cpus=4, ignore_reinit_error=True) - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_pack_unpack(self): - d = DNCMemory(gym.spaces.Discrete(1), gym.spaces.Discrete(1), 1, {}, "") - # Add batch dim - packed_state = [m.unsqueeze(0) for m in d.get_initial_state()] - [m.random_() for m in packed_state] - original_packed = [m.clone() for m in packed_state] - - B, T = packed_state[0].shape[:2] - unpacked = d.unpack_state(packed_state) - packed = d.pack_state(*unpacked) - - self.assertTrue(len(packed) > 0) - self.assertEqual(len(packed), len(original_packed)) - - for m_idx in range(len(packed)): - self.assertTrue(torch.all(packed[m_idx] == original_packed[m_idx])) - - def test_dnc_learning(self): - ModelCatalog.register_custom_model("dnc", DNCMemory) - - config = ( - PPOConfig() - .environment(StatelessCartPole) - .framework("torch") - .rollouts(num_envs_per_worker=5, num_rollout_workers=1) - .training( - gamma=0.99, - lr=0.01, - entropy_coeff=0.0005, - vf_loss_coeff=1e-5, - model={ - "custom_model": "dnc", - "max_seq_len": 64, - "custom_model_config": { - "nr_cells": 10, - "cell_size": 8, - }, - }, - ) - .resources(num_cpus_per_worker=2.0) - ) - - tune.Tuner( - "PPO", - param_space=config, - run_config=air.RunConfig(stop=self.stop, verbose=1), - ).fit() - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/tests/test_nested_action_spaces.py b/rllib/tests/test_nested_action_spaces.py deleted file mode 100644 index 3635e1b764535..0000000000000 --- a/rllib/tests/test_nested_action_spaces.py +++ /dev/null @@ -1,142 +0,0 @@ -from gymnasium.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple -import numpy as np -import os -import shutil -import tree # pip install dm_tree -import unittest - -import ray -from ray.rllib.algorithms.bc import BC -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.examples.env.random_env import RandomEnv -from ray.rllib.offline.json_reader import JsonReader -from ray.rllib.policy.sample_batch import convert_ma_batch_to_sample_batch -from ray.rllib.utils.test_utils import framework_iterator - -SPACES = { - "dict": Dict( - { - "a": Dict( - { - "aa": Box(-1.0, 1.0, shape=(3,)), - "ab": 
MultiDiscrete([4, 3]), - } - ), - "b": Discrete(3), - "c": Tuple([Box(0, 10, (2,), dtype=np.int32), Discrete(2)]), - "d": Box(0, 3, (), dtype=np.int64), - } - ), - "tuple": Tuple( - [ - Tuple( - [ - Box(-1.0, 1.0, shape=(2,)), - Discrete(3), - ] - ), - MultiDiscrete([4, 3]), - Dict( - { - "a": Box(0, 100, (), dtype=np.int32), - "b": Discrete(2), - } - ), - ] - ), - "multidiscrete": MultiDiscrete([2, 3, 4]), - "intbox": Box(0, 100, (2,), dtype=np.int32), -} - - -class NestedActionSpacesTest(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init(num_cpus=5) - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_nested_action_spaces(self): - # Write output to check, whether actions are written correctly. - tmp_dir = os.popen("mktemp -d").read()[:-1] - if not os.path.exists(tmp_dir): - # Last resort: Resolve via underlying tempdir (and cut tmp_. - tmp_dir = ray._private.utils.tempfile.gettempdir() + tmp_dir[4:] - assert os.path.exists(tmp_dir), f"'{tmp_dir}' not found!" - - config = ( - PPOConfig() - .environment(RandomEnv) - .rollouts(num_rollout_workers=0) - # Pretend actions in offline files are already normalized. - .offline_data(output=tmp_dir, actions_in_input_normalized=True) - # Switch off OPE as we don't write action-probs. - # TODO: We should probably always write those if `output` is given. - .evaluation(off_policy_estimation_methods={}) - # Remove lr schedule from config, not needed here, and not supported by BC. - .training( - lr_schedule=None, - train_batch_size=20, - sgd_minibatch_size=20, - num_sgd_iter=1, - model={ - "fcnet_hiddens": [10], - }, - ) - ) - - for _ in framework_iterator(config): - for name, action_space in SPACES.items(): - config.environment(env_config={"action_space": action_space}) - - for flatten in [True, False]: - config.experimental(_disable_action_flattening=not flatten) - - print(f"A={action_space} flatten={flatten}") - shutil.rmtree(config["output"]) - algo = config.build() - algo.train() - algo.stop() - - # Check actions in output file (whether properly flattened - # or not). - reader = JsonReader( - inputs=config["output"], - ioctx=algo.workers.local_worker().io_context, - ) - sample_batch = reader.next() - sample_batch = convert_ma_batch_to_sample_batch(sample_batch) - if flatten: - assert isinstance(sample_batch["actions"], np.ndarray) - assert len(sample_batch["actions"].shape) == 2 - assert sample_batch["actions"].shape[0] == len(sample_batch) - else: - tree.assert_same_structure( - algo.get_policy().action_space_struct, - sample_batch["actions"], - ) - - # Test, whether offline data can be properly read by - # BC, configured accordingly. 
-
-                    # doing this for backwards compatibility until we move to parquet
-                    # as default output
-                    config["input"] = lambda ioctx: JsonReader(
-                        ioctx.config["input_config"]["paths"], ioctx
-                    )
-                    config["input_config"] = {"paths": config["output"]}
-                    config.output = None
-                    bc = BC(config=config)
-                    bc.train()
-                    bc.stop()
-                    config["output"] = tmp_dir
-                    config["input"] = "sampler"
-
-
-if __name__ == "__main__":
-    import pytest
-    import sys
-
-    sys.exit(pytest.main(["-v", __file__]))
diff --git a/rllib/tests/test_perf.py b/rllib/tests/test_perf.py
deleted file mode 100644
index b7dbed1ada5d6..0000000000000
--- a/rllib/tests/test_perf.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import gymnasium as gym
-import time
-import unittest
-
-import ray
-from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
-from ray.rllib.evaluation.rollout_worker import RolloutWorker
-from ray.rllib.evaluation.tests.test_rollout_worker import MockPolicy
-
-
-class TestPerf(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        ray.init(num_cpus=5)
-
-    @classmethod
-    def tearDownClass(cls):
-        ray.shutdown()
-
-    # Tested on Intel(R) Core(TM) i7-4600U CPU @ 2.10GHz
-    # 11/23/18: Samples per second 8501.125113727468
-    # 03/01/19: Samples per second 8610.164353268685
-    def test_baseline_performance(self):
-        for _ in range(20):
-            ev = RolloutWorker(
-                env_creator=lambda _: gym.make("CartPole-v1"),
-                default_policy_class=MockPolicy,
-                config=AlgorithmConfig().rollouts(
-                    rollout_fragment_length=100,
-                    num_rollout_workers=0,
-                ),
-            )
-            start = time.time()
-            count = 0
-            while time.time() - start < 1:
-                count += ev.sample().count
-            print()
-            print("Samples per second {}".format(count / (time.time() - start)))
-            print()
-
-
-if __name__ == "__main__":
-    import pytest
-    import sys
-
-    sys.exit(pytest.main(["-v", __file__]))
diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py
index a32ed05492075..6e5a53db7ab48 100644
--- a/rllib/utils/test_utils.py
+++ b/rllib/utils/test_utils.py
@@ -599,7 +599,7 @@ def check_learning_achieved(
     tune_results: "tune.ResultGrid",
     min_value: float,
     evaluation: Optional[bool] = None,
-    metric: str = "episode_reward_mean",
+    metric: str = "sampler_results/episode_reward_mean",
 ):
     """Throws an error if `min_reward` is not reached within tune_results.
 
@@ -1188,9 +1188,11 @@ def should_check_eval(experiment):
 
 
 def run_rllib_example_script_experiment(
-    config: "AlgorithmConfig",
+    base_config: "AlgorithmConfig",
     args: argparse.Namespace,
+    *,
     stop: Optional[Dict] = None,
+    success_metric: str = "sampler_results/episode_reward_mean",
 ) -> Union[ResultDict, tune.result_grid.ResultGrid]:
     """Given an algorithm config and some command line args, runs an experiment.
 
@@ -1198,7 +1200,10 @@
     It should ideally be generated via the ``
 
     Args:
-        config: The AlgorithmConfig object to use for this experiment.
+        base_config: The AlgorithmConfig object to use for this experiment. This base
+            config will be automatically "extended" based on some of the provided
+            `args`. For example, `args.num_env_runners` is used to set
+            `config.num_rollout_workers`, etc..
         args: A argparse.Namespace object which must have the following properties
             defined: `stop_iters`, `stop_reward`, `stop_timesteps`, `no_tune`,
             `verbose`, `checkpoint_freq`, `as_test`. Optionally, for wandb logging:
@@ -1212,10 +1217,37 @@
 
     stop = stop or {
         "training_iteration": args.stop_iters,
-        "episode_reward_mean": args.stop_reward,
+        "sampler_results/episode_reward_mean": args.stop_reward,
         "timesteps_total": args.stop_timesteps,
     }
 
+    from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner
+    from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner
+
+    # Extend the `base_config` based on provided `args`.
+    config = (
+        base_config.framework(args.framework)
+        .experimental(_enable_new_api_stack=args.enable_new_api_stack)
+        .rollouts(
+            num_rollout_workers=args.num_env_runners,
+            # Set up the correct env-runner to use depending on
+            # old-stack/new-stack and multi-agent settings.
+            env_runner_cls=(
+                None
+                if not args.enable_new_api_stack
+                else SingleAgentEnvRunner
+                if args.num_agents == 0
+                else MultiAgentEnvRunner
+            ),
+        )
+        .resources(
+            num_gpus=args.num_gpus,  # old stack
+            num_learner_workers=args.num_gpus,  # new stack
+            num_gpus_per_learner_worker=1 if args.num_gpus else 0,
+            num_cpus_for_local_worker=1,
+        )
+    )
+
     if args.no_tune:
         algo = config.build()
         for iter in range(args.stop_iters):
@@ -1252,23 +1284,50 @@
             )
         ]
 
+    progress_reporter = None
+    # Use better ProgressReporter for multi-agent cases: List individual policy rewards.
+    if args.num_agents > 0:
+        progress_reporter = CLIReporter(
+            metric_columns={
+                **{
+                    "training_iteration": "iter",
+                    "time_total_s": "total time (s)",
+                    "timesteps_total": "ts",
+                    "sampler_results/episode_reward_mean": "combined reward",
+                },
+                **{
+                    f"policy_reward_mean/{pid}": f"reward {pid}"
+                    for pid in config.policies
+                },
+            },
+        )
+
+    # Force Tuner to use old progress output as the new one silently ignores our custom
+    # `CLIReporter`.
+    os.environ["RAY_AIR_NEW_OUTPUT"] = "0"
+
     results = tune.Tuner(
         config.algo_class,
         param_space=config,
         run_config=air.RunConfig(
             stop=stop,
-            verbose=2 if args.verbose else 1,
+            verbose=args.verbose,
             callbacks=callbacks,
             checkpoint_config=air.CheckpointConfig(
                 checkpoint_frequency=args.checkpoint_freq,
                 checkpoint_at_end=args.checkpoint_at_end,
             ),
+            progress_reporter=progress_reporter,
         ),
         tune_config=tune.TuneConfig(num_samples=args.num_samples),
    ).fit()
 
     if args.as_test:
-        check_learning_achieved(results, args.stop_reward)
+        check_learning_achieved(
+            results,
+            args.stop_reward,
+            metric=success_metric,
+        )
 
     return results
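For reference, a minimal sketch of how an example script can drive the reworked helper. It assumes the companion `add_rllib_example_script_args()` parser utility that the new example scripts import from the same module; the PPO-on-CartPole setup and the explicit `stop`/`success_metric` values below are purely illustrative, not part of this patch.

# Illustrative only: a hypothetical example script built around the new signature.
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)

# Produces an argparse parser with the properties the helper reads from `args`
# (`stop_iters`, `stop_reward`, `no_tune`, `num_env_runners`, `num_gpus`, ...).
parser = add_rllib_example_script_args()

if __name__ == "__main__":
    args = parser.parse_args()

    # Only algo- and env-specific settings go into the base config; framework,
    # env-runner, and resource options are filled in from `args` by the helper.
    base_config = PPOConfig().environment("CartPole-v1")

    run_rllib_example_script_experiment(
        base_config,
        args,
        # Both keyword-only arguments are optional; shown here with the new
        # `sampler_results/...` metric naming used throughout this patch.
        stop={"sampler_results/episode_reward_mean": args.stop_reward},
        success_metric="sampler_results/episode_reward_mean",
    )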